Skip to content

Commit 1112f7a

Browse files
authored
bug: Support read_parquet glob file paths (#34)
* Add 'test_read_parquet_local_glob' Python test
* Fix glob behavior by only converting to globs once
* Clean up
1 parent 5d3cfba commit 1112f7a

2 files changed

Lines changed: 28 additions & 10 deletions

File tree

python/sedonadb/tests/test_context.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ def test_read_parquet(con, geoarrow_data):
3535
assert len(tab) == 244
3636

3737

38+
def test_read_parquet_local_glob(con, geoarrow_data):
39+
# The above test uses the .glob() method; this test passes the raw glob string
40+
tab = con.read_parquet(
41+
geoarrow_data / "example/files/*_geo.parquet"
42+
).to_arrow_table()
43+
assert tab["geometry"].type.extension_name == "geoarrow.wkb"
44+
assert len(tab) == 244
45+
46+
tab = con.read_parquet(
47+
geoarrow_data / "example/files/example_polygon-*geo.parquet"
48+
).to_arrow_table()
49+
assert len(tab) == 12
50+
51+
3852
def test_read_parquet_error(con):
3953
with pytest.raises(sedonadb._lib.SedonaError, match="No table paths were provided"):
4054
con.read_parquet([])

rust/sedona-geoparquet/src/provider.rs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use datafusion::{
2424
file_format::parquet::ParquetFormat,
2525
listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl},
2626
},
27-
execution::{context::DataFilePaths, options::ReadOptions, SessionState},
27+
execution::{options::ReadOptions, SessionState},
2828
prelude::{ParquetReadOptions, SessionConfig, SessionContext},
2929
};
3030
use datafusion_common::{exec_err, Result};
@@ -36,12 +36,11 @@ use crate::format::GeoParquetFormat;
3636
/// Because [ListingTable] implements `TableProvider`, this can be used to
3737
/// implement geo-aware Parquet reading with interfaces that are otherwise
3838
/// hard-coded to the built-in Parquet reader.
39-
pub async fn geoparquet_listing_table<P: DataFilePaths>(
39+
pub async fn geoparquet_listing_table(
4040
context: &SessionContext,
41-
table_paths: P,
41+
table_paths: Vec<ListingTableUrl>,
4242
options: GeoParquetReadOptions<'_>,
4343
) -> Result<ListingTable> {
44-
let table_paths = table_paths.to_urls()?;
4544
let session_config = context.copied_config();
4645
let listing_options =
4746
options.to_listing_options(&session_config, context.copied_table_options());
@@ -134,7 +133,9 @@ mod test {
134133
let data_dir = geoarrow_data_dir().unwrap();
135134
let tab = geoparquet_listing_table(
136135
&ctx,
137-
format!("{data_dir}/example/files/*_geo.parquet"),
136+
vec![
137+
ListingTableUrl::parse(format!("{data_dir}/example/files/*_geo.parquet")).unwrap(),
138+
],
138139
GeoParquetReadOptions::default(),
139140
)
140141
.await
@@ -169,15 +170,18 @@ mod test {
169170
#[tokio::test]
170171
async fn listing_table_errors() {
171172
let ctx = SessionContext::new();
172-
let err =
173-
geoparquet_listing_table(&ctx, Vec::<String>::new(), GeoParquetReadOptions::default())
174-
.await
175-
.unwrap_err();
173+
let err = geoparquet_listing_table(
174+
&ctx,
175+
Vec::<ListingTableUrl>::new(),
176+
GeoParquetReadOptions::default(),
177+
)
178+
.await
179+
.unwrap_err();
176180
assert_eq!(err.message(), "No table paths were provided");
177181

178182
let err = geoparquet_listing_table(
179183
&ctx,
180-
"foofy.wrongextension",
184+
vec![ListingTableUrl::parse("foofy.wrongextension").unwrap()],
181185
GeoParquetReadOptions::default(),
182186
)
183187
.await

0 commit comments

Comments
 (0)