Skip to content

Commit a25a370

Browse files
feat(python/sedonadb): Support specifying layer name and accessing sub-paths inside archives when working with pyogrio sources. (#778)
Co-authored-by: Dewey Dunnington <dewey@dunnington.ca>
1 parent bcb47d8 commit a25a370

3 files changed

Lines changed: 54 additions & 5 deletions

File tree

python/sedonadb/python/sedonadb/context.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,8 +284,13 @@ def read_pyogrio(
284284
table_paths: A str, Path, or iterable of paths containing URLs or
285285
paths. Globs (i.e., `path/*.gpkg`), directories, and zipped
286286
versions of otherwise readable files are supported.
287-
options: An optional mapping of key/value pairs (open options)
288-
passed to GDAL/OGR.
287+
options: An optional mapping of key/value pairs passed to
288+
pyogrio/GDAL. Supports pyogrio keyword arguments (e.g.,
289+
``layer``, ``where``, ``sql``, ``max_features``) as well
290+
as GDAL driver-specific dataset open options. Additionally,
291+
``path_suffix`` can append a subpath to the resolved
292+
GDAL source (e.g., ``{"path_suffix": "data.gdb"}`` for
293+
a GDB stored inside a .zip file).
289294
extension: An optional file extension (e.g., `"fgb"`) used when
290295
`table_paths` specifies one or more directories or a glob
291296
that does not enforce a file extension.

python/sedonadb/python/sedonadb/datasource.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ def open_reader(self, args):
144144
if ogr_src.endswith(".zip"):
145145
ogr_src = f"/vsizip/{ogr_src}"
146146

147+
path_suffix = self._options.get("path_suffix")
148+
if path_suffix is not None:
149+
ogr_src = f"{ogr_src}/{path_suffix}"
150+
147151
if args.is_projected():
148152
file_columns = args.file_schema.names
149153
columns = [file_columns[i] for i in args.file_projection]
@@ -164,10 +168,12 @@ def open_reader(self, args):
164168
else:
165169
bbox = None
166170

171+
ogr_kwargs = {**self._options}
172+
ogr_kwargs.update(columns=columns, batch_size=batch_size, bbox=bbox)
173+
ogr_kwargs.pop("path_suffix", None)
174+
167175
return PyogrioReaderShelter(
168-
pyogrio.raw.ogr_open_arrow(
169-
ogr_src, {}, columns=columns, batch_size=batch_size, bbox=bbox
170-
),
176+
pyogrio.raw.open_arrow(ogr_src, **ogr_kwargs),
171177
columns,
172178
)
173179

python/sedonadb/tests/io/test_pyogrio.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import io
1919
import tempfile
2020
import warnings
21+
import zipfile
2122
from pathlib import Path
2223

2324
import geoarrow.pyarrow as ga
@@ -131,6 +132,43 @@ def test_read_ogr_filter(con):
131132
)
132133

133134

135+
def test_read_ogr_layer_selection(con):
136+
series = geopandas.GeoSeries.from_xy([0, 1], [1, 2], crs="EPSG:3857")
137+
gdf = geopandas.GeoDataFrame({"val": ["a", "b"], "geom": series})
138+
gdf = gdf.set_geometry(gdf["geom"])
139+
140+
with tempfile.TemporaryDirectory() as td:
141+
gpkg_path = f"{td}/test.gpkg"
142+
gdf.to_file(gpkg_path, layer="my_layer")
143+
144+
# Reading with the correct layer name should work
145+
geopandas.testing.assert_geodataframe_equal(
146+
con.read_pyogrio(gpkg_path, options={"layer": "my_layer"}).to_pandas(),
147+
gdf,
148+
)
149+
150+
151+
def test_read_ogr_path_suffix(con):
152+
series = geopandas.GeoSeries.from_xy([0, 1], [1, 2], crs="EPSG:3857")
153+
gdf = geopandas.GeoDataFrame({"val": ["a", "b"], "geom": series})
154+
gdf = gdf.set_geometry(gdf["geom"])
155+
156+
with tempfile.TemporaryDirectory() as td:
157+
gpkg_path = f"{td}/data.gpkg"
158+
gdf.to_file(gpkg_path)
159+
160+
zip_path = f"{td}/archive.zip"
161+
with zipfile.ZipFile(zip_path, "w") as zf:
162+
zf.write(gpkg_path, "nested/data.gpkg")
163+
164+
geopandas.testing.assert_geodataframe_equal(
165+
con.read_pyogrio(
166+
zip_path, options={"path_suffix": "nested/data.gpkg"}
167+
).to_pandas(),
168+
gdf,
169+
)
170+
171+
134172
def test_read_ogr_file_not_found(con):
135173
with pytest.raises(
136174
sedonadb._lib.SedonaError, match="Can't infer schema for zero objects"

0 commit comments

Comments
 (0)