Skip to content

Commit bce5015

Browse files
authored
BUG: fix read_parquet to_pandas_kwargs with complex dtypes (geopandas#3640)
1 parent ecf789d commit bce5015

File tree

3 files changed

+57
-8
lines changed

3 files changed

+57
-8
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Bug fixes:
66

77
- Fix an issue that caused an error in `GeoDataFrame.from_features` when there is no `properties` field (#3599).
88
- Fix `read_file` and `to_file` errors (#3682).
9+
- Fix `read_parquet` with `to_pandas_kwargs` for complex (list/struct) arrow types (#3640).
910

1011
## Version 1.1.1 (June 27, 2025)
1112

geopandas/io/arrow.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -494,13 +494,15 @@ def _arrow_to_geopandas(table, geo_metadata=None, to_pandas_kwargs=None):
494494
if geo_metadata is None:
495495
# Note: this path of not passing metadata is also used by dask-geopandas
496496
geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
497+
if to_pandas_kwargs is None:
498+
to_pandas_kwargs = {}
497499

498500
# Find all geometry columns that were read from the file. May
499501
# be a subset if 'columns' parameter is used.
500502
geometry_columns = [
501503
col for col in geo_metadata["columns"] if col in table.column_names
502504
]
503-
result_column_names = list(table.slice(0, 0).to_pandas().columns)
505+
result_column_names = list(table.slice(0, 0).to_pandas(**to_pandas_kwargs).columns)
504506
geometry_columns.sort(key=result_column_names.index)
505507

506508
if not len(geometry_columns):
@@ -526,8 +528,6 @@ def _arrow_to_geopandas(table, geo_metadata=None, to_pandas_kwargs=None):
526528
)
527529

528530
table_attr = table.drop(geometry_columns)
529-
if to_pandas_kwargs is None:
530-
to_pandas_kwargs = {}
531531
df = table_attr.to_pandas(**to_pandas_kwargs)
532532

533533
# Convert the WKB columns that are present back to geometry.

geopandas/io/tests/test_arrow.py

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import json
22
import os
33
import pathlib
4+
import re
45
from itertools import product
56
from packaging.version import Version
67

78
import numpy as np
8-
from pandas import ArrowDtype, DataFrame
9+
from pandas import ArrowDtype, DataFrame, Index, Series
910
from pandas import read_parquet as pd_read_parquet
1011

1112
import shapely
@@ -763,11 +764,29 @@ def test_write_empty_bbox(tmpdir, geometry):
763764
assert "bbox" not in metadata["columns"]["geometry"]
764765

765766

767+
@pytest.mark.skipif(
768+
Version(pyarrow.__version__) < Version("19.0.0"),
769+
reason="This version of pyarrow does not support reading complex types",
770+
)
766771
@pytest.mark.parametrize("format", ["feather", "parquet"])
767772
def test_write_read_to_pandas_kwargs(tmpdir, format):
768773
filename = os.path.join(str(tmpdir), f"test.{format}")
769-
g = box(0, 0, 10, 10)
770-
gdf = geopandas.GeoDataFrame({"geometry": [g], "i": [1], "s": ["a"]})
774+
775+
# Use arrow types to ensure that we can assert the roundtrip was successful
776+
int_type = ArrowDtype(pyarrow.int64())
777+
str_type = ArrowDtype(pyarrow.string())
778+
complex_type = ArrowDtype(pyarrow.struct([pyarrow.field("foo", pyarrow.string())]))
779+
index = Index([0], dtype=ArrowDtype(pyarrow.int64()))
780+
781+
gdf = geopandas.GeoDataFrame(
782+
{
783+
"geometry": [box(0, 0, 10, 10)],
784+
"i": Series([1], index=index, dtype=int_type),
785+
"s": Series(["a"], index=index, dtype=str_type),
786+
"c": Series([{"foo": "bar"}], index=index, dtype=complex_type),
787+
},
788+
index=index,
789+
)
771790

772791
if format == "feather":
773792
gdf.to_feather(filename)
@@ -779,8 +798,37 @@ def test_write_read_to_pandas_kwargs(tmpdir, format):
779798
# simulate the `dtype_backend="pyarrow"` option in `pandas.read_parquet`
780799
gdf_roundtrip = read_func(filename, to_pandas_kwargs={"types_mapper": ArrowDtype})
781800
assert isinstance(gdf_roundtrip, geopandas.GeoDataFrame)
782-
assert isinstance(gdf_roundtrip.dtypes["i"], ArrowDtype)
783-
assert isinstance(gdf_roundtrip.dtypes["s"], ArrowDtype)
801+
assert gdf_roundtrip.dtypes["i"] == int_type
802+
assert gdf_roundtrip.dtypes["s"] == str_type
803+
assert gdf_roundtrip.dtypes["c"] == complex_type
804+
assert_geodataframe_equal(gdf_roundtrip, gdf, check_dtype=True)
805+
806+
807+
@pytest.mark.parametrize("format", ["feather", "parquet"])
808+
def test_read_complex_type_with_numpy_backend_xfail(tmpdir, format):
809+
filename = os.path.join(str(tmpdir), f"test.{format}")
810+
complex_type = ArrowDtype(pyarrow.struct([pyarrow.field("foo", pyarrow.string())]))
811+
index = Index([0], dtype=ArrowDtype(pyarrow.int64()))
812+
gdf = geopandas.GeoDataFrame(
813+
{
814+
"geometry": [box(0, 0, 10, 10)],
815+
"c": Series([{"foo": "bar"}], index=index, dtype=complex_type),
816+
},
817+
index=index,
818+
)
819+
if format == "feather":
820+
gdf.to_feather(filename)
821+
read_func = read_feather
822+
else:
823+
gdf.to_parquet(filename)
824+
read_func = read_parquet
825+
# Note: due to bugs in pyarrow, we can't read complex types without using
826+
# the types mapper. This is a long-standing pandas issue, as noted here:
827+
# - https://github.com/pandas-dev/pandas/issues/53011
828+
# - https://github.com/apache/arrow/issues/39914
829+
match = re.escape("data type 'struct<foo: string>[pyarrow]' not understood")
830+
with pytest.raises(TypeError, match=match):
831+
read_func(filename)
784832

785833

786834
@pytest.mark.parametrize("format", ["feather", "parquet"])

0 commit comments

Comments
 (0)