BUG: fix read_parquet to_pandas_kwargs with complex dtypes (geopandas#3640)

bretttully · web-flow · commit bce5015cd86d · 2025-08-30T22:45:46.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ Bug fixes:
 
 - Fix an issue that caused an error in `GeoDataFrame.from_features` when there is no `properties` field (#3599).
 - Fix `read_file` and `to_file` errors (#3682)
+- Fix `read_parquet` with `to_pandas_kwargs` for complex (list/struct) arrow types (#3640)
 
 ## Version 1.1.1 (June 27, 2025)
 
diff --git a/geopandas/io/arrow.py b/geopandas/io/arrow.py
@@ -494,13 +494,15 @@ def _arrow_to_geopandas(table, geo_metadata=None, to_pandas_kwargs=None):
     if geo_metadata is None:
         # Note: this path of not passing metadata is also used by dask-geopandas
         geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
+    if to_pandas_kwargs is None:
+        to_pandas_kwargs = {}
 
     # Find all geometry columns that were read from the file.  May
     # be a subset if 'columns' parameter is used.
     geometry_columns = [
         col for col in geo_metadata["columns"] if col in table.column_names
     ]
-    result_column_names = list(table.slice(0, 0).to_pandas().columns)
+    result_column_names = list(table.slice(0, 0).to_pandas(**to_pandas_kwargs).columns)
     geometry_columns.sort(key=result_column_names.index)
 
     if not len(geometry_columns):
@@ -526,8 +528,6 @@ def _arrow_to_geopandas(table, geo_metadata=None, to_pandas_kwargs=None):
             )
 
     table_attr = table.drop(geometry_columns)
-    if to_pandas_kwargs is None:
-        to_pandas_kwargs = {}
     df = table_attr.to_pandas(**to_pandas_kwargs)
 
     # Convert the WKB columns that are present back to geometry.
diff --git a/geopandas/io/tests/test_arrow.py b/geopandas/io/tests/test_arrow.py
@@ -1,11 +1,12 @@
 import json
 import os
 import pathlib
+import re
 from itertools import product
 from packaging.version import Version
 
 import numpy as np
-from pandas import ArrowDtype, DataFrame
+from pandas import ArrowDtype, DataFrame, Index, Series
 from pandas import read_parquet as pd_read_parquet
 
 import shapely
@@ -763,11 +764,29 @@ def test_write_empty_bbox(tmpdir, geometry):
     assert "bbox" not in metadata["columns"]["geometry"]
 
 
+@pytest.mark.skipif(
+    Version(pyarrow.__version__) < Version("19.0.0"),
+    reason="This version of pyarrow does not support reading complex types",
+)
 @pytest.mark.parametrize("format", ["feather", "parquet"])
 def test_write_read_to_pandas_kwargs(tmpdir, format):
     filename = os.path.join(str(tmpdir), f"test.{format}")
-    g = box(0, 0, 10, 10)
-    gdf = geopandas.GeoDataFrame({"geometry": [g], "i": [1], "s": ["a"]})
+
+    # Use arrow types to ensure that we can assert the roundtrip was successful
+    int_type = ArrowDtype(pyarrow.int64())
+    str_type = ArrowDtype(pyarrow.string())
+    complex_type = ArrowDtype(pyarrow.struct([pyarrow.field("foo", pyarrow.string())]))
+    index = Index([0], dtype=ArrowDtype(pyarrow.int64()))
+
+    gdf = geopandas.GeoDataFrame(
+        {
+            "geometry": [box(0, 0, 10, 10)],
+            "i": Series([1], index=index, dtype=int_type),
+            "s": Series(["a"], index=index, dtype=str_type),
+            "c": Series([{"foo": "bar"}], index=index, dtype=complex_type),
+        },
+        index=index,
+    )
 
     if format == "feather":
         gdf.to_feather(filename)
@@ -779,8 +798,37 @@ def test_write_read_to_pandas_kwargs(tmpdir, format):
     # simulate the `dtype_backend="pyarrow"` option in `pandas.read_parquet`
     gdf_roundtrip = read_func(filename, to_pandas_kwargs={"types_mapper": ArrowDtype})
     assert isinstance(gdf_roundtrip, geopandas.GeoDataFrame)
-    assert isinstance(gdf_roundtrip.dtypes["i"], ArrowDtype)
-    assert isinstance(gdf_roundtrip.dtypes["s"], ArrowDtype)
+    assert gdf_roundtrip.dtypes["i"] == int_type
+    assert gdf_roundtrip.dtypes["s"] == str_type
+    assert gdf_roundtrip.dtypes["c"] == complex_type
+    assert_geodataframe_equal(gdf_roundtrip, gdf, check_dtype=True)
+
+
+@pytest.mark.parametrize("format", ["feather", "parquet"])
+def test_read_complex_type_with_numpy_backend_xfail(tmpdir, format):
+    filename = os.path.join(str(tmpdir), f"test.{format}")
+    complex_type = ArrowDtype(pyarrow.struct([pyarrow.field("foo", pyarrow.string())]))
+    index = Index([0], dtype=ArrowDtype(pyarrow.int64()))
+    gdf = geopandas.GeoDataFrame(
+        {
+            "geometry": [box(0, 0, 10, 10)],
+            "c": Series([{"foo": "bar"}], index=index, dtype=complex_type),
+        },
+        index=index,
+    )
+    if format == "feather":
+        gdf.to_feather(filename)
+        read_func = read_feather
+    else:
+        gdf.to_parquet(filename)
+        read_func = read_parquet
+    # Note: due to bugs in pyarrow, we can't read complex types without using
+    # the types mapper. This is a long standing pandas issue as noted here:
+    # - https://github.com/pandas-dev/pandas/issues/53011
+    # - https://github.com/apache/arrow/issues/39914
+    match = re.escape("data type 'struct<foo: string>[pyarrow]' not understood")
+    with pytest.raises(TypeError, match=match):
+        read_func(filename)
 
 
 @pytest.mark.parametrize("format", ["feather", "parquet"])