1919 pa_version_under11p0 ,
2020 pa_version_under13p0 ,
2121 pa_version_under15p0 ,
22+ pa_version_under19p0 ,
2223)
2324
2425import pandas as pd
@@ -110,10 +111,7 @@ def fp(request):
110111
@pytest.fixture
def df_compat():
    """Small frame for engine-compatibility round trips.

    One int64 column "A" and one constant-string column "B"; column labels
    are built via a default ``pd.Index`` so dtype follows pandas' inference.
    """
    cols = pd.Index(["A", "B"])
    return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=cols)
117115
118116
119117@pytest .fixture
@@ -261,8 +259,10 @@ def test_invalid_engine(df_compat):
261259 check_round_trip (df_compat , "foo" , "bar" )
262260
263261
264- def test_options_py (df_compat , pa ):
262+ def test_options_py (df_compat , pa , using_infer_string ):
265263 # use the set option
264+ if using_infer_string and not pa_version_under19p0 :
265+ df_compat .columns = df_compat .columns .astype ("str" )
266266
267267 with pd .option_context ("io.parquet.engine" , "pyarrow" ):
268268 check_round_trip (df_compat )
@@ -798,18 +798,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
798798
799799 def test_categorical (self , pa ):
800800 # supported in >= 0.7.0
801- df = pd .DataFrame ()
802- df ["a" ] = pd .Categorical (list ("abcdef" ))
803-
804- # test for null, out-of-order values, and unobserved category
805- df ["b" ] = pd .Categorical (
806- ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
807- dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
808- )
809-
810- # test for ordered flag
811- df ["c" ] = pd .Categorical (
812- ["a" , "b" , "c" , "a" , "c" , "b" ], categories = ["b" , "c" , "d" ], ordered = True
801+ df = pd .DataFrame (
802+ {
803+ "a" : pd .Categorical (list ("abcdef" )),
804+ # test for null, out-of-order values, and unobserved category
805+ "b" : pd .Categorical (
806+ ["bar" , "foo" , "foo" , "bar" , None , "bar" ],
807+ dtype = pd .CategoricalDtype (["foo" , "bar" , "baz" ]),
808+ ),
809+ # test for ordered flag
810+ "c" : pd .Categorical (
811+ ["a" , "b" , "c" , "a" , "c" , "b" ],
812+ categories = ["b" , "c" , "d" ],
813+ ordered = True ,
814+ ),
815+ }
813816 )
814817
815818 check_round_trip (df , pa )
@@ -878,11 +881,13 @@ def test_s3_roundtrip_for_dir(
878881 repeat = 1 ,
879882 )
880883
def test_read_file_like_obj_support(self, df_compat, using_infer_string):
    """Round-trip a DataFrame through an in-memory file-like object."""
    pytest.importorskip("pyarrow")
    buf = BytesIO()
    df_compat.to_parquet(buf)
    result = read_parquet(buf)
    if using_infer_string and not pa_version_under19p0:
        # newer pyarrow round-trips inferred-string column labels, so align
        # the expected frame's columns before comparing
        df_compat.columns = df_compat.columns.astype("str")
    tm.assert_frame_equal(df_compat, result)
887892
888893 def test_expand_user (self , df_compat , monkeypatch ):
@@ -949,7 +954,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
949954 "c" : pd .Series (["a" , None , "c" ], dtype = "string" ),
950955 }
951956 )
952- if using_infer_string :
957+ if using_infer_string and pa_version_under19p0 :
953958 check_round_trip (df , pa , expected = df .astype ({"c" : "str" }))
954959 else :
955960 check_round_trip (df , pa )
@@ -963,7 +968,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
963968 df = pd .DataFrame ({"a" : pd .Series (["a" , None , "c" ], dtype = "string[pyarrow]" )})
964969 with pd .option_context ("string_storage" , string_storage ):
965970 if using_infer_string :
966- expected = df .astype ("str" )
971+ if pa_version_under19p0 :
972+ expected = df .astype ("str" )
973+ else :
974+ expected = df .astype (f"string[{ string_storage } ]" )
967975 expected .columns = expected .columns .astype ("str" )
968976 else :
969977 expected = df .astype (f"string[{ string_storage } ]" )
@@ -1128,17 +1136,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
11281136 new_df = read_parquet (path , engine = pa )
11291137 assert new_df .attrs == df .attrs
11301138
def test_string_inference(self, tmp_path, pa, using_infer_string):
    # GH#54431
    path = tmp_path / "test_string_inference.p"
    df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
    df.to_parquet(path, engine=pa)
    with pd.option_context("future.infer_string", True):
        result = read_parquet(path, engine=pa)

    dtype = pd.StringDtype(na_value=np.nan)
    # Older pyarrow without infer_string enabled keeps object-dtype column
    # labels; otherwise the labels come back as inferred strings.
    if pa_version_under19p0 and not using_infer_string:
        col_dtype = object
    else:
        col_dtype = dtype
    expected = pd.DataFrame(
        data={"a": ["x", "y"]},
        dtype=dtype,
        index=pd.Index(["a", "b"], dtype=dtype),
        columns=pd.Index(["a"], dtype=col_dtype),
    )
    tm.assert_frame_equal(result, expected)
11441159
@@ -1151,7 +1166,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
11511166 df = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "string[pyarrow]" )
11521167 df .to_parquet (path , schema = pa .schema ([("a" , pa .decimal128 (5 ))]))
11531168 result = read_parquet (path )
1154- expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1169+ if pa_version_under19p0 :
1170+ expected = pd .DataFrame ({"a" : ["123" ]}, dtype = "string[python]" )
1171+ else :
1172+ expected = pd .DataFrame ({"a" : [Decimal ("123.00" )]}, dtype = "object" )
11551173 tm .assert_frame_equal (result , expected )
11561174
11571175 def test_infer_string_large_string_type (self , tmp_path , pa ):
0 commit comments