From 46e36042f104d8bb830015e2703048162fbf2376 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 24 Sep 2025 17:47:21 -0400 Subject: [PATCH 01/10] Update test_to_csv.py _return_result_expected updated to temp file fixture --- pandas/tests/frame/methods/test_to_csv.py | 39 ++++++++++++----------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 34d120145b381..0cf5330c01f67 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -248,6 +248,7 @@ def _return_result_expected( self, df, chunksize, + temp_file, r_dtype=None, c_dtype=None, rnlvl=None, @@ -260,15 +261,16 @@ def _return_result_expected( kwargs["index_col"] = list(range(rnlvl)) kwargs["header"] = list(range(cnlvl)) - with tm.ensure_clean("__tmp_to_csv_moar__") as path: - df.to_csv(path, encoding="utf8", chunksize=chunksize) - recons = self.read_csv(path, **kwargs) + + path = str(temp_file) + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) else: kwargs["header"] = 0 - with tm.ensure_clean("__tmp_to_csv_moar__") as path: - df.to_csv(path, encoding="utf8", chunksize=chunksize) - recons = self.read_csv(path, **kwargs) + path = str(temp_file) + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, str): @@ -353,13 +355,13 @@ def _to_uni(x): @pytest.mark.parametrize( "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] ) - def test_to_csv_nrows(self, nrows): + def test_to_csv_nrows(self, nrows, temp_file): df = DataFrame( np.ones((nrows, 4)), index=date_range("2020-01-01", periods=nrows), columns=Index(list("abcd"), dtype=object), ) - result, expected = self._return_result_expected(df, 1000, "dt", "s") + result, expected = self._return_result_expected(df, 1000, temp_file, "dt", "s") expected.index = expected.index.astype("M8[ns]") tm.assert_frame_equal(result, expected, check_names=False) @@ -372,7 +374,7 @@ def test_to_csv_nrows(self, nrows): ) @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): + def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols, temp_file): axes = { "i": lambda n: Index(np.arange(n), dtype=np.int64), "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), @@ -387,6 +389,7 @@ def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): result, expected = self._return_result_expected( df, 1000, + temp_file, r_idx_type, c_idx_type, ) @@ -401,13 +404,13 @@ def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): "nrows", [10, 98, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] ) @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) - def test_to_csv_idx_ncols(self, nrows, ncols): + def test_to_csv_idx_ncols(self, nrows, ncols, temp_file): df = DataFrame( np.ones((nrows, ncols)), index=Index([f"i-{i}" for i in range(nrows)], name="a"), columns=Index([f"i-{i}" for i in range(ncols)], name="a"), ) - result, expected = self._return_result_expected(df, 1000) + result, expected = self._return_result_expected(df, 1000, temp_file) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -427,17 +430,17 @@ def test_to_csv_dup_cols(self, nrows): ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns 
= cols - result, expected = self._return_result_expected(df, 1000, dupe_col=True) + result, expected = self._return_result_expected(df, 1000, temp_file, dupe_col=True) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow - def test_to_csv_empty(self): + def test_to_csv_empty(self, temp_file): df = DataFrame(index=np.arange(10, dtype=np.int64)) - result, expected = self._return_result_expected(df, 1000) + result, expected = self._return_result_expected(df, 1000, temp_file) tm.assert_frame_equal(result, expected, check_column_type=False) @pytest.mark.slow - def test_to_csv_chunksize(self): + def test_to_csv_chunksize(self, temp_file): chunksize = 1000 rows = chunksize // 2 + 1 df = DataFrame( @@ -445,7 +448,7 @@ def test_to_csv_chunksize(self): columns=Index(list("ab")), index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), ) - result, expected = self._return_result_expected(df, chunksize, rnlvl=2) + result, expected = self._return_result_expected(df, chunksize, temp_file, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -461,7 +464,7 @@ def test_to_csv_chunksize(self): [{"r_idx_nlevels": 2, "c_idx_nlevels": 2}, {"rnlvl": 2, "cnlvl": 2}], ], ) - def test_to_csv_params(self, nrows, df_params, func_params, ncols): + def test_to_csv_params(self, nrows, df_params, func_params, ncols, temp_file): if df_params.get("r_idx_nlevels"): index = MultiIndex.from_arrays( [f"i-{i}" for i in range(nrows)] @@ -478,7 +481,7 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols): else: columns = Index([f"i-{i}" for i in range(ncols)]) df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) - result, expected = self._return_result_expected(df, 1000, **func_params) + result, expected = self._return_result_expected(df, 1000, temp_file, **func_params) tm.assert_frame_equal(result, expected, check_names=False) def test_to_csv_from_csv_w_some_infs(self, temp_file, float_frame): From 37cc958537146f3a07688cae34f60aab78c36168 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 24 Sep 2025 17:48:57 -0400 Subject: [PATCH 02/10] Update test_to_csv.py fix missed temp file --- pandas/tests/frame/methods/test_to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 0cf5330c01f67..84455076b60ba 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -415,7 +415,7 @@ def test_to_csv_idx_ncols(self, nrows, ncols, temp_file): @pytest.mark.slow @pytest.mark.parametrize("nrows", [10, 98, 99, 100, 101, 102]) - def test_to_csv_dup_cols(self, nrows): + def test_to_csv_dup_cols(self, nrows, temp_file): df = DataFrame( np.ones((nrows, 3)), index=Index([f"i-{i}" for i in range(nrows)], name="a"), From 889f3d908c9be8164c66651b684c48bc2384e7bb Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Fri, 26 Sep 2025 12:16:18 +0000 Subject: [PATCH 03/10] temp file instead of ensure clean --- pandas/tests/frame/methods/test_to_csv.py | 249 ++++++++------- pandas/tests/io/formats/test_to_csv.py | 353 +++++++++++----------- pandas/tests/io/test_parquet.py | 284 ++++++++--------- 3 files changed, 449 insertions(+), 437 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 84455076b60ba..13f13c70ff748 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -261,14 +261,13 @@ 
def _return_result_expected( kwargs["index_col"] = list(range(rnlvl)) kwargs["header"] = list(range(cnlvl)) - path = str(temp_file) df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) else: kwargs["header"] = 0 - path = str(temp_file) + path = str(temp_file) df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) @@ -430,7 +429,9 @@ def test_to_csv_dup_cols(self, nrows, temp_file): ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols - result, expected = self._return_result_expected(df, 1000, temp_file, dupe_col=True) + result, expected = self._return_result_expected( + df, 1000, temp_file, dupe_col=True + ) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -448,7 +449,9 @@ def test_to_csv_chunksize(self, temp_file): columns=Index(list("ab")), index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), ) - result, expected = self._return_result_expected(df, chunksize, temp_file, rnlvl=2) + result, expected = self._return_result_expected( + df, chunksize, temp_file, rnlvl=2 + ) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @@ -481,7 +484,9 @@ def test_to_csv_params(self, nrows, df_params, func_params, ncols, temp_file): else: columns = Index([f"i-{i}" for i in range(ncols)]) df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) - result, expected = self._return_result_expected(df, 1000, temp_file, **func_params) + result, expected = self._return_result_expected( + df, 1000, temp_file, **func_params + ) tm.assert_frame_equal(result, expected, check_names=False) def test_to_csv_from_csv_w_some_infs(self, temp_file, float_frame): @@ -598,108 +603,104 @@ def test_to_csv_multiindex(self, temp_file, float_frame, datetime_frame): # needed if setUp becomes class method datetime_frame.index = old_index - with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: - # GH3571, GH1651, GH3141 - - def _make_frame(names=None): - if names is True: - names = ["first", "second"] - return DataFrame( - np.random.default_rng(2).integers(0, 10, size=(3, 3)), - columns=MultiIndex.from_tuples( - [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names - ), - dtype="int64", - ) - - # column & index are multi-index - df = DataFrame( - np.ones((5, 3)), - columns=MultiIndex.from_arrays( - [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") - ), - index=MultiIndex.from_arrays( - [[f"i-{i}" for i in range(5)] for _ in range(2)], names=list("ab") - ), - ) - df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) - tm.assert_frame_equal(df, result) - - # column is mi - df = DataFrame( - np.ones((5, 3)), - columns=MultiIndex.from_arrays( - [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + def _make_frame(names=None): + if names is True: + names = ["first", "second"] + return DataFrame( + np.random.default_rng(2).integers(0, 10, size=(3, 3)), + columns=MultiIndex.from_tuples( + [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names ), + dtype="int64", ) - df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], index_col=0) - tm.assert_frame_equal(df, result) - - # dup column names? 
- df = DataFrame( - np.ones((5, 3)), - columns=MultiIndex.from_arrays( - [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") - ), - index=MultiIndex.from_arrays( - [[f"i-{i}" for i in range(5)] for _ in range(3)], names=list("abc") - ), - ) - df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) - tm.assert_frame_equal(df, result) - - # writing with no index - df = _make_frame() - df.to_csv(path, index=False) - result = read_csv(path, header=[0, 1]) - tm.assert_frame_equal(df, result) - - # we lose the names here - df = _make_frame(True) - df.to_csv(path, index=False) - result = read_csv(path, header=[0, 1]) - assert com.all_none(*result.columns.names) - result.columns.names = df.columns.names - tm.assert_frame_equal(df, result) - - # whatsnew example - df = _make_frame() - df.to_csv(path) - result = read_csv(path, header=[0, 1], index_col=[0]) - tm.assert_frame_equal(df, result) - - df = _make_frame(True) - df.to_csv(path) - result = read_csv(path, header=[0, 1], index_col=[0]) - tm.assert_frame_equal(df, result) - - # invalid options - df = _make_frame(True) - df.to_csv(path) - - for i in [6, 7]: - msg = f"len of {i}, but only 5 lines in file" - with pytest.raises(ParserError, match=msg): - read_csv(path, header=list(range(i)), index_col=0) - - # write with cols - msg = "cannot specify cols with a MultiIndex" - with pytest.raises(TypeError, match=msg): - df.to_csv(path, columns=["foo", "bar"]) - - with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: - # empty - tsframe[:0].to_csv(path) - recons = self.read_csv(path) - - exp = tsframe[:0] - exp.index = [] - - tm.assert_index_equal(recons.columns, exp.columns) - assert len(recons) == 0 + + # column & index are multi-index + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(2)], names=list("ab") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) + tm.assert_frame_equal(df, result) + + # column is mi + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=0) + tm.assert_frame_equal(df, result) + + # dup column names? 
+ df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(3)], names=list("abc") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) + tm.assert_frame_equal(df, result) + + # writing with no index + df = _make_frame() + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + tm.assert_frame_equal(df, result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + assert com.all_none(*result.columns.names) + result.columns.names = df.columns.names + tm.assert_frame_equal(df, result) + + # whatsnew example + df = _make_frame() + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + df = _make_frame(True) + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + # invalid options + df = _make_frame(True) + df.to_csv(path) + + for i in [6, 7]: + msg = f"len of {i}, but only 5 lines in file" + with pytest.raises(ParserError, match=msg): + read_csv(path, header=list(range(i)), index_col=0) + + # write with cols + msg = "cannot specify cols with a MultiIndex" + with pytest.raises(TypeError, match=msg): + df.to_csv(path, columns=["foo", "bar"]) + + # empty + tsframe[:0].to_csv(path) + recons = self.read_csv(path) + + exp = tsframe[:0] + exp.index = [] + + tm.assert_index_equal(recons.columns, exp.columns) + assert len(recons) == 0 def test_to_csv_interval_index(self, temp_file, using_infer_string): # GH 28210 @@ -811,16 +812,15 @@ def test_to_csv_dups_cols(self, temp_file): df.columns = [0, 1, 2] * 5 - with tm.ensure_clean() as filename: - df.to_csv(filename) - result = read_csv(filename, index_col=0) + df.to_csv(path) + result = read_csv(path, index_col=0) - # date cols - for i in ["0.4", "1.4", "2.4"]: - result[i] = to_datetime(result[i]) + # date cols + for i in ["0.4", "1.4", "2.4"]: + result[i] = to_datetime(result[i]) - result.columns = df.columns - tm.assert_frame_equal(result, df) + result.columns = df.columns + tm.assert_frame_equal(result, df) def test_to_csv_dups_cols2(self, temp_file): # GH3457 @@ -1200,18 +1200,17 @@ def test_to_csv_with_dst_transitions_with_pickle(self, start, end, temp_file): idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) - with tm.ensure_clean("csv_date_format_with_dst") as path: - df.to_csv(path, index=True) - result = read_csv(path, index_col=0) - result.index = ( - to_datetime(result.index, utc=True) - .tz_convert("Europe/Paris") - .as_unit("ns") - ) - result["idx"] = to_datetime(result["idx"], utc=True).astype( - "datetime64[ns, Europe/Paris]" - ) - tm.assert_frame_equal(result, df) + + path = str(temp_file) + df.to_csv(path, index=True) + result = read_csv(path, index_col=0) + result.index = ( + to_datetime(result.index, utc=True).tz_convert("Europe/Paris").as_unit("ns") + ) + result["idx"] = to_datetime(result["idx"], utc=True).astype( + "datetime64[ns, Europe/Paris]" + ) + tm.assert_frame_equal(result, df) # assert working df.astype(str) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 52f521d0d36eb..385615fe4e3a2 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ 
b/pandas/tests/io/formats/test_to_csv.py @@ -17,7 +17,7 @@ class TestToCSV: - def test_to_csv_with_single_column(self): + def test_to_csv_with_single_column(self, temp_file): # see gh-18676, https://bugs.python.org/issue32255 # # Python's CSV library adds an extraneous '""' @@ -30,31 +30,30 @@ def test_to_csv_with_single_column(self): "" 1.0 """ - with tm.ensure_clean("test.csv") as path: - df1.to_csv(path, header=None, index=None) - with open(path, encoding="utf-8") as f: - assert f.read() == expected1 + path = str(temp_file) + df1.to_csv(path, header=None, index=None) + with open(path, encoding="utf-8") as f: + assert f.read() == expected1 df2 = DataFrame([1, None]) expected2 = """\ 1.0 "" """ - with tm.ensure_clean("test.csv") as path: - df2.to_csv(path, header=None, index=None) - with open(path, encoding="utf-8") as f: - assert f.read() == expected2 + df2.to_csv(path, header=None, index=None) + with open(path, encoding="utf-8") as f: + assert f.read() == expected2 - def test_to_csv_default_encoding(self): + def test_to_csv_default_encoding(self, temp_file): # GH17097 df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) - with tm.ensure_clean("test.csv") as path: - # the default to_csv encoding is uft-8. - df.to_csv(path) - tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + path = str(temp_file) + # the default to_csv encoding is uft-8. + df.to_csv(path) + tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) - def test_to_csv_quotechar(self): + def test_to_csv_quotechar(self, temp_file): df = DataFrame({"col": [1, 2]}) expected = """\ "","col" @@ -62,10 +61,10 @@ def test_to_csv_quotechar(self): "1","2" """ - with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1) # 1=QUOTE_ALL - with open(path, encoding="utf-8") as f: - assert f.read() == expected + path = str(temp_file) + df.to_csv(path, quoting=1) # 1=QUOTE_ALL + with open(path, encoding="utf-8") as f: + assert f.read() == expected expected = """\ $$,$col$ @@ -73,16 +72,15 @@ def test_to_csv_quotechar(self): $1$,$2$ """ - with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1, quotechar="$") - with open(path, encoding="utf-8") as f: - assert f.read() == expected + path = str(temp_file) + df.to_csv(path, quoting=1, quotechar="$") + with open(path, encoding="utf-8") as f: + assert f.read() == expected - with tm.ensure_clean("test.csv") as path: - with pytest.raises(TypeError, match="quotechar"): - df.to_csv(path, quoting=1, quotechar=None) + with pytest.raises(TypeError, match="quotechar"): + df.to_csv(path, quoting=1, quotechar=None) - def test_to_csv_doublequote(self): + def test_to_csv_doublequote(self, temp_file): df = DataFrame({"col": ['a"a', '"bb"']}) expected = '''\ "","col" @@ -90,16 +88,15 @@ def test_to_csv_doublequote(self): "1","""bb""" ''' - with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL - with open(path, encoding="utf-8") as f: - assert f.read() == expected + path = str(temp_file) + df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + with open(path, encoding="utf-8") as f: + assert f.read() == expected - with tm.ensure_clean("test.csv") as path: - with pytest.raises(Error, match="escapechar"): - df.to_csv(path, doublequote=False) # no escapechar set + with pytest.raises(Error, match="escapechar"): + df.to_csv(path, doublequote=False) # no escapechar set - def test_to_csv_escapechar(self): + def test_to_csv_escapechar(self, temp_file): df = DataFrame({"col": ['a"a', '"bb"']}) expected = """\ "","col" @@ 
-107,10 +104,10 @@ def test_to_csv_escapechar(self): "1","\\"bb\\"" """ - with tm.ensure_clean("test.csv") as path: # QUOTE_ALL - df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") - with open(path, encoding="utf-8") as f: - assert f.read() == expected + path = str(temp_file) + df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") + with open(path, encoding="utf-8") as f: + assert f.read() == expected df = DataFrame({"col": ["a,a", ",bb,"]}) expected = """\ @@ -119,10 +116,9 @@ def test_to_csv_escapechar(self): 1,\\,bb\\, """ - with tm.ensure_clean("test.csv") as path: - df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE - with open(path, encoding="utf-8") as f: - assert f.read() == expected + df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE + with open(path, encoding="utf-8") as f: + assert f.read() == expected def test_csv_to_string(self): df = DataFrame({"col": [1, 2]}) @@ -390,7 +386,7 @@ def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series): result = obj.to_csv(lineterminator="\n", header=True) assert result == expected - def test_to_csv_string_array_ascii(self): + def test_to_csv_string_array_ascii(self, temp_file): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) @@ -399,12 +395,12 @@ def test_to_csv_string_array_ascii(self): 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ - with tm.ensure_clean("str_test.csv") as path: - df.to_csv(path, encoding="ascii") - with open(path, encoding="utf-8") as f: - assert f.read() == expected_ascii + path = str(temp_file) + df.to_csv(path, encoding="ascii") + with open(path, encoding="utf-8") as f: + assert f.read() == expected_ascii - def test_to_csv_string_array_utf8(self): + def test_to_csv_string_array_utf8(self, temp_file): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) @@ -413,80 +409,81 @@ def test_to_csv_string_array_utf8(self): 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ - with tm.ensure_clean("unicode_test.csv") as path: - df.to_csv(path, encoding="utf-8") - with open(path, encoding="utf-8") as f: - assert f.read() == expected_utf8 + path = str(temp_file) + df.to_csv(path, encoding="utf-8") + with open(path, encoding="utf-8") as f: + assert f.read() == expected_utf8 - def test_to_csv_string_with_lf(self): + def test_to_csv_string_with_lf(self, temp_file): # GH 20353 data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = DataFrame(data) - with tm.ensure_clean("lf_test.csv") as path: - # case 1: The default line terminator(=os.linesep)(PR 21406) - os_linesep = os.linesep.encode("utf-8") - expected_noarg = ( - b"int,str_lf" - + os_linesep - + b"1,abc" - + os_linesep - + b'2,"d\nef"' - + os_linesep - + b'3,"g\nh\n\ni"' - + os_linesep - ) - df.to_csv(path, index=False) - with open(path, "rb") as f: - assert f.read() == expected_noarg - with tm.ensure_clean("lf_test.csv") as path: - # case 2: LF as line terminator - expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' - df.to_csv(path, lineterminator="\n", index=False) - with open(path, "rb") as f: - assert f.read() == expected_lf - with tm.ensure_clean("lf_test.csv") as path: - # case 3: CRLF as line terminator - # 'lineterminator' should not change inner element - expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' - df.to_csv(path, lineterminator="\r\n", index=False) - with open(path, "rb") as f: - assert f.read() == expected_crlf - - def test_to_csv_string_with_crlf(self): + path = 
str(temp_file) + + # case 1: The default line terminator(=os.linesep)(PR 21406) + os_linesep = os.linesep.encode("utf-8") + expected_noarg = ( + b"int,str_lf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\nef"' + + os_linesep + + b'3,"g\nh\n\ni"' + + os_linesep + ) + df.to_csv(path, index=False) + with open(path, "rb") as f: + assert f.read() == expected_noarg + + # case 2: LF as line terminator + expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' + df.to_csv(path, lineterminator="\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_lf + + # case 3: CRLF as line terminator + # 'lineterminator' should not change inner element + expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' + df.to_csv(path, lineterminator="\r\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_crlf + + def test_to_csv_string_with_crlf(self, temp_file): # GH 20353 data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = DataFrame(data) - with tm.ensure_clean("crlf_test.csv") as path: - # case 1: The default line terminator(=os.linesep)(PR 21406) - os_linesep = os.linesep.encode("utf-8") - expected_noarg = ( - b"int,str_crlf" - + os_linesep - + b"1,abc" - + os_linesep - + b'2,"d\r\nef"' - + os_linesep - + b'3,"g\r\nh\r\n\r\ni"' - + os_linesep - ) - df.to_csv(path, index=False) - with open(path, "rb") as f: - assert f.read() == expected_noarg - with tm.ensure_clean("crlf_test.csv") as path: - # case 2: LF as line terminator - expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' - df.to_csv(path, lineterminator="\n", index=False) - with open(path, "rb") as f: - assert f.read() == expected_lf - with tm.ensure_clean("crlf_test.csv") as path: - # case 3: CRLF as line terminator - # 'lineterminator' should not change inner element - expected_crlf = ( - b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' - ) - df.to_csv(path, lineterminator="\r\n", index=False) - with open(path, "rb") as f: - assert f.read() == expected_crlf + path = str(temp_file) + # case 1: The default line terminator(=os.linesep)(PR 21406) + os_linesep = os.linesep.encode("utf-8") + expected_noarg = ( + b"int,str_crlf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\r\nef"' + + os_linesep + + b'3,"g\r\nh\r\n\r\ni"' + + os_linesep + ) + df.to_csv(path, index=False) + with open(path, "rb") as f: + assert f.read() == expected_noarg + + # case 2: LF as line terminator + expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' + df.to_csv(path, lineterminator="\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_lf + + # case 3: CRLF as line terminator + # 'lineterminator' should not change inner element + expected_crlf = ( + b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' + ) + df.to_csv(path, lineterminator="\r\n", index=False) + with open(path, "rb") as f: + assert f.read() == expected_crlf def test_to_csv_stdout_file(self, capsys): # GH 21561 @@ -508,7 +505,7 @@ def test_to_csv_stdout_file(self, capsys): "(https://docs.python.org/3/library/csv.html#csv.writer)" ), ) - def test_to_csv_write_to_open_file(self): + def test_to_csv_write_to_open_file(self, temp_file): # GH 21696 df = DataFrame({"a": ["x", "y", "z"]}) expected = """\ @@ -517,31 +514,37 @@ def test_to_csv_write_to_open_file(self): y z """ - with tm.ensure_clean("test.txt") as path: - with open(path, "w", encoding="utf-8") as f: - f.write("manual header\n") - df.to_csv(f, header=None, 
index=None) - with open(path, encoding="utf-8") as f: - assert f.read() == expected - - def test_to_csv_write_to_open_file_with_newline_py3(self): + path = str(temp_file) + with open(path, "w", encoding="utf-8") as f: + f.write("manual header\n") + df.to_csv(f, header=None, index=None) + with open(path, encoding="utf-8") as f: + assert f.read() == expected + + def test_to_csv_write_to_open_file_with_newline_py3(self, temp_file): # see gh-21696 # see gh-20353 df = DataFrame({"a": ["x", "y", "z"]}) expected_rows = ["x", "y", "z"] expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) - with tm.ensure_clean("test.txt") as path: - with open(path, "w", newline="", encoding="utf-8") as f: - f.write("manual header\n") - df.to_csv(f, header=None, index=None) - with open(path, "rb") as f: - assert f.read() == bytes(expected, "utf-8") + path = str(temp_file) + with open(path, "w", newline="", encoding="utf-8") as f: + f.write("manual header\n") + df.to_csv(f, header=None, index=None) + + with open(path, "rb") as f: + assert f.read() == bytes(expected, "utf-8") @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_to_csv_compression( - self, compression_only, read_infer, to_infer, compression_to_extension + self, + compression_only, + read_infer, + to_infer, + compression_to_extension, + temp_file, ): # see gh-15008 compression = compression_only @@ -555,12 +558,12 @@ def test_to_csv_compression( to_compression = "infer" if to_infer else compression read_compression = "infer" if read_infer else compression - with tm.ensure_clean(filename) as path: - df.to_csv(path, compression=to_compression) - result = pd.read_csv(path, index_col=0, compression=read_compression) - tm.assert_frame_equal(result, df) + path = str(temp_file) + df.to_csv(path, compression=to_compression) + result = pd.read_csv(path, index_col=0, compression=read_compression) + tm.assert_frame_equal(result, df) - def test_to_csv_compression_dict(self, compression_only): + def test_to_csv_compression_dict(self, compression_only, temp_file): # GH 26023 method = compression_only df = DataFrame({"ABC": [1]}) @@ -570,34 +573,36 @@ def test_to_csv_compression_dict(self, compression_only): "zstd": "zst", }.get(method, method) filename += extension - with tm.ensure_clean(filename) as path: - df.to_csv(path, compression={"method": method}) - read_df = pd.read_csv(path, index_col=0) - tm.assert_frame_equal(read_df, df) - def test_to_csv_compression_dict_no_method_raises(self): + path = str(temp_file) + df.to_csv(path, compression={"method": method}) + read_df = pd.read_csv(path, index_col=0) + tm.assert_frame_equal(read_df, df) + + def test_to_csv_compression_dict_no_method_raises(self, temp_file): # GH 26023 df = DataFrame({"ABC": [1]}) compression = {"some_option": True} msg = "must have key 'method'" - with tm.ensure_clean("out.zip") as path: - with pytest.raises(ValueError, match=msg): - df.to_csv(path, compression=compression) + path = str(temp_file) + with pytest.raises(ValueError, match=msg): + df.to_csv(path, compression=compression) @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"]) - def test_to_csv_zip_arguments(self, compression, archive_name): + def test_to_csv_zip_arguments(self, compression, archive_name, temp_file): # GH 26023 df = DataFrame({"ABC": [1]}) - with tm.ensure_clean("to_csv_archive_name.zip") as path: - df.to_csv( - path, compression={"method": 
compression, "archive_name": archive_name} - ) - with ZipFile(path) as zp: - assert len(zp.filelist) == 1 - archived_file = zp.filelist[0].filename - assert archived_file == archive_name + + path = str(temp_file) + df.to_csv( + path, compression={"method": compression, "archive_name": archive_name} + ) + with ZipFile(path) as zp: + assert len(zp.filelist) == 1 + archived_file = zp.filelist[0].filename + assert archived_file == archive_name @pytest.mark.parametrize( "filename,expected_arcname", @@ -660,17 +665,18 @@ def test_na_rep_truncated(self): assert result == expected @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) - def test_to_csv_errors(self, errors): + def test_to_csv_errors(self, errors, temp_file): # GH 22610 data = ["\ud800foo"] ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) - with tm.ensure_clean("test.csv") as path: - ser.to_csv(path, errors=errors) + + path = str(temp_file) + ser.to_csv(path, errors=errors) # No use in reading back the data as it is not the same anymore # due to the error handling @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_binary_handle(self, mode): + def test_to_csv_binary_handle(self, mode, temp_file): """ Binary file objects should work (if 'mode' contains a 'b') or even without it in most cases. @@ -682,13 +688,14 @@ def test_to_csv_binary_handle(self, mode): columns=Index(list("ABCD")), index=Index([f"i-{i}" for i in range(30)]), ) - with tm.ensure_clean() as path: - with open(path, mode="w+b") as handle: - df.to_csv(handle, mode=mode) - tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + + path = str(temp_file) + with open(path, mode="w+b") as handle: + df.to_csv(handle, mode=mode) + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_encoding_binary_handle(self, mode): + def test_to_csv_encoding_binary_handle(self, mode, temp_file): """ Binary file objects should honor a specified encoding. 
@@ -705,26 +712,24 @@ def test_to_csv_encoding_binary_handle(self, mode): assert buffer.getvalue().startswith(content) # example from GH 13068 - with tm.ensure_clean() as path: - with open(path, "w+b") as handle: - DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig") + path = str(temp_file) + with open(path, "w+b") as handle: + DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig") - handle.seek(0) - assert handle.read().startswith(b'\xef\xbb\xbf""') + handle.seek(0) + assert handle.read().startswith(b'\xef\xbb\xbf""') -def test_to_csv_iterative_compression_name(compression): +def test_to_csv_iterative_compression_name(compression, temp_file): # GH 38714 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD")), index=Index([f"i-{i}" for i in range(30)]), ) - with tm.ensure_clean() as path: - df.to_csv(path, compression=compression, chunksize=1) - tm.assert_frame_equal( - pd.read_csv(path, compression=compression, index_col=0), df - ) + path = str(temp_file) + df.to_csv(path, compression=compression, chunksize=1) + tm.assert_frame_equal(pd.read_csv(path, compression=compression, index_col=0), df) def test_to_csv_iterative_compression_buffer(compression): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 908a26874f150..799669ee2c5d1 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -163,8 +163,10 @@ def timezone_aware_date_list(request): return request.param +@pytest.fixture def check_round_trip( df, + temp_file, engine=None, path=None, write_kwargs=None, @@ -223,8 +225,8 @@ def compare(repeat): ) if path is None: - with tm.ensure_clean() as path: - compare(repeat) + path = str(temp_file) + compare(repeat) else: compare(repeat) @@ -337,49 +339,48 @@ def test_get_engine_auto_error_message(): get_engine("auto") -def test_cross_engine_pa_fp(df_cross_compat, pa, fp): +def test_cross_engine_pa_fp(df_cross_compat, pa, fp, temp_file): # cross-compat with differing reading/writing engines df = df_cross_compat - with tm.ensure_clean() as path: - df.to_parquet(path, engine=pa, compression=None) + path = str(temp_file) + df.to_parquet(path, engine=pa, compression=None) - result = read_parquet(path, engine=fp) - tm.assert_frame_equal(result, df) + result = read_parquet(path, engine=fp) + tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=fp, columns=["a", "d"]) - tm.assert_frame_equal(result, df[["a", "d"]]) + result = read_parquet(path, engine=fp, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) -def test_cross_engine_fp_pa(df_cross_compat, pa, fp): +def test_cross_engine_fp_pa(df_cross_compat, pa, fp, temp_file): # cross-compat with differing reading/writing engines df = df_cross_compat - with tm.ensure_clean() as path: - df.to_parquet(path, engine=fp, compression=None) + path = str(temp_file) - result = read_parquet(path, engine=pa) - tm.assert_frame_equal(result, df) + df.to_parquet(path, engine=fp, compression=None) + + result = read_parquet(path, engine=pa) + tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=pa, columns=["a", "d"]) - tm.assert_frame_equal(result, df[["a", "d"]]) + result = read_parquet(path, engine=pa, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) class Base: - def check_error_on_write(self, df, engine, exc, err_msg): + def check_error_on_write(self, df, engine, exc, err_msg, temp_file_path): # check that we are raising the exception on writing - with tm.ensure_clean() as path: - with 
pytest.raises(exc, match=err_msg): - to_parquet(df, path, engine, compression=None) + with pytest.raises(exc, match=err_msg): + to_parquet(df, temp_file_path, engine, compression=None) - def check_external_error_on_write(self, df, engine, exc): + def check_external_error_on_write(self, df, engine, exc, temp_file_path): # check that an external library is raising the exception on writing - with tm.ensure_clean() as path: - with tm.external_error_raised(exc): - to_parquet(df, path, engine, compression=None) + with tm.external_error_raised(exc): + to_parquet(df, temp_file_path, engine, compression=None) class TestBasic(Base): - def test_error(self, engine): + def test_error(self, engine, temp_file): for obj in [ pd.Series([1, 2, 3]), 1, @@ -388,7 +389,8 @@ def test_error(self, engine): np.array([1, 2, 3]), ]: msg = "to_parquet only supports IO with DataFrames" - self.check_error_on_write(obj, engine, ValueError, msg) + path = str(temp_file) + self.check_error_on_write(obj, engine, ValueError, msg, path) def test_columns_dtypes(self, engine): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -513,21 +515,22 @@ def test_write_ignoring_index(self, engine): expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) - def test_write_column_multiindex(self, engine): + def test_write_column_multiindex(self, engine, temp_file): # Not able to write column multi-indexes with non-string column names. mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame( np.random.default_rng(2).standard_normal((4, 3)), columns=mi_columns ) + path = str(temp_file) if engine == "fastparquet": self.check_error_on_write( - df, engine, TypeError, "Column name must be a string" + df, engine, TypeError, "Column name must be a string", path ) elif engine == "pyarrow": check_round_trip(df, engine) - def test_write_column_multiindex_nonstring(self, engine): + def test_write_column_multiindex_nonstring(self, engine, temp_file): # GH #34777 # Not able to write column multi-indexes with non-string column names @@ -539,8 +542,9 @@ def test_write_column_multiindex_nonstring(self, engine): np.random.default_rng(2).standard_normal((8, 8)), columns=arrays ) df.columns.names = ["Level1", "Level2"] + path = str(temp_file) if engine == "fastparquet": - self.check_error_on_write(df, engine, ValueError, "Column name") + self.check_error_on_write(df, engine, ValueError, "Column name", path) elif engine == "pyarrow": check_round_trip(df, engine) @@ -575,7 +579,7 @@ def test_write_column_index_string(self, pa): check_round_trip(df, engine) - def test_write_column_index_nonstring(self, engine): + def test_write_column_index_nonstring(self, engine, temp_file): # GH #34777 # Write column indexes with string column names @@ -584,14 +588,15 @@ def test_write_column_index_nonstring(self, engine): np.random.default_rng(2).standard_normal((8, 4)), columns=arrays ) df.columns.name = "NonStringCol" + path = str(temp_file) if engine == "fastparquet": self.check_error_on_write( - df, engine, TypeError, "Column name must be a string" + df, engine, TypeError, "Column name must be a string", path ) else: check_round_trip(df, engine) - def test_dtype_backend(self, engine, request): + def test_dtype_backend(self, engine, request, temp_file): pq = pytest.importorskip("pyarrow.parquet") if engine == "fastparquet": @@ -615,11 +620,11 @@ def test_dtype_backend(self, engine, request): "g": pyarrow.array([1.0, 2.0, 3.0, None], "float64"), } ) - with 
tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path, engine=engine) - result2 = read_parquet(path, engine=engine, dtype_backend="numpy_nullable") + path = str(temp_file) + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path, engine=engine) + result2 = read_parquet(path, engine=engine, dtype_backend="numpy_nullable") assert result1["a"].dtype == np.dtype("float64") expected = pd.DataFrame( @@ -730,29 +735,34 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected["datetime_with_nat"] = expected["datetime_with_nat"].astype("M8[ms]") tm.assert_frame_equal(res, expected) - def test_duplicate_columns(self, pa): + def test_duplicate_columns(self, pa, temp_file): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() - self.check_error_on_write(df, pa, ValueError, "Duplicate column names found") + path = str(temp_file) + self.check_error_on_write( + df, pa, ValueError, "Duplicate column names found", path + ) def test_timedelta(self, pa): df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) check_round_trip(df, pa) - def test_unsupported(self, pa): + def test_unsupported(self, pa, temp_file): # mixed python objects df = pd.DataFrame({"a": ["a", 1, 2.0]}) # pyarrow 0.11 raises ArrowTypeError # older pyarrows raise ArrowInvalid - self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + path = str(temp_file) + self.check_external_error_on_write(df, pa, pyarrow.ArrowException, path) - def test_unsupported_float16(self, pa): + def test_unsupported_float16(self, pa, temp_file): # #44847, #44914 # Not able to write float 16 column using pyarrow. data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) + path = str(temp_file) if pa_version_under15p0: - self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + self.check_external_error_on_write(df, pa, pyarrow.ArrowException, path) else: check_round_trip(df, pa) @@ -765,18 +775,18 @@ def test_unsupported_float16(self, pa): ) @pytest.mark.skipif(not pa_version_under15p0, reason="float16 works on 15") @pytest.mark.parametrize("path_type", [str, pathlib.Path]) - def test_unsupported_float16_cleanup(self, pa, path_type): + def test_unsupported_float16_cleanup(self, pa, path_type, temp_file): # #44847, #44914 # Not able to write float 16 column using pyarrow. 
# Tests cleanup by pyarrow in case of an error data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) - with tm.ensure_clean() as path_str: - path = path_type(path_str) - with tm.external_error_raised(pyarrow.ArrowException): - df.to_parquet(path=path, engine=pa) - assert not os.path.isfile(path) + path_str = str(temp_file) + path = path_type(path_str) + with tm.external_error_raised(pyarrow.ArrowException): + df.to_parquet(path=path, engine=pa) + assert not os.path.isfile(path) def test_categorical(self, pa): # supported in >= 0.7.0 @@ -1005,13 +1015,13 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) check_round_trip(df, pa, check_dtype=False, expected=expected) - def test_filter_row_groups(self, pa): + def test_filter_row_groups(self, pa, temp_file): # https://github.com/pandas-dev/pandas/issues/26551 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": list(range(3))}) - with tm.ensure_clean() as path: - df.to_parquet(path, engine=pa) - result = read_parquet(path, pa, filters=[("a", "==", 0)]) + path = str(temp_file) + df.to_parquet(path, engine=pa) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") @@ -1206,32 +1216,34 @@ def test_basic(self, fp, df_full, request): df["timedelta"] = pd.timedelta_range("1 day", periods=3) check_round_trip(df, fp) - def test_columns_dtypes_invalid(self, fp): + def test_columns_dtypes_invalid(self, fp, temp_file): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) err = TypeError msg = "Column name must be a string" + path = str(temp_file) # numeric df.columns = [0, 1] - self.check_error_on_write(df, fp, err, msg) + self.check_error_on_write(df, fp, err, msg, path) # bytes df.columns = [b"foo", b"bar"] - self.check_error_on_write(df, fp, err, msg) + self.check_error_on_write(df, fp, err, msg, path) # python object df.columns = [ datetime.datetime(2011, 1, 1, 0, 0), datetime.datetime(2011, 1, 1, 1, 1), ] - self.check_error_on_write(df, fp, err, msg) + self.check_error_on_write(df, fp, err, msg, path) - def test_duplicate_columns(self, fp): + def test_duplicate_columns(self, fp, temp_file): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() msg = "Cannot create parquet dataset with duplicate column names" - self.check_error_on_write(df, fp, ValueError, msg) + path = str(temp_file) + self.check_error_on_write(df, fp, ValueError, msg, path) def test_bool_with_none(self, fp, request): df = pd.DataFrame({"a": [True, None, False]}) @@ -1240,27 +1252,28 @@ def test_bool_with_none(self, fp, request): # float64 check_round_trip(df, fp, expected=expected, check_dtype=False) - def test_unsupported(self, fp): + def test_unsupported(self, fp, temp_file): # period df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) # error from fastparquet -> don't check exact error message - self.check_error_on_write(df, fp, ValueError, None) + path = str(temp_file) + self.check_error_on_write(df, fp, ValueError, None, path) # mixed df = pd.DataFrame({"a": ["a", 1, 2.0]}) msg = "Can't infer object conversion type" - self.check_error_on_write(df, fp, ValueError, msg) + self.check_error_on_write(df, fp, ValueError, msg, path) def test_categorical(self, fp): df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) check_round_trip(df, fp) - def 
test_filter_row_groups(self, fp): + def test_filter_row_groups(self, fp, temp_file): d = {"a": list(range(3))} df = pd.DataFrame(d) - with tm.ensure_clean() as path: - df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) - result = read_parquet(path, fp, filters=[("a", "==", 0)]) + path = str(temp_file) + df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) + result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 @pytest.mark.single_cpu @@ -1356,91 +1369,86 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): expected.index.name = "index" check_round_trip(df, fp, expected=expected) - def test_close_file_handle_on_read_error(self): - with tm.ensure_clean("test.parquet") as path: - pathlib.Path(path).write_bytes(b"breakit") - with tm.external_error_raised(Exception): # Not important which exception - read_parquet(path, engine="fastparquet") - # The next line raises an error on Windows if the file is still open - pathlib.Path(path).unlink(missing_ok=False) + def test_close_file_handle_on_read_error(self, temp_file): + path = str(temp_file) + pathlib.Path(path).write_bytes(b"breakit") + with tm.external_error_raised(Exception): # Not important which exception + read_parquet(path, engine="fastparquet") + # The next line raises an error on Windows if the file is still open + pathlib.Path(path).unlink(missing_ok=False) - def test_bytes_file_name(self, engine): + def test_bytes_file_name(self, engine, temp_file): # GH#48944 df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - with tm.ensure_clean("test.parquet") as path: - with open(path.encode(), "wb") as f: - df.to_parquet(f) + path = str(temp_file) + with open(path.encode(), "wb") as f: + df.to_parquet(f) - result = read_parquet(path, engine=engine) + result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) - def test_filesystem_notimplemented(self): + def test_filesystem_notimplemented(self, temp_file): pytest.importorskip("fastparquet") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - with tm.ensure_clean() as path: - with pytest.raises( - NotImplementedError, match="filesystem is not implemented" - ): - df.to_parquet(path, engine="fastparquet", filesystem="foo") - - with tm.ensure_clean() as path: - pathlib.Path(path).write_bytes(b"foo") - with pytest.raises( - NotImplementedError, match="filesystem is not implemented" - ): - read_parquet(path, engine="fastparquet", filesystem="foo") - - def test_invalid_filesystem(self): + path = str(temp_file) + with pytest.raises(NotImplementedError, match="filesystem is not implemented"): + df.to_parquet(path, engine="fastparquet", filesystem="foo") + + pathlib.Path(path).write_bytes(b"foo") + with pytest.raises(NotImplementedError, match="filesystem is not implemented"): + read_parquet(path, engine="fastparquet", filesystem="foo") + + def test_invalid_filesystem(self, temp_file): pytest.importorskip("pyarrow") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - with tm.ensure_clean() as path: - with pytest.raises( - ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" - ): - df.to_parquet(path, engine="pyarrow", filesystem="foo") - - with tm.ensure_clean() as path: - pathlib.Path(path).write_bytes(b"foo") - with pytest.raises( - ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" - ): - read_parquet(path, engine="pyarrow", filesystem="foo") - - def test_unsupported_pa_filesystem_storage_options(self): + path = str(temp_file) + + with pytest.raises( + 
ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" + ): + df.to_parquet(path, engine="pyarrow", filesystem="foo") + + pathlib.Path(path).write_bytes(b"foo") + with pytest.raises( + ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" + ): + read_parquet(path, engine="pyarrow", filesystem="foo") + + def test_unsupported_pa_filesystem_storage_options(self, temp_file): pa_fs = pytest.importorskip("pyarrow.fs") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - with tm.ensure_clean() as path: - with pytest.raises( - NotImplementedError, - match="storage_options not supported with a pyarrow FileSystem.", - ): - df.to_parquet( - path, - engine="pyarrow", - filesystem=pa_fs.LocalFileSystem(), - storage_options={"foo": "bar"}, - ) - - with tm.ensure_clean() as path: - pathlib.Path(path).write_bytes(b"foo") - with pytest.raises( - NotImplementedError, - match="storage_options not supported with a pyarrow FileSystem.", - ): - read_parquet( - path, - engine="pyarrow", - filesystem=pa_fs.LocalFileSystem(), - storage_options={"foo": "bar"}, - ) - - def test_invalid_dtype_backend(self, engine): + path = str(temp_file) + + with pytest.raises( + NotImplementedError, + match="storage_options not supported with a pyarrow FileSystem.", + ): + df.to_parquet( + path, + engine="pyarrow", + filesystem=pa_fs.LocalFileSystem(), + storage_options={"foo": "bar"}, + ) + + pathlib.Path(path).write_bytes(b"foo") + with pytest.raises( + NotImplementedError, + match="storage_options not supported with a pyarrow FileSystem.", + ): + read_parquet( + path, + engine="pyarrow", + filesystem=pa_fs.LocalFileSystem(), + storage_options={"foo": "bar"}, + ) + + def test_invalid_dtype_backend(self, engine, temp_file): msg = ( "dtype_backend numpy is invalid, only 'numpy_nullable' and " "'pyarrow' are allowed." ) df = pd.DataFrame({"int": list(range(1, 4))}) - with tm.ensure_clean("tmp.parquet") as path: - df.to_parquet(path) - with pytest.raises(ValueError, match=msg): - read_parquet(path, dtype_backend="numpy") + path = str(temp_file) + df.to_parquet(path) + with pytest.raises(ValueError, match=msg): + read_parquet(path, dtype_backend="numpy") From c184ea559d9e275a536fb8ecdd6d5a46448fa7cb Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Fri, 26 Sep 2025 21:09:27 +0000 Subject: [PATCH 04/10] extension fix --- pandas/tests/io/formats/test_to_csv.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 385615fe4e3a2..f716cde8c3ad8 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -549,16 +549,12 @@ def test_to_csv_compression( # see gh-15008 compression = compression_only - # We'll complete file extension subsequently. - filename = "test." - filename += compression_to_extension[compression] - df = DataFrame({"A": [1]}) to_compression = "infer" if to_infer else compression read_compression = "infer" if read_infer else compression - path = str(temp_file) + path = str(temp_file) + "." + compression_to_extension[compression] df.to_csv(path, compression=to_compression) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) @@ -567,14 +563,12 @@ def test_to_csv_compression_dict(self, compression_only, temp_file): # GH 26023 method = compression_only df = DataFrame({"ABC": [1]}) - filename = "to_csv_compress_as_dict." 
extension = { "gzip": "gz", "zstd": "zst", }.get(method, method) - filename += extension - path = str(temp_file) + path = str(temp_file) + "." + extension df.to_csv(path, compression={"method": method}) read_df = pd.read_csv(path, index_col=0) tm.assert_frame_equal(read_df, df) @@ -595,7 +589,7 @@ def test_to_csv_zip_arguments(self, compression, archive_name, temp_file): # GH 26023 df = DataFrame({"ABC": [1]}) - path = str(temp_file) + path = str(temp_file) + ".zip" df.to_csv( path, compression={"method": compression, "archive_name": archive_name} ) From afdd1810381d90799b7c82a6c30e1848106e170e Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Mon, 29 Sep 2025 00:27:48 +0000 Subject: [PATCH 05/10] refactor test parquet to fix fixture error --- pandas/tests/io/test_parquet.py | 193 +++++++++++++++++++------------- 1 file changed, 113 insertions(+), 80 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 799669ee2c5d1..79660394e1197 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -163,7 +163,6 @@ def timezone_aware_date_list(request): return request.param -@pytest.fixture def check_round_trip( df, temp_file, @@ -247,33 +246,33 @@ def check_partition_names(path, expected): assert dataset.partitioning.schema.names == expected -def test_invalid_engine(df_compat): +def test_invalid_engine(df_compat, temp_file): msg = "engine must be one of 'pyarrow', 'fastparquet'" with pytest.raises(ValueError, match=msg): - check_round_trip(df_compat, "foo", "bar") + check_round_trip(df_compat, temp_file, "foo", "bar") -def test_options_py(df_compat, pa, using_infer_string): +def test_options_py(df_compat, pa, using_infer_string, temp_file): # use the set option if using_infer_string and not pa_version_under19p0: df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): - check_round_trip(df_compat) + check_round_trip(df_compat, temp_file) -def test_options_fp(df_compat, fp): +def test_options_fp(df_compat, fp, temp_file): # use the set option with pd.option_context("io.parquet.engine", "fastparquet"): - check_round_trip(df_compat) + check_round_trip(df_compat, temp_file) -def test_options_auto(df_compat, fp, pa): +def test_options_auto(df_compat, fp, pa, temp_file): # use the set option with pd.option_context("io.parquet.engine", "auto"): - check_round_trip(df_compat) + check_round_trip(df_compat, temp_file) def test_options_get_engine(fp, pa): @@ -392,25 +391,29 @@ def test_error(self, engine, temp_file): path = str(temp_file) self.check_error_on_write(obj, engine, ValueError, msg, path) - def test_columns_dtypes(self, engine): + def test_columns_dtypes(self, engine, temp_file): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) # unicode df.columns = ["foo", "bar"] - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine, write_kwargs={"compression": compression}) - def test_read_columns(self, engine): + def test_read_columns(self, engine, temp_file): # GH18154 df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) expected = pd.DataFrame({"string": list("abc")}) check_round_trip( - df, engine, expected=expected, read_kwargs={"columns": ["string"]} + df, + temp_file, + engine, + expected=expected, + read_kwargs={"columns": ["string"]}, ) def 
test_read_filters(self, engine, tmp_path): @@ -432,10 +435,10 @@ def test_read_filters(self, engine, tmp_path): repeat=1, ) - def test_write_index(self): + def test_write_index(self, temp_file): pytest.importorskip("pyarrow") df = pd.DataFrame({"A": [1, 2, 3]}) - check_round_trip(df, "pyarrow") + check_round_trip(df, temp_file, "pyarrow") indexes = [ [2, 3, 4], @@ -448,23 +451,23 @@ def test_write_index(self): df.index = index if isinstance(index, pd.DatetimeIndex): df.index = df.index._with_freq(None) # freq doesn't round-trip - check_round_trip(df, "pyarrow") + check_round_trip(df, temp_file, "pyarrow") # index with meta-data df.index = [0, 1, 2] df.index.name = "foo" - check_round_trip(df, "pyarrow") + check_round_trip(df, temp_file, "pyarrow") - def test_write_multiindex(self, pa): + def test_write_multiindex(self, pa, temp_file): # Not supported in fastparquet as of 0.1.3 or older pyarrow version engine = pa df = pd.DataFrame({"A": [1, 2, 3]}) index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df.index = index - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) - def test_multiindex_with_columns(self, pa): + def test_multiindex_with_columns(self, pa, temp_file): engine = pa dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") df = pd.DataFrame( @@ -478,12 +481,16 @@ def test_multiindex_with_columns(self, pa): for index in [index1, index2]: df.index = index - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) check_round_trip( - df, engine, read_kwargs={"columns": ["A", "B"]}, expected=df[["A", "B"]] + df, + temp_file, + engine, + read_kwargs={"columns": ["A", "B"]}, + expected=df[["A", "B"]], ) - def test_write_ignoring_index(self, engine): + def test_write_ignoring_index(self, engine, temp_file): # ENH 20768 # Ensure index=False omits the index from the written Parquet file. df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]}) @@ -494,14 +501,18 @@ def test_write_ignoring_index(self, engine): # have the default integer index. expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + check_round_trip( + df, temp_file, engine, write_kwargs=write_kwargs, expected=expected + ) # Ignore custom index df = pd.DataFrame( {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"] ) - check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + check_round_trip( + df, temp_file, engine, write_kwargs=write_kwargs, expected=expected + ) # Ignore multi-indexes as well. arrays = [ @@ -513,7 +524,9 @@ def test_write_ignoring_index(self, engine): ) expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + check_round_trip( + df, temp_file, engine, write_kwargs=write_kwargs, expected=expected + ) def test_write_column_multiindex(self, engine, temp_file): # Not able to write column multi-indexes with non-string column names. 
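The `-@pytest.fixture` removal at the top of this patch is the actual "fixture error" fix from the commit subject: pytest refuses direct calls to fixture-decorated functions (it aborts with `Fixture "..." called directly`), so a helper that every test invokes by name must stay a plain function, with `temp_file` threaded through as an ordinary argument. A simplified sketch of the resulting shape (the real helper also accepts `write_kwargs`, `read_kwargs`, `expected`, and repeat logic):

import pandas as pd
import pandas._testing as tm


def check_round_trip(df, temp_file, engine="pyarrow", path=None):
    # Plain helper, not a fixture: tests call it directly and pass the
    # temp_file fixture value through explicitly.
    if path is None:
        path = str(temp_file)
    df.to_parquet(path, engine=engine)
    result = pd.read_parquet(path, engine=engine)
    tm.assert_frame_equal(result, df)
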
@@ -528,7 +541,7 @@ def test_write_column_multiindex(self, engine, temp_file): df, engine, TypeError, "Column name must be a string", path ) elif engine == "pyarrow": - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) def test_write_column_multiindex_nonstring(self, engine, temp_file): # GH #34777 @@ -546,9 +559,9 @@ def test_write_column_multiindex_nonstring(self, engine, temp_file): if engine == "fastparquet": self.check_error_on_write(df, engine, ValueError, "Column name", path) elif engine == "pyarrow": - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) - def test_write_column_multiindex_string(self, pa): + def test_write_column_multiindex_string(self, pa, temp_file): # GH #34777 # Not supported in fastparquet as of 0.1.3 engine = pa @@ -563,9 +576,9 @@ def test_write_column_multiindex_string(self, pa): ) df.columns.names = ["ColLevel1", "ColLevel2"] - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) - def test_write_column_index_string(self, pa): + def test_write_column_index_string(self, pa, temp_file): # GH #34777 # Not supported in fastparquet as of 0.1.3 engine = pa @@ -577,7 +590,7 @@ def test_write_column_index_string(self, pa): ) df.columns.name = "StringCol" - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) def test_write_column_index_nonstring(self, engine, temp_file): # GH #34777 @@ -594,7 +607,7 @@ def test_write_column_index_nonstring(self, engine, temp_file): df, engine, TypeError, "Column name must be a string", path ) else: - check_round_trip(df, engine) + check_round_trip(df, temp_file, engine) def test_dtype_backend(self, engine, request, temp_file): pq = pytest.importorskip("pyarrow.parquet") @@ -659,7 +672,7 @@ def test_dtype_backend(self, engine, request, temp_file): "string", ], ) - def test_read_empty_array(self, pa, dtype): + def test_read_empty_array(self, pa, dtype, temp_file): # GH #41241 df = pd.DataFrame( { @@ -676,7 +689,11 @@ def test_read_empty_array(self, pa, dtype): } ) check_round_trip( - df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected + df, + temp_file, + pa, + read_kwargs={"dtype_backend": "numpy_nullable"}, + expected=expected, ) @pytest.mark.network @@ -696,7 +713,7 @@ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): class TestParquetPyArrow(Base): @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") - def test_basic(self, pa, df_full): + def test_basic(self, pa, df_full, temp_file): df = df_full pytest.importorskip("pyarrow", "11.0.0") @@ -706,9 +723,9 @@ def test_basic(self, pa, df_full): df["datetime_tz"] = dti df["bool_with_none"] = [True, None, True] - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) - def test_basic_subset_columns(self, pa, df_full): + def test_basic_subset_columns(self, pa, df_full, temp_file): # GH18628 df = df_full @@ -717,6 +734,7 @@ def test_basic_subset_columns(self, pa, df_full): check_round_trip( df, + temp_file, pa, expected=df[["string", "int"]], read_kwargs={"columns": ["string", "int"]}, @@ -743,9 +761,9 @@ def test_duplicate_columns(self, pa, temp_file): df, pa, ValueError, "Duplicate column names found", path ) - def test_timedelta(self, pa): + def test_timedelta(self, pa, temp_file): df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) def test_unsupported(self, pa, temp_file): # mixed python objects @@ -764,7 +782,7 @@ def test_unsupported_float16(self, pa, 
temp_file): if pa_version_under15p0: self.check_external_error_on_write(df, pa, pyarrow.ArrowException, path) else: - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) @pytest.mark.xfail( is_platform_windows(), @@ -788,7 +806,7 @@ def test_unsupported_float16_cleanup(self, pa, path_type, temp_file): df.to_parquet(path=path, engine=pa) assert not os.path.isfile(path) - def test_categorical(self, pa): + def test_categorical(self, pa, temp_file): # supported in >= 0.7.0 df = pd.DataFrame( { @@ -807,15 +825,18 @@ def test_categorical(self, pa): } ) - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) @pytest.mark.single_cpu - def test_s3_roundtrip_explicit_fs(self, df_compat, s3_bucket_public, s3so, pa): + def test_s3_roundtrip_explicit_fs( + self, df_compat, s3_bucket_public, s3so, pa, temp_file + ): s3fs = pytest.importorskip("s3fs") s3 = s3fs.S3FileSystem(**s3so) kw = {"filesystem": s3} check_round_trip( df_compat, + temp_file, pa, path=f"{s3_bucket_public.name}/pyarrow.parquet", read_kwargs=kw, @@ -823,11 +844,12 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_bucket_public, s3so, pa): ) @pytest.mark.single_cpu - def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, pa): + def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, pa, temp_file): # GH #19134 s3so = {"storage_options": s3so} check_round_trip( df_compat, + temp_file, pa, path=f"s3://{s3_bucket_public.name}/pyarrow.parquet", read_kwargs=s3so, @@ -837,7 +859,7 @@ def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, pa): @pytest.mark.single_cpu @pytest.mark.parametrize("partition_col", [["A"], []]) def test_s3_roundtrip_for_dir( - self, df_compat, s3_bucket_public, pa, partition_col, s3so + self, df_compat, s3_bucket_public, pa, partition_col, s3so, temp_file ): pytest.importorskip("s3fs") # GH #26388 @@ -854,6 +876,7 @@ def test_s3_roundtrip_for_dir( check_round_trip( df_compat, + temp_file, pa, expected=expected_df, path=f"s3://{s3_bucket_public.name}/parquet_dir", @@ -916,20 +939,22 @@ def test_partition_cols_pathlib(self, tmp_path, pa, df_compat, path_type): df.to_parquet(path, partition_cols=partition_cols_list) assert read_parquet(path).shape == df.shape - def test_empty_dataframe(self, pa): + def test_empty_dataframe(self, pa, temp_file): # GH #27339 df = pd.DataFrame(index=[], columns=[]) - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) - def test_write_with_schema(self, pa): + def test_write_with_schema(self, pa, temp_file): import pyarrow df = pd.DataFrame({"x": [0, 1]}) schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())]) out_df = df.astype(bool) - check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) + check_round_trip( + df, temp_file, pa, write_kwargs={"schema": schema}, expected=out_df + ) - def test_additional_extension_arrays(self, pa, using_infer_string): + def test_additional_extension_arrays(self, pa, using_infer_string, temp_file): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -941,14 +966,16 @@ def test_additional_extension_arrays(self, pa, using_infer_string): } ) if using_infer_string and pa_version_under19p0: - check_round_trip(df, pa, expected=df.astype({"c": "str"})) + check_round_trip(df, temp_file, pa, expected=df.astype({"c": "str"})) else: - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - check_round_trip(df, pa) + 
check_round_trip(df, temp_file, pa) - def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string): + def test_pyarrow_backed_string_array( + self, pa, string_storage, using_infer_string, temp_file + ): # test ArrowStringArray supported through the __arrow_array__ protocol pytest.importorskip("pyarrow") df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) @@ -961,9 +988,9 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin expected.columns = expected.columns.astype("str") else: expected = df.astype(f"string[{string_storage}]") - check_round_trip(df, pa, expected=expected) + check_round_trip(df, temp_file, pa, expected=expected) - def test_additional_extension_types(self, pa): + def test_additional_extension_types(self, pa, temp_file): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol + by defining a custom ExtensionType pytest.importorskip("pyarrow") @@ -977,16 +1004,16 @@ def test_additional_extension_types(self, pa): ), } ) - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) - def test_timestamp_nanoseconds(self, pa): + def test_timestamp_nanoseconds(self, pa, temp_file): # with version 2.6, pyarrow defaults to writing the nanoseconds, so # this should work without error, even for pyarrow < 13 ver = "2.6" df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) - check_round_trip(df, pa, write_kwargs={"version": ver}) + check_round_trip(df, temp_file, pa, write_kwargs={"version": ver}) - def test_timezone_aware_index(self, pa, timezone_aware_date_list): + def test_timezone_aware_index(self, pa, timezone_aware_date_list, temp_file): idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -1013,7 +1040,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): tz = pytz.FixedOffset(offset.total_seconds() / 60) expected.index = expected.index.tz_convert(tz) expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) - check_round_trip(df, pa, check_dtype=False, expected=expected) + check_round_trip(df, temp_file, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa, temp_file): # https://github.com/pandas-dev/pandas/issues/26551 @@ -1025,7 +1052,7 @@ def test_filter_row_groups(self, pa, temp_file): assert len(result) == 1 @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") - def test_read_dtype_backend_pyarrow_config(self, pa, df_full): + def test_read_dtype_backend_pyarrow_config(self, pa, df_full, temp_file): import pyarrow df = df_full @@ -1044,12 +1071,13 @@ def test_read_dtype_backend_pyarrow_config(self, pa, df_full): check_round_trip( df, + temp_file, engine=pa, read_kwargs={"dtype_backend": "pyarrow"}, expected=expected, ) - def test_read_dtype_backend_pyarrow_config_index(self, pa): + def test_read_dtype_backend_pyarrow_config_index(self, pa, temp_file): df = pd.DataFrame( {"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]" ) @@ -1058,6 +1086,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected.index = expected.index.astype("int64[pyarrow]") check_round_trip( df, + temp_file, engine=pa, read_kwargs={"dtype_backend": "pyarrow"}, expected=expected, @@ -1087,16 +1116,16 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): ), ], ) - def test_columns_dtypes_not_invalid(self, pa, columns): + def test_columns_dtypes_not_invalid(self, pa, columns, temp_file): df = 
pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) df.columns = columns - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) - def test_empty_columns(self, pa): + def test_empty_columns(self, pa, temp_file): # GH 52034 df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) - check_round_trip(df, pa) + check_round_trip(df, temp_file, pa) def test_df_attrs_persistence(self, tmp_path, pa): path = tmp_path / "test_df_metadata.p" @@ -1188,7 +1217,7 @@ def test_non_nanosecond_timestamps(self, temp_file): ) tm.assert_frame_equal(result, expected) - def test_maps_as_pydicts(self, pa): + def test_maps_as_pydicts(self, pa, temp_file): pyarrow = pytest.importorskip("pyarrow", "13.0.0") schema = pyarrow.schema( @@ -1197,6 +1226,7 @@ def test_maps_as_pydicts(self, pa): df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}]) check_round_trip( df, + temp_file, pa, write_kwargs={"schema": schema}, read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}, @@ -1204,7 +1234,7 @@ def test_maps_as_pydicts(self, pa): class TestParquetFastParquet(Base): - def test_basic(self, fp, df_full, request): + def test_basic(self, fp, df_full, request, temp_file): pytz = pytest.importorskip("pytz") tz = pytz.timezone("US/Eastern") @@ -1214,7 +1244,7 @@ def test_basic(self, fp, df_full, request): dti = dti._with_freq(None) # freq doesn't round-trip df["datetime_tz"] = dti df["timedelta"] = pd.timedelta_range("1 day", periods=3) - check_round_trip(df, fp) + check_round_trip(df, temp_file, fp) def test_columns_dtypes_invalid(self, fp, temp_file): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1245,12 +1275,12 @@ def test_duplicate_columns(self, fp, temp_file): path = str(temp_file) self.check_error_on_write(df, fp, ValueError, msg, path) - def test_bool_with_none(self, fp, request): + def test_bool_with_none(self, fp, request, temp_file): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes # float64 - check_round_trip(df, fp, expected=expected, check_dtype=False) + check_round_trip(df, temp_file, fp, expected=expected, check_dtype=False) def test_unsupported(self, fp, temp_file): # period @@ -1264,9 +1294,9 @@ def test_unsupported(self, fp, temp_file): msg = "Can't infer object conversion type" self.check_error_on_write(df, fp, ValueError, msg, path) - def test_categorical(self, fp): + def test_categorical(self, fp, temp_file): df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) - check_round_trip(df, fp) + check_round_trip(df, temp_file, fp) def test_filter_row_groups(self, fp, temp_file): d = {"a": list(range(3))} @@ -1277,10 +1307,11 @@ def test_filter_row_groups(self, fp, temp_file): assert len(result) == 1 @pytest.mark.single_cpu - def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, fp): + def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, fp, temp_file): # GH #19134 check_round_trip( df_compat, + temp_file, fp, path=f"s3://{s3_bucket_public.name}/fastparquet.parquet", read_kwargs={"storage_options": s3so}, @@ -1354,20 +1385,22 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) - def test_empty_dataframe(self, fp): + def test_empty_dataframe(self, fp, temp_file): # GH #27339 df = pd.DataFrame() expected = df.copy() - check_round_trip(df, fp, expected=expected) + check_round_trip(df, temp_file, fp, expected=expected) - def test_timezone_aware_index(self, fp, 
timezone_aware_date_list, request): + def test_timezone_aware_index( + self, fp, timezone_aware_date_list, request, temp_file + ): idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) expected = df.copy() expected.index.name = "index" - check_round_trip(df, fp, expected=expected) + check_round_trip(df, temp_file, fp, expected=expected) def test_close_file_handle_on_read_error(self, temp_file): path = str(temp_file) From 52ee1a8400a038e9cb39ba66c7e2d27b47140846 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 1 Oct 2025 10:02:03 +0000 Subject: [PATCH 06/10] fix str type cast in temp file in frame methods to csv --- pandas/tests/frame/methods/test_to_csv.py | 57 +++++++++++------------ 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 13f13c70ff748..f9b7d9c741c79 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -261,15 +261,13 @@ def _return_result_expected( kwargs["index_col"] = list(range(rnlvl)) kwargs["header"] = list(range(cnlvl)) - path = str(temp_file) - df.to_csv(path, encoding="utf8", chunksize=chunksize) - recons = self.read_csv(path, **kwargs) + df.to_csv(temp_file, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(temp_file, **kwargs) else: kwargs["header"] = 0 - path = str(temp_file) - df.to_csv(path, encoding="utf8", chunksize=chunksize) - recons = self.read_csv(path, **kwargs) + df.to_csv(temp_file, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(temp_file, **kwargs) def _to_uni(x): if not isinstance(x, str): @@ -624,8 +622,8 @@ def _make_frame(names=None): [[f"i-{i}" for i in range(5)] for _ in range(2)], names=list("ab") ), ) - df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv(temp_file) + result = read_csv(temp_file, header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(df, result) # column is mi @@ -635,8 +633,8 @@ def _make_frame(names=None): [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") ), ) - df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], index_col=0) + df.to_csv(temp_file) + result = read_csv(temp_file, header=[0, 1, 2, 3], index_col=0) tm.assert_frame_equal(df, result) # dup column names? 
@@ -649,52 +647,52 @@ def _make_frame(names=None): [[f"i-{i}" for i in range(5)] for _ in range(3)], names=list("abc") ), ) - df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) + df.to_csv(temp_file) + result = read_csv(temp_file, header=[0, 1, 2, 3], index_col=[0, 1, 2]) tm.assert_frame_equal(df, result) # writing with no index df = _make_frame() - df.to_csv(path, index=False) - result = read_csv(path, header=[0, 1]) + df.to_csv(temp_file, index=False) + result = read_csv(temp_file, header=[0, 1]) tm.assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) - df.to_csv(path, index=False) - result = read_csv(path, header=[0, 1]) + df.to_csv(temp_file, index=False) + result = read_csv(temp_file, header=[0, 1]) assert com.all_none(*result.columns.names) result.columns.names = df.columns.names tm.assert_frame_equal(df, result) # whatsnew example df = _make_frame() - df.to_csv(path) - result = read_csv(path, header=[0, 1], index_col=[0]) + df.to_csv(temp_file) + result = read_csv(temp_file, header=[0, 1], index_col=[0]) tm.assert_frame_equal(df, result) df = _make_frame(True) - df.to_csv(path) - result = read_csv(path, header=[0, 1], index_col=[0]) + df.to_csv(temp_file) + result = read_csv(temp_file, header=[0, 1], index_col=[0]) tm.assert_frame_equal(df, result) # invalid options df = _make_frame(True) - df.to_csv(path) + df.to_csv(temp_file) for i in [6, 7]: msg = f"len of {i}, but only 5 lines in file" with pytest.raises(ParserError, match=msg): - read_csv(path, header=list(range(i)), index_col=0) + read_csv(temp_file, header=list(range(i)), index_col=0) # write with cols msg = "cannot specify cols with a MultiIndex" with pytest.raises(TypeError, match=msg): - df.to_csv(path, columns=["foo", "bar"]) + df.to_csv(temp_file, columns=["foo", "bar"]) # empty - tsframe[:0].to_csv(path) - recons = self.read_csv(path) + tsframe[:0].to_csv(temp_file) + recons = self.read_csv(temp_file) exp = tsframe[:0] exp.index = [] @@ -812,8 +810,8 @@ def test_to_csv_dups_cols(self, temp_file): df.columns = [0, 1, 2] * 5 - df.to_csv(path) - result = read_csv(path, index_col=0) + df.to_csv(temp_file) + result = read_csv(temp_file, index_col=0) # date cols for i in ["0.4", "1.4", "2.4"]: @@ -1201,9 +1199,8 @@ def test_to_csv_with_dst_transitions_with_pickle(self, start, end, temp_file): idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) - path = str(temp_file) - df.to_csv(path, index=True) - result = read_csv(path, index_col=0) + df.to_csv(temp_file, index=True) + result = read_csv(temp_file, index_col=0) result.index = ( to_datetime(result.index, utc=True).tz_convert("Europe/Paris").as_unit("ns") ) From c9ee1c1cbd9bcb9b2b45cc0f85d2026f805b7a60 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 1 Oct 2025 10:58:00 +0000 Subject: [PATCH 07/10] fix str type cast for temp file in io format to csv --- pandas/tests/io/formats/test_to_csv.py | 103 +++++++++++-------------- 1 file changed, 45 insertions(+), 58 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index f716cde8c3ad8..e6bfe40c5433f 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -30,9 +30,8 @@ def test_to_csv_with_single_column(self, temp_file): "" 1.0 """ - path = str(temp_file) - df1.to_csv(path, header=None, index=None) - with open(path, encoding="utf-8") as f: + df1.to_csv(temp_file, header=None, index=None) + with 
open(temp_file, encoding="utf-8") as f: assert f.read() == expected1 df2 = DataFrame([1, None]) @@ -40,18 +39,17 @@ def test_to_csv_with_single_column(self, temp_file): 1.0 "" """ - df2.to_csv(path, header=None, index=None) - with open(path, encoding="utf-8") as f: + df2.to_csv(temp_file, header=None, index=None) + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected2 def test_to_csv_default_encoding(self, temp_file): # GH17097 df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) - path = str(temp_file) # the default to_csv encoding is uft-8. - df.to_csv(path) - tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + df.to_csv(temp_file) + tm.assert_frame_equal(pd.read_csv(temp_file, index_col=0), df) def test_to_csv_quotechar(self, temp_file): df = DataFrame({"col": [1, 2]}) @@ -61,9 +59,8 @@ def test_to_csv_quotechar(self, temp_file): "1","2" """ - path = str(temp_file) - df.to_csv(path, quoting=1) # 1=QUOTE_ALL - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, quoting=1) # 1=QUOTE_ALL + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected expected = """\ @@ -72,13 +69,12 @@ def test_to_csv_quotechar(self, temp_file): $1$,$2$ """ - path = str(temp_file) - df.to_csv(path, quoting=1, quotechar="$") - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, quoting=1, quotechar="$") + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected with pytest.raises(TypeError, match="quotechar"): - df.to_csv(path, quoting=1, quotechar=None) + df.to_csv(temp_file, quoting=1, quotechar=None) def test_to_csv_doublequote(self, temp_file): df = DataFrame({"col": ['a"a', '"bb"']}) @@ -88,13 +84,12 @@ def test_to_csv_doublequote(self, temp_file): "1","""bb""" ''' - path = str(temp_file) - df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, quoting=1, doublequote=True) # QUOTE_ALL + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected with pytest.raises(Error, match="escapechar"): - df.to_csv(path, doublequote=False) # no escapechar set + df.to_csv(temp_file, doublequote=False) # no escapechar set def test_to_csv_escapechar(self, temp_file): df = DataFrame({"col": ['a"a', '"bb"']}) @@ -104,9 +99,8 @@ def test_to_csv_escapechar(self, temp_file): "1","\\"bb\\"" """ - path = str(temp_file) - df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, quoting=1, doublequote=False, escapechar="\\") + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected df = DataFrame({"col": ["a,a", ",bb,"]}) @@ -116,8 +110,8 @@ def test_to_csv_escapechar(self, temp_file): 1,\\,bb\\, """ - df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, quoting=3, escapechar="\\") # QUOTE_NONE + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected def test_csv_to_string(self): @@ -395,9 +389,8 @@ def test_to_csv_string_array_ascii(self, temp_file): 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ - path = str(temp_file) - df.to_csv(path, encoding="ascii") - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, encoding="ascii") + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected_ascii def test_to_csv_string_array_utf8(self, temp_file): @@ -409,16 +402,14 @@ def test_to_csv_string_array_utf8(self, temp_file): 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ - path = 
str(temp_file) - df.to_csv(path, encoding="utf-8") - with open(path, encoding="utf-8") as f: + df.to_csv(temp_file, encoding="utf-8") + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected_utf8 def test_to_csv_string_with_lf(self, temp_file): # GH 20353 data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = DataFrame(data) - path = str(temp_file) # case 1: The default line terminator(=os.linesep)(PR 21406) os_linesep = os.linesep.encode("utf-8") @@ -432,28 +423,27 @@ def test_to_csv_string_with_lf(self, temp_file): + b'3,"g\nh\n\ni"' + os_linesep ) - df.to_csv(path, index=False) - with open(path, "rb") as f: + df.to_csv(temp_file, index=False) + with open(temp_file, "rb") as f: assert f.read() == expected_noarg # case 2: LF as line terminator expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' - df.to_csv(path, lineterminator="\n", index=False) - with open(path, "rb") as f: + df.to_csv(temp_file, lineterminator="\n", index=False) + with open(temp_file, "rb") as f: assert f.read() == expected_lf # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' - df.to_csv(path, lineterminator="\r\n", index=False) - with open(path, "rb") as f: + df.to_csv(temp_file, lineterminator="\r\n", index=False) + with open(temp_file, "rb") as f: assert f.read() == expected_crlf def test_to_csv_string_with_crlf(self, temp_file): # GH 20353 data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = DataFrame(data) - path = str(temp_file) # case 1: The default line terminator(=os.linesep)(PR 21406) os_linesep = os.linesep.encode("utf-8") expected_noarg = ( @@ -466,14 +456,14 @@ def test_to_csv_string_with_crlf(self, temp_file): + b'3,"g\r\nh\r\n\r\ni"' + os_linesep ) - df.to_csv(path, index=False) - with open(path, "rb") as f: + df.to_csv(temp_file, index=False) + with open(temp_file, "rb") as f: assert f.read() == expected_noarg # case 2: LF as line terminator expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' - df.to_csv(path, lineterminator="\n", index=False) - with open(path, "rb") as f: + df.to_csv(temp_file, lineterminator="\n", index=False) + with open(temp_file, "rb") as f: assert f.read() == expected_lf # case 3: CRLF as line terminator @@ -481,8 +471,8 @@ def test_to_csv_string_with_crlf(self, temp_file): expected_crlf = ( b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' ) - df.to_csv(path, lineterminator="\r\n", index=False) - with open(path, "rb") as f: + df.to_csv(temp_file, lineterminator="\r\n", index=False) + with open(temp_file, "rb") as f: assert f.read() == expected_crlf def test_to_csv_stdout_file(self, capsys): @@ -514,11 +504,10 @@ def test_to_csv_write_to_open_file(self, temp_file): y z """ - path = str(temp_file) - with open(path, "w", encoding="utf-8") as f: + with open(temp_file, "w", encoding="utf-8") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, encoding="utf-8") as f: + with open(temp_file, encoding="utf-8") as f: assert f.read() == expected def test_to_csv_write_to_open_file_with_newline_py3(self, temp_file): @@ -528,12 +517,11 @@ def test_to_csv_write_to_open_file_with_newline_py3(self, temp_file): expected_rows = ["x", "y", "z"] expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) - path = str(temp_file) - with open(path, "w", newline="", encoding="utf-8") as f: + with open(temp_file, "w", newline="", 
encoding="utf-8") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, "rb") as f: + with open(temp_file, "rb") as f: assert f.read() == bytes(expected, "utf-8") @pytest.mark.parametrize("to_infer", [True, False]) @@ -664,8 +652,7 @@ def test_to_csv_errors(self, errors, temp_file): data = ["\ud800foo"] ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) - path = str(temp_file) - ser.to_csv(path, errors=errors) + ser.to_csv(temp_file, errors=errors) # No use in reading back the data as it is not the same anymore # due to the error handling @@ -683,10 +670,9 @@ def test_to_csv_binary_handle(self, mode, temp_file): index=Index([f"i-{i}" for i in range(30)]), ) - path = str(temp_file) - with open(path, mode="w+b") as handle: + with open(temp_file, mode="w+b") as handle: df.to_csv(handle, mode=mode) - tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + tm.assert_frame_equal(df, pd.read_csv(temp_file, index_col=0)) @pytest.mark.parametrize("mode", ["wb", "w"]) def test_to_csv_encoding_binary_handle(self, mode, temp_file): @@ -721,9 +707,10 @@ def test_to_csv_iterative_compression_name(compression, temp_file): columns=Index(list("ABCD")), index=Index([f"i-{i}" for i in range(30)]), ) - path = str(temp_file) - df.to_csv(path, compression=compression, chunksize=1) - tm.assert_frame_equal(pd.read_csv(path, compression=compression, index_col=0), df) + df.to_csv(temp_file, compression=compression, chunksize=1) + tm.assert_frame_equal( + pd.read_csv(temp_file, compression=compression, index_col=0), df + ) def test_to_csv_iterative_compression_buffer(compression): From 4dfad6754456084bb9983649b57598823a3fb719 Mon Sep 17 00:00:00 2001 From: Harshit Pande Date: Wed, 1 Oct 2025 11:45:59 +0000 Subject: [PATCH 08/10] fix str type cast in io test parquet --- pandas/tests/io/test_parquet.py | 112 +++++++++++++------------------- 1 file changed, 46 insertions(+), 66 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 79660394e1197..53f5a79625ee3 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -224,7 +224,7 @@ def compare(repeat): ) if path is None: - path = str(temp_file) + path = temp_file compare(repeat) else: compare(repeat) @@ -342,27 +342,25 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp, temp_file): # cross-compat with differing reading/writing engines df = df_cross_compat - path = str(temp_file) - df.to_parquet(path, engine=pa, compression=None) + df.to_parquet(temp_file, engine=pa, compression=None) - result = read_parquet(path, engine=fp) + result = read_parquet(temp_file, engine=fp) tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=fp, columns=["a", "d"]) + result = read_parquet(temp_file, engine=fp, columns=["a", "d"]) tm.assert_frame_equal(result, df[["a", "d"]]) def test_cross_engine_fp_pa(df_cross_compat, pa, fp, temp_file): # cross-compat with differing reading/writing engines df = df_cross_compat - path = str(temp_file) - df.to_parquet(path, engine=fp, compression=None) + df.to_parquet(temp_file, engine=fp, compression=None) - result = read_parquet(path, engine=pa) + result = read_parquet(temp_file, engine=pa) tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=pa, columns=["a", "d"]) + result = read_parquet(temp_file, engine=pa, columns=["a", "d"]) tm.assert_frame_equal(result, df[["a", "d"]]) @@ -388,8 +386,7 @@ def test_error(self, engine, temp_file): np.array([1, 2, 3]), ]: msg = "to_parquet only 
supports IO with DataFrames" - path = str(temp_file) - self.check_error_on_write(obj, engine, ValueError, msg, path) + self.check_error_on_write(obj, engine, ValueError, msg, temp_file) def test_columns_dtypes(self, engine, temp_file): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -535,10 +532,9 @@ def test_write_column_multiindex(self, engine, temp_file): np.random.default_rng(2).standard_normal((4, 3)), columns=mi_columns ) - path = str(temp_file) if engine == "fastparquet": self.check_error_on_write( - df, engine, TypeError, "Column name must be a string", path + df, engine, TypeError, "Column name must be a string", temp_file ) elif engine == "pyarrow": check_round_trip(df, temp_file, engine) @@ -555,9 +551,8 @@ def test_write_column_multiindex_nonstring(self, engine, temp_file): np.random.default_rng(2).standard_normal((8, 8)), columns=arrays ) df.columns.names = ["Level1", "Level2"] - path = str(temp_file) if engine == "fastparquet": - self.check_error_on_write(df, engine, ValueError, "Column name", path) + self.check_error_on_write(df, engine, ValueError, "Column name", temp_file) elif engine == "pyarrow": check_round_trip(df, temp_file, engine) @@ -601,10 +596,9 @@ def test_write_column_index_nonstring(self, engine, temp_file): np.random.default_rng(2).standard_normal((8, 4)), columns=arrays ) df.columns.name = "NonStringCol" - path = str(temp_file) if engine == "fastparquet": self.check_error_on_write( - df, engine, TypeError, "Column name must be a string", path + df, engine, TypeError, "Column name must be a string", temp_file ) else: check_round_trip(df, temp_file, engine) @@ -633,11 +627,10 @@ def test_dtype_backend(self, engine, request, temp_file): "g": pyarrow.array([1.0, 2.0, 3.0, None], "float64"), } ) - path = str(temp_file) # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path, engine=engine) - result2 = read_parquet(path, engine=engine, dtype_backend="numpy_nullable") + pq.write_table(table, temp_file) + result1 = read_parquet(temp_file, engine=engine) + result2 = read_parquet(temp_file, engine=engine, dtype_backend="numpy_nullable") assert result1["a"].dtype == np.dtype("float64") expected = pd.DataFrame( @@ -756,9 +749,8 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): def test_duplicate_columns(self, pa, temp_file): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() - path = str(temp_file) self.check_error_on_write( - df, pa, ValueError, "Duplicate column names found", path + df, pa, ValueError, "Duplicate column names found", temp_file ) def test_timedelta(self, pa, temp_file): @@ -770,17 +762,17 @@ def test_unsupported(self, pa, temp_file): df = pd.DataFrame({"a": ["a", 1, 2.0]}) # pyarrow 0.11 raises ArrowTypeError # older pyarrows raise ArrowInvalid - path = str(temp_file) - self.check_external_error_on_write(df, pa, pyarrow.ArrowException, path) + self.check_external_error_on_write(df, pa, pyarrow.ArrowException, temp_file) def test_unsupported_float16(self, pa, temp_file): # #44847, #44914 # Not able to write float 16 column using pyarrow. 
data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) - path = str(temp_file) if pa_version_under15p0: - self.check_external_error_on_write(df, pa, pyarrow.ArrowException, path) + self.check_external_error_on_write( + df, pa, pyarrow.ArrowException, temp_file + ) else: check_round_trip(df, temp_file, pa) @@ -800,8 +792,7 @@ def test_unsupported_float16_cleanup(self, pa, path_type, temp_file): data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) - path_str = str(temp_file) - path = path_type(path_str) + path = path_type(temp_file) with tm.external_error_raised(pyarrow.ArrowException): df.to_parquet(path=path, engine=pa) assert not os.path.isfile(path) @@ -1046,9 +1037,8 @@ def test_filter_row_groups(self, pa, temp_file): # https://github.com/pandas-dev/pandas/issues/26551 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": list(range(3))}) - path = str(temp_file) - df.to_parquet(path, engine=pa) - result = read_parquet(path, pa, filters=[("a", "==", 0)]) + df.to_parquet(temp_file, engine=pa) + result = read_parquet(temp_file, pa, filters=[("a", "==", 0)]) assert len(result) == 1 @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning") @@ -1251,29 +1241,27 @@ def test_columns_dtypes_invalid(self, fp, temp_file): err = TypeError msg = "Column name must be a string" - path = str(temp_file) # numeric df.columns = [0, 1] - self.check_error_on_write(df, fp, err, msg, path) + self.check_error_on_write(df, fp, err, msg, temp_file) # bytes df.columns = [b"foo", b"bar"] - self.check_error_on_write(df, fp, err, msg, path) + self.check_error_on_write(df, fp, err, msg, temp_file) # python object df.columns = [ datetime.datetime(2011, 1, 1, 0, 0), datetime.datetime(2011, 1, 1, 1, 1), ] - self.check_error_on_write(df, fp, err, msg, path) + self.check_error_on_write(df, fp, err, msg, temp_file) def test_duplicate_columns(self, fp, temp_file): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() msg = "Cannot create parquet dataset with duplicate column names" - path = str(temp_file) - self.check_error_on_write(df, fp, ValueError, msg, path) + self.check_error_on_write(df, fp, ValueError, msg, temp_file) def test_bool_with_none(self, fp, request, temp_file): df = pd.DataFrame({"a": [True, None, False]}) @@ -1286,13 +1274,12 @@ def test_unsupported(self, fp, temp_file): # period df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) # error from fastparquet -> don't check exact error message - path = str(temp_file) - self.check_error_on_write(df, fp, ValueError, None, path) + self.check_error_on_write(df, fp, ValueError, None, temp_file) # mixed df = pd.DataFrame({"a": ["a", 1, 2.0]}) msg = "Can't infer object conversion type" - self.check_error_on_write(df, fp, ValueError, msg, path) + self.check_error_on_write(df, fp, ValueError, msg, temp_file) def test_categorical(self, fp, temp_file): df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) @@ -1301,9 +1288,8 @@ def test_categorical(self, fp, temp_file): def test_filter_row_groups(self, fp, temp_file): d = {"a": list(range(3))} df = pd.DataFrame(d) - path = str(temp_file) - df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) - result = read_parquet(path, fp, filters=[("a", "==", 0)]) + df.to_parquet(temp_file, engine=fp, compression=None, row_group_offsets=1) + result = read_parquet(temp_file, fp, filters=[("a", "==", 0)]) assert len(result) == 1 
@pytest.mark.single_cpu @@ -1403,73 +1389,68 @@ def test_timezone_aware_index( check_round_trip(df, temp_file, fp, expected=expected) def test_close_file_handle_on_read_error(self, temp_file): - path = str(temp_file) - pathlib.Path(path).write_bytes(b"breakit") + pathlib.Path(temp_file).write_bytes(b"breakit") with tm.external_error_raised(Exception): # Not important which exception - read_parquet(path, engine="fastparquet") + read_parquet(temp_file, engine="fastparquet") # The next line raises an error on Windows if the file is still open - pathlib.Path(path).unlink(missing_ok=False) + pathlib.Path(temp_file).unlink(missing_ok=False) def test_bytes_file_name(self, engine, temp_file): # GH#48944 df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - path = str(temp_file) - with open(path.encode(), "wb") as f: + with open(temp_file, "wb") as f: df.to_parquet(f) - result = read_parquet(path, engine=engine) + result = read_parquet(temp_file, engine=engine) tm.assert_frame_equal(result, df) def test_filesystem_notimplemented(self, temp_file): pytest.importorskip("fastparquet") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - path = str(temp_file) with pytest.raises(NotImplementedError, match="filesystem is not implemented"): - df.to_parquet(path, engine="fastparquet", filesystem="foo") + df.to_parquet(temp_file, engine="fastparquet", filesystem="foo") - pathlib.Path(path).write_bytes(b"foo") + pathlib.Path(temp_file).write_bytes(b"foo") with pytest.raises(NotImplementedError, match="filesystem is not implemented"): - read_parquet(path, engine="fastparquet", filesystem="foo") + read_parquet(temp_file, engine="fastparquet", filesystem="foo") def test_invalid_filesystem(self, temp_file): pytest.importorskip("pyarrow") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - path = str(temp_file) with pytest.raises( ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" ): - df.to_parquet(path, engine="pyarrow", filesystem="foo") + df.to_parquet(temp_file, engine="pyarrow", filesystem="foo") - pathlib.Path(path).write_bytes(b"foo") + pathlib.Path(temp_file).write_bytes(b"foo") with pytest.raises( ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" ): - read_parquet(path, engine="pyarrow", filesystem="foo") + read_parquet(temp_file, engine="pyarrow", filesystem="foo") def test_unsupported_pa_filesystem_storage_options(self, temp_file): pa_fs = pytest.importorskip("pyarrow.fs") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - path = str(temp_file) with pytest.raises( NotImplementedError, match="storage_options not supported with a pyarrow FileSystem.", ): df.to_parquet( - path, + temp_file, engine="pyarrow", filesystem=pa_fs.LocalFileSystem(), storage_options={"foo": "bar"}, ) - pathlib.Path(path).write_bytes(b"foo") + pathlib.Path(temp_file).write_bytes(b"foo") with pytest.raises( NotImplementedError, match="storage_options not supported with a pyarrow FileSystem.", ): read_parquet( - path, + temp_file, engine="pyarrow", filesystem=pa_fs.LocalFileSystem(), storage_options={"foo": "bar"}, @@ -1481,7 +1462,6 @@ def test_invalid_dtype_backend(self, engine, temp_file): "'pyarrow' are allowed." 
         )
         df = pd.DataFrame({"int": list(range(1, 4))})
-        path = str(temp_file)
-        df.to_parquet(path)
+        df.to_parquet(temp_file)
         with pytest.raises(ValueError, match=msg):
-            read_parquet(path, dtype_backend="numpy")
+            read_parquet(temp_file, dtype_backend="numpy")

From da89e1b1640f84f64721b45b45e1550c8441fdf3 Mon Sep 17 00:00:00 2001
From: Harshit Pande
Date: Wed, 1 Oct 2025 12:02:38 +0000
Subject: [PATCH 09/10] clean up remaining str(temp_file) casts in the to_csv
 format tests

---
 pandas/tests/io/formats/test_to_csv.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index e6bfe40c5433f..e184c33b0d979 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -567,9 +567,8 @@ def test_to_csv_compression_dict_no_method_raises(self, temp_file):
         compression = {"some_option": True}
         msg = "must have key 'method'"
 
-        path = str(temp_file)
         with pytest.raises(ValueError, match=msg):
-            df.to_csv(path, compression=compression)
+            df.to_csv(temp_file, compression=compression)
 
     @pytest.mark.parametrize("compression", ["zip", "infer"])
     @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
@@ -692,8 +691,7 @@ def test_to_csv_encoding_binary_handle(self, mode, temp_file):
         assert buffer.getvalue().startswith(content)
 
         # example from GH 13068
-        path = str(temp_file)
-        with open(path, "w+b") as handle:
+        with open(temp_file, "w+b") as handle:
             DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig")
             handle.seek(0)

From c7af43a5b7fe35fdcb70bead9f874144ff45ae99 Mon Sep 17 00:00:00 2001
From: Harshit Pande
Date: Wed, 1 Oct 2025 20:17:43 +0530
Subject: [PATCH 10/10] rename path to path_ext where a compression extension
 is appended

---
 pandas/tests/io/formats/test_to_csv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index e184c33b0d979..f70875172ccc8 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -542,9 +542,9 @@ def test_to_csv_compression(
         to_compression = "infer" if to_infer else compression
         read_compression = "infer" if read_infer else compression
 
-        path = str(temp_file) + "." + compression_to_extension[compression]
-        df.to_csv(path, compression=to_compression)
-        result = pd.read_csv(path, index_col=0, compression=read_compression)
+        path_ext = str(temp_file) + "." + compression_to_extension[compression]
+        df.to_csv(path_ext, compression=to_compression)
+        result = pd.read_csv(path_ext, index_col=0, compression=read_compression)
         tm.assert_frame_equal(result, df)
 
     def test_to_csv_compression_dict(self, compression_only, temp_file):
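
Note on the fixture contract the series above assumes: temp_file is provided by
the shared pandas test conftest and, unlike the tm.ensure_clean context manager
it replaces, hands each test an isolated pathlib.Path that pytest cleans up
automatically. The sketch below is a minimal stand-in for that fixture plus a
hypothetical test, written only to illustrate the contract; the real conftest
definition may differ. Because the fixture yields a Path, and to_csv, read_csv,
to_parquet, and read_parquet all accept path-like objects, the str(temp_file)
casts removed in patches 06 through 09 were never required.

    import uuid

    import pandas as pd
    import pytest


    @pytest.fixture
    def temp_file(tmp_path):
        # tmp_path is a per-test directory that pytest creates and removes,
        # so no explicit teardown (formerly handled by tm.ensure_clean) is
        # needed; a uuid-based name avoids collisions within a single test.
        return tmp_path / str(uuid.uuid4())


    def test_round_trip_sketch(temp_file):
        # Path objects are accepted directly by the pandas IO APIs, so no
        # str() cast is needed on either the write or the read side.
        df = pd.DataFrame({"a": [1, 2, 3]})
        df.to_csv(temp_file)
        result = pd.read_csv(temp_file, index_col=0)
        pd.testing.assert_frame_equal(result, df)

The one place a string is still built is test_to_csv_compression (patch 10),
where an extension must be appended so that compression="infer" can deduce the
codec from the file name; concatenating onto str(temp_file) is the simplest way
to derive that sibling path, and the path_ext rename makes the intent explicit.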