
Commit 506a8bf

Merge branch 'pandas-dev:main' into fix-assert-frame-equal-na-61473
2 parents 4e5f160 + 72ba35b

File tree

12 files changed: 96 additions & 76 deletions


pandas/_libs/index.pyx

Lines changed: 1 addition & 1 deletion

@@ -838,7 +838,7 @@ cdef class BaseMultiIndexCodesEngine:
             raise KeyError(key)
         try:
             indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
-                       for lev, v in zip(self.levels, key)]
+                       for lev, v in zip(self.levels, key, strict=True)]
         except KeyError:
             raise KeyError(key)
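
Note: most changes in this commit add an explicit strict= argument to zip calls (here and in missing.pyx, fields.pyx, offsets.pyx, timezones.pyx, and rolling.py below). A minimal plain-Python sketch of the difference: by default zip silently truncates to the shortest iterable, while strict=True (Python 3.10+) raises on a length mismatch.

    levels = ["a", "b", "c"]
    key = ("x", "y")  # one element short

    list(zip(levels, key))  # [('a', 'x'), ('b', 'y')] -- silently truncated

    # With strict=True the mismatch surfaces as an error instead:
    list(zip(levels, key, strict=True))
    # ValueError: zip() argument 2 is shorter than argument 1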

pandas/_libs/missing.pyx

Lines changed: 1 addition & 1 deletion

@@ -72,7 +72,7 @@ cpdef bint check_na_tuples_nonequal(object left, object right):
     if len(left) != len(right):
         return False
 
-    for left_element, right_element in zip(left, right):
+    for left_element, right_element in zip(left, right, strict=True):
         if left_element is C_NA and right_element is not C_NA:
             return True
         elif right_element is C_NA and left_element is not C_NA:

pandas/_libs/tslibs/fields.pyx

Lines changed: 1 addition & 1 deletion

@@ -109,7 +109,7 @@ def month_position_check(fields, weekdays) -> str | None:
         int32_t[:] months = fields["M"]
         int32_t[:] days = fields["D"]
 
-    for y, m, d, wd in zip(years, months, days, weekdays):
+    for y, m, d, wd in zip(years, months, days, weekdays, strict=True):
         if calendar_start:
             calendar_start &= d == 1
         if business_start:

pandas/_libs/tslibs/offsets.pyx

Lines changed: 3 additions & 3 deletions

@@ -2217,7 +2217,7 @@ cdef class BusinessHour(BusinessMixin):
             # Use python string formatting to be faster than strftime
             hours = ",".join(
                 f"{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}"
-                for st, en in zip(self.start, self.end)
+                for st, en in zip(self.start, self.end, strict=True)
             )
             attrs = [f"{self._prefix}={hours}"]
         out += ": " + ", ".join(attrs)

@@ -2414,7 +2414,7 @@ cdef class BusinessHour(BusinessMixin):
         # get total business hours by sec in one business day
         businesshours = sum(
             self._get_business_hours_by_sec(st, en)
-            for st, en in zip(self.start, self.end)
+            for st, en in zip(self.start, self.end, strict=True)
         )
 
         bd, r = divmod(abs(n * 60), businesshours // 60)

@@ -5357,7 +5357,7 @@ cpdef to_offset(freq, bint is_period=False):
                 # the last element must be blank
                 raise ValueError("last element must be blank")
 
-            tups = zip(split[0::4], split[1::4], split[2::4])
+            tups = zip(split[0::4], split[1::4], split[2::4], strict=False)
             for n, (sep, stride, name) in enumerate(tups):
                 name = _warn_about_deprecated_aliases(name, is_period)
                 _validate_to_offset_alias(name, is_period)
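
Note: unlike the other call sites, to_offset passes strict=False, since the three stride-4 slices of split are not guaranteed to have equal lengths; the flag makes zip's truncating behavior explicit rather than accidental. A small illustration with hypothetical slice contents (not the actual regex output of to_offset):

    sep = ["", ""]   # hypothetical: one more element than the other slices
    stride = ["2"]
    name = ["h"]

    # strict=False keeps zip's default truncation, but now on purpose:
    list(zip(sep, stride, name, strict=False))  # [('', '2', 'h')] -- extra sep dropped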

pandas/_libs/tslibs/timezones.pyx

Lines changed: 1 addition & 1 deletion

@@ -252,7 +252,7 @@ cdef object _get_utc_trans_times_from_dateutil_tz(tzinfo tz):
     """
     new_trans = list(tz._trans_list)
     last_std_offset = 0
-    for i, (trans, tti) in enumerate(zip(tz._trans_list, tz._trans_idx)):
+    for i, (trans, tti) in enumerate(zip(tz._trans_list, tz._trans_idx, strict=True)):
         if not tti.isdst:
             last_std_offset = tti.offset
         new_trans[i] = trans - last_std_offset

pandas/core/frame.py

Lines changed: 5 additions & 0 deletions

@@ -107,6 +107,7 @@
     is_list_like,
     is_scalar,
     is_sequence,
+    is_string_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )

@@ -4454,8 +4455,12 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
             cols_droplevel = maybe_droplevels(cols, key)
             if (
                 not isinstance(cols_droplevel, MultiIndex)
+                and is_string_dtype(cols_droplevel.dtype)
                 and not cols_droplevel.any()
             ):
+                # if cols_droplevel contains only empty strings,
+                # value.reindex(cols_droplevel, axis=1) would be full of NaNs
+                # see GH#62518 and GH#61841
                 return
             if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
                 value = value.reindex(cols_droplevel, axis=1)
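
Note: the early return previously keyed only on cols_droplevel.any() being falsy, which is also true for falsy non-string labels such as integer zeros, and for some dtypes (e.g. Categorical) calling .any() can raise outright; checking is_string_dtype first restricts the guard to the all-empty-strings case it was written for. A minimal sketch of the distinction:

    import pandas as pd
    from pandas.api.types import is_string_dtype

    pd.Index(["", ""]).any()                   # False: all-empty string labels
    is_string_dtype(pd.Index(["", ""]).dtype)  # True  -> guard may apply

    pd.Index([0, 0]).any()                     # False: falsy, but not strings
    is_string_dtype(pd.Index([0, 0]).dtype)    # False -> guard skipped, reindex proceeds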

pandas/core/window/rolling.py

Lines changed: 2 additions & 2 deletions

@@ -351,7 +351,7 @@ def __iter__(self) -> Iterator:
         )
         self._check_window_bounds(start, end, len(obj))
 
-        for s, e in zip(start, end):
+        for s, e in zip(start, end, strict=True):
            result = obj.iloc[slice(s, e)]
            yield result

@@ -802,7 +802,7 @@ def _apply_pairwise(
             groupby_codes = []
             groupby_levels = []
             # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]]
-            for gb_level_pair in map(list, zip(*gb_pairs)):
+            for gb_level_pair in map(list, zip(*gb_pairs, strict=True)):
                 labels = np.repeat(np.array(gb_level_pair), old_result_len)
                 codes, levels = factorize(labels)
                 groupby_codes.append(codes)
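
Note: zip(*gb_pairs, strict=True) transposes the list of pairs, which is exactly what the inline comment describes:

    gb_pairs = [[1, 2], [4, 5]]
    [list(col) for col in zip(*gb_pairs, strict=True)]  # [[1, 4], [2, 5]]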

pandas/tests/indexing/multiindex/test_multiindex.py

Lines changed: 21 additions & 0 deletions

@@ -271,3 +271,24 @@ def test_multiindex_assign_aligns_as_implicit_tuple(self):
         df1["C"] = s1
         tm.assert_frame_equal(df1, df2)
         tm.assert_frame_equal(df1, df3)
+
+    def test_multiindex_assign_alignment_with_non_string_dtype(self):
+        # GH 62518
+        columns = MultiIndex.from_arrays(
+            [["a", "a", "z", "z"], pd.Categorical([1, 2, 1, 2])]
+        )
+
+        meta = DataFrame(columns=columns, dtype=object)
+        meta["z"] = meta["z"].astype("int64")
+
+        result = DataFrame(
+            data={
+                ("a", 1): Series([], dtype=object),
+                ("a", 2): Series([], dtype=object),
+                ("z", 1): Series([], dtype="int64"),
+                ("z", 2): Series([], dtype="int64"),
+            },
+            columns=columns,
+        )
+
+        tm.assert_frame_equal(meta, result)

pandas/tests/io/json/test_compression.py

Lines changed: 43 additions & 46 deletions

@@ -12,22 +12,21 @@
 import pandas._testing as tm
 
 
-def test_compression_roundtrip(compression):
+def test_compression_roundtrip(compression, temp_file):
     df = pd.DataFrame(
         [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
         index=["A", "B"],
         columns=["X", "Y", "Z"],
     )
 
-    with tm.ensure_clean() as path:
-        df.to_json(path, compression=compression)
-        tm.assert_frame_equal(df, pd.read_json(path, compression=compression))
+    df.to_json(temp_file, compression=compression)
+    tm.assert_frame_equal(df, pd.read_json(temp_file, compression=compression))
 
-        # explicitly ensure file was compressed.
-        with tm.decompress_file(path, compression) as fh:
-            result = fh.read().decode("utf8")
-        data = StringIO(result)
-        tm.assert_frame_equal(df, pd.read_json(data))
+    # explicitly ensure file was compressed.
+    with tm.decompress_file(temp_file, compression) as fh:
+        result = fh.read().decode("utf8")
+    data = StringIO(result)
+    tm.assert_frame_equal(df, pd.read_json(data))
 
 
 def test_read_zipped_json(datapath):

@@ -43,15 +42,14 @@ def test_read_zipped_json(datapath):
 @td.skip_if_not_us_locale
 @pytest.mark.single_cpu
 @pytest.mark.network
-def test_with_s3_url(compression, s3_bucket_public, s3so):
+def test_with_s3_url(compression, s3_bucket_public, s3so, temp_file):
     # Bucket created in tests/io/conftest.py
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
 
     key = f"{uuid.uuid4()}.json"
-    with tm.ensure_clean() as path:
-        df.to_json(path, compression=compression)
-        with open(path, "rb") as f:
-            s3_bucket_public.put_object(Key=key, Body=f)
+    df.to_json(temp_file, compression=compression)
+    with open(temp_file, "rb") as f:
+        s3_bucket_public.put_object(Key=key, Body=f)
 
     roundtripped_df = pd.read_json(
         f"s3://{s3_bucket_public.name}/{key}",

@@ -61,39 +59,35 @@
     tm.assert_frame_equal(df, roundtripped_df)
 
 
-def test_lines_with_compression(compression):
-    with tm.ensure_clean() as path:
-        df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
-        df.to_json(path, orient="records", lines=True, compression=compression)
-        roundtripped_df = pd.read_json(path, lines=True, compression=compression)
-        tm.assert_frame_equal(df, roundtripped_df)
+def test_lines_with_compression(compression, temp_file):
+    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
+    df.to_json(temp_file, orient="records", lines=True, compression=compression)
+    roundtripped_df = pd.read_json(temp_file, lines=True, compression=compression)
+    tm.assert_frame_equal(df, roundtripped_df)
 
 
-def test_chunksize_with_compression(compression):
-    with tm.ensure_clean() as path:
-        df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
-        df.to_json(path, orient="records", lines=True, compression=compression)
+def test_chunksize_with_compression(compression, temp_file):
+    df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
+    df.to_json(temp_file, orient="records", lines=True, compression=compression)
 
-        with pd.read_json(
-            path, lines=True, chunksize=1, compression=compression
-        ) as res:
-            roundtripped_df = pd.concat(res)
-        tm.assert_frame_equal(df, roundtripped_df)
+    with pd.read_json(
+        temp_file, lines=True, chunksize=1, compression=compression
+    ) as res:
+        roundtripped_df = pd.concat(res)
+    tm.assert_frame_equal(df, roundtripped_df)
 
 
-def test_write_unsupported_compression_type():
+def test_write_unsupported_compression_type(temp_file):
     df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
-    with tm.ensure_clean() as path:
-        msg = "Unrecognized compression type: unsupported"
-        with pytest.raises(ValueError, match=msg):
-            df.to_json(path, compression="unsupported")
+    msg = "Unrecognized compression type: unsupported"
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(temp_file, compression="unsupported")
 
 
-def test_read_unsupported_compression_type():
-    with tm.ensure_clean() as path:
-        msg = "Unrecognized compression type: unsupported"
-        with pytest.raises(ValueError, match=msg):
-            pd.read_json(path, compression="unsupported")
+def test_read_unsupported_compression_type(temp_file):
+    msg = "Unrecognized compression type: unsupported"
+    with pytest.raises(ValueError, match=msg):
+        pd.read_json(temp_file, compression="unsupported")
 
 
 @pytest.mark.parametrize(

@@ -102,25 +96,28 @@ def test_read_unsupported_compression_type():
 @pytest.mark.parametrize("to_infer", [True, False])
 @pytest.mark.parametrize("read_infer", [True, False])
 def test_to_json_compression(
-    compression_only, read_infer, to_infer, compression_to_extension, infer_string
+    compression_only,
+    read_infer,
+    to_infer,
+    compression_to_extension,
+    infer_string,
+    tmp_path,
 ):
     with pd.option_context("future.infer_string", infer_string):
         # see gh-15008
         compression = compression_only
 
         # We'll complete file extension subsequently.
-        filename = "test."
-        filename += compression_to_extension[compression]
+        filename = tmp_path / f"test.{compression_to_extension[compression]}"
 
         df = pd.DataFrame({"A": [1]})
 
         to_compression = "infer" if to_infer else compression
         read_compression = "infer" if read_infer else compression
 
-        with tm.ensure_clean(filename) as path:
-            df.to_json(path, compression=to_compression)
-            result = pd.read_json(path, compression=read_compression)
-            tm.assert_frame_equal(result, df)
+        df.to_json(filename, compression=to_compression)
+        result = pd.read_json(filename, compression=read_compression)
+        tm.assert_frame_equal(result, df)
 
 
 def test_to_json_compression_mode(compression):
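
Note: these tests now take a temp_file fixture (or pytest's built-in tmp_path) instead of tm.ensure_clean(), so cleanup is handled by pytest's temporary-directory machinery. The fixture's definition is not part of this diff; a minimal sketch of what such a conftest.py fixture might look like, assuming it builds on tmp_path:

    import uuid

    import pytest

    @pytest.fixture
    def temp_file(tmp_path):
        # A unique file path inside pytest's per-test temporary directory;
        # removed automatically when tmp_path is cleaned up.
        return tmp_path / f"{uuid.uuid4()}.tmp"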

pandas/tests/io/json/test_pandas.py

Lines changed: 14 additions & 17 deletions

@@ -806,11 +806,10 @@ def test_reconstruction_index(self):
         result = read_json(StringIO(df.to_json()))
         tm.assert_frame_equal(result, df)
 
-    def test_path(self, float_frame, int_frame, datetime_frame):
-        with tm.ensure_clean("test.json") as path:
-            for df in [float_frame, int_frame, datetime_frame]:
-                df.to_json(path)
-                read_json(path)
+    def test_path(self, float_frame, int_frame, datetime_frame, temp_file):
+        for df in [float_frame, int_frame, datetime_frame]:
+            df.to_json(temp_file)
+            read_json(temp_file)
 
     def test_axis_dates(self, datetime_series, datetime_frame):
         # frame

@@ -1423,14 +1422,13 @@ def test_read_s3_jsonl(self, s3_bucket_public_with_data, s3so):
         expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
 
-    def test_read_local_jsonl(self):
+    def test_read_local_jsonl(self, temp_file):
         # GH17200
-        with tm.ensure_clean("tmp_items.json") as path:
-            with open(path, "w", encoding="utf-8") as infile:
-                infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
-            result = read_json(path, lines=True)
-            expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
-            tm.assert_frame_equal(result, expected)
+        with open(temp_file, "w", encoding="utf-8") as infile:
+            infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
+        result = read_json(temp_file, lines=True)
+        expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
+        tm.assert_frame_equal(result, expected)
 
     def test_read_jsonl_unicode_chars(self):
         # GH15132: non-ascii unicode characters

@@ -1526,17 +1524,16 @@ def test_to_jsonl(self):
         ],
     )
     @pytest.mark.parametrize("dtype", ["category", object])
-    def test_latin_encoding(self, dtype, val):
+    def test_latin_encoding(self, dtype, val, temp_file):
         # GH 13774
         ser = Series(
             [x.decode("latin-1") if isinstance(x, bytes) else x for x in val],
             dtype=dtype,
         )
         encoding = "latin-1"
-        with tm.ensure_clean("test.json") as path:
-            ser.to_json(path, encoding=encoding)
-            retr = read_json(StringIO(path), encoding=encoding)
-            tm.assert_series_equal(ser, retr, check_categorical=False)
+        ser.to_json(temp_file, encoding=encoding)
+        retr = read_json(StringIO(temp_file), encoding=encoding)
+        tm.assert_series_equal(ser, retr, check_categorical=False)
 
     def test_data_frame_size_after_to_json(self):
         # GH15344
