diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index eb938a7140e29..a6e60f913d2f4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -170,6 +170,7 @@ Other enhancements - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). +- :meth:`DataFrame.corr` now uses two pass Welford's Method to improve numerical stability with precision for very large/small values (:issue:`59652`) - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 222b5b7952c2e..2dcf8eceedf5c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -345,7 +345,9 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): float64_t[:, ::1] result uint8_t[:, :] mask int64_t nobs = 0 - float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val + float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val + float64_t ref_x, ref_y, dx, dy + bint ref_set N, K = (mat).shape if minp is None: @@ -358,39 +360,48 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): with nogil: for xi in range(K): - for yi in range(xi + 1): + for yi in range(xi+1): # Welford's method for the variance-calculation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 + # Changed to Welford's two-pass for improved numeric stability + nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0 + ref_set = False for i in range(N): if mask[i, xi] and mask[i, yi]: + nobs += 1 vx = mat[i, xi] vy = mat[i, yi] - nobs += 1 + if not ref_set: + ref_x = vx + ref_y = vy + ref_set = True + + vx -= ref_x + vy -= ref_y dx = vx - meanx dy = vy - meany - meanx += 1. / nobs * dx - meany += 1. / nobs * dy + meanx += dx / nobs + meany += dy / nobs + cxy += dx * (vy - meany) ssqdmx += (vx - meanx) * dx ssqdmy += (vy - meany) * dy - covxy += (vx - meanx) * dy if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN + continue + + divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + # clip `cxy / divisor` to ensure coeff is within bounds + if divisor != 0: + val = cxy / divisor + if not cov: + if val > 1.0: + val = 1.0 + elif val < -1.0: + val = -1.0 + result[xi, yi] = result[yi, xi] = val else: - divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) - - # clip `covxy / divisor` to ensure coeff is within bounds - if divisor != 0: - val = covxy / divisor - if not cov: - if val > 1.0: - val = 1.0 - elif val < -1.0: - val = -1.0 - result[xi, yi] = result[yi, xi] = val - else: - result[xi, yi] = result[yi, xi] = NaN + result[xi, yi] = result[yi, xi] = NaN return result.base diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index a5ed2e86283e9..419b125ff824a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -493,3 +493,51 @@ def test_cov_with_missing_values(self): result2 = df.dropna().cov() tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) + + pair_cases = [ + np.array( + [[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]], + dtype=np.longdouble, + ), + np.array( + [[-30.0, 30.100000381469727], [116.80000305175781, -116.8000030517578]], + dtype=np.longdouble, + ), + np.array([[1e-8, 3.42e-8], [2e-9, 3e-8]], dtype=np.longdouble), + np.array([[1e12, 1e-8], [1e12 + 1e-3, 2e-8]], dtype=np.longdouble), + np.array([[0.0, 1e-12], [1e-14, 0.0]], dtype=np.longdouble), + ] + + @pytest.mark.parametrize("values", pair_cases) + def test_pair_correlation(self, values): + df = DataFrame(values.T, dtype=np.longdouble) + result = df.corr(method="pearson") + expected = DataFrame(np.corrcoef(values[0], values[1]), dtype=np.float64) + tm.assert_frame_equal(result, expected) + + multi_cases = [ + np.array( + [[1e12, 1e-8, 5.5], [1e12 + 1e-3, 2e-8, 5.50000001]], dtype=np.longdouble + ), + np.array( + [ + [1e12, 1e12 + 1e-3, 1e12 + 2e-3], + [1e12 + 2e-3, 1e12 + 3e-3, 1e12 + 4e-3], + [1e12 + 1e-2, 1e12 + 1e-2, 1e12 + 1e-2], + ], + dtype=np.longdouble, + ), + np.array([[1e-8, 2e-8], [2e-8, 3e-8], [0.0, 1e-12]], dtype=np.longdouble), + ] + + @pytest.mark.parametrize("values", multi_cases) + def test_multi_correlation(self, values): + df = DataFrame(values.T, dtype=np.longdouble) + result = df.corr(method="pearson") + expected = DataFrame( + np.corrcoef(values), + index=range(values.shape[0]), + columns=range(values.shape[0]), + dtype=np.float64, + ) + tm.assert_frame_equal(result, expected)