diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 7de1ffbacbde6..6c2d0d7f44b24 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -239,6 +239,7 @@ Other enhancements
 - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
 - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support f-strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to the ``%`` format strings and callables (:issue:`49580`)
+- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
 - :meth:`Series.nlargest` uses stable sort internally and will preserve original ordering in the case of equality (:issue:`55767`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9d6af3c7b9917..5d73f2d9475ae 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -185,6 +185,7 @@
     treat_as_nested,
 )
 from pandas.core.methods import selectn
+from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
 from pandas.core.reshape.melt import melt
 from pandas.core.series import Series
 from pandas.core.shared_docs import _shared_docs
@@ -11718,6 +11719,10 @@ def corr(
         data = self._get_numeric_data() if numeric_only else self
         cols = data.columns
         idx = cols.copy()
+
+        if method in ("spearman", "kendall"):
+            data = transform_ord_cat_cols_to_coded_cols(data)
+
         mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
         if method == "pearson":
@@ -12007,6 +12012,8 @@ def corrwith(
             correl = num / dom
 
         elif method in ["kendall", "spearman"] or callable(method):
+            left = transform_ord_cat_cols_to_coded_cols(left)
+            right = transform_ord_cat_cols_to_coded_cols(right)
 
             def c(x):
                 return nanops.nancorr(x[0], x[1], method=method)
diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py
new file mode 100644
index 0000000000000..9d070b6dae652
--- /dev/null
+++ b/pandas/core/methods/corr.py
@@ -0,0 +1,32 @@
+"""
+Module for correlation related implementation
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+if TYPE_CHECKING:
+    from pandas import DataFrame
+
+
+def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
+    """
+    Replace ordered categoricals with their codes, making a shallow copy if necessary.
+    """
+
+    result = df
+    made_copy = False
+    for idx, dtype in enumerate(df.dtypes):
+        if not isinstance(dtype, CategoricalDtype) or not dtype.ordered:
+            continue
+        col = result._ixs(idx, axis=1)
+        if not made_copy:
+            made_copy = True
+            result = result.copy(deep=False)
+        result._iset_item(idx, col.cat.codes.replace(-1, np.nan))
+    return result
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 512c24cc02f60..d2bd20637bb29 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2750,6 +2750,12 @@ def corr(
         if len(this) == 0:
             return np.nan
 
+        if method in ("spearman", "kendall"):
+            if this.dtype == "category" and this.cat.ordered:
+                this = this.cat.codes.replace(-1, np.nan)
+            if other.dtype == "category" and other.cat.ordered:
+                other = other.cat.codes.replace(-1, np.nan)
+
         this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
         other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index a5ed2e86283e9..89376e55b84d1 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -252,6 +252,59 @@ def test_corr_numeric_only(self, meth, numeric_only):
             with pytest.raises(ValueError, match="could not convert string to float"):
                 df.corr(meth, numeric_only=numeric_only)
 
+    @pytest.mark.parametrize("method", ["kendall", "spearman"])
+    @pytest.mark.parametrize("col1", ["ord_cat", "ord_cat_none", "ord_cat_shuff"])
+    @pytest.mark.parametrize("col2", ["ord_cat", "ord_cat_none", "ord_cat_shuff"])
+    @td.skip_if_no("scipy")
+    def test_corr_rank_ordered_categorical(self, method, col1, col2):
+        # GH #60306
+        df = DataFrame(
+            {
+                "ord_cat": pd.Categorical(
+                    ["low", "m", "h", "vh"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
+                ),
+                "ord_cat_none": pd.Categorical(
+                    ["low", "m", "h", None],
+                    categories=["low", "m", "h"],
+                    ordered=True,
+                ),
+                "ord_cat_shuff": pd.Categorical(
+                    ["m", "h", "vh", "low"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
+                ),
+            }
+        )
+        corr_calc = df.corr(method=method)
+        corr_expected = df[col1].corr(df[col2], method=method)
+        tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)
+
+    @pytest.mark.parametrize("method", ["kendall", "spearman"])
+    @pytest.mark.parametrize("col1_idx", [0, 1, 2, 3, 4])
+    @pytest.mark.parametrize("col2_idx", [0, 1, 2, 3, 4])
+    @td.skip_if_no("scipy")
+    def test_corr_rank_ordered_categorical_duplicate_columns(
+        self, method, col1_idx, col2_idx
+    ):
+        # GH #60306
+        cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
+        df = DataFrame(
+            {
+                "a": pd.array([1, 2, 3, 4], dtype=cat),
+                "b": pd.array([4, 3, 2, 1], dtype=cat),
+                "c": [4, 3, 2, 1],
+                "d": [10, 20, 30, 40],
+                "e": [100, 200, 300, 400],
+            }
+        )
+        df.columns = ["a", "a", "c", "c", "e"]
+
+        corr_calc = df.corr(method=method)
+        corr_expected = df.iloc[:, col1_idx].corr(df.iloc[:, col2_idx], method=method)
+        tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected)
+
 
 class TestDataFrameCorrWith:
     @pytest.mark.parametrize(
@@ -493,3 +546,42 @@ def test_cov_with_missing_values(self):
         result2 = df.dropna().cov()
         tm.assert_frame_equal(result1, expected)
         tm.assert_frame_equal(result2, expected)
+
+    @pytest.mark.parametrize("method", ["kendall", "spearman"])
+    @pytest.mark.parametrize("col", ["a", "b", "c", "d"])
+    def test_corr_rank_ordered_categorical(self, method, col):
+        # GH #60306
+        pytest.importorskip("scipy")
+        df1 = DataFrame(
+            {
+                "a": pd.Categorical(
+                    ["low", "m", "h", "vh"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
+                ),
+                "b": pd.Categorical(
+                    ["low", "m", "h", None],
+                    categories=["low", "m", "h"],
+                    ordered=True,
+                ),
+                "c": [0, 1, 2, 3],
+                "d": [2.0, 3.0, 4.5, 6.5],
+            }
+        )
+
+        df2 = DataFrame(
+            {
+                "a": [2.0, 3.0, 4.5, np.nan],
+                "b": pd.Categorical(
+                    ["m", "h", "vh", "low"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
+                ),
+                "c": [2, 3, 0, 1],
+                "d": [2.0, 3.0, 4.5, 6.5],
+            }
+        )
+
+        corr_calc = df1.corrwith(df2, method=method)
+        corr_expected = df1[col].corr(df2[col], method=method)
+        tm.assert_almost_equal(corr_calc.get(col), corr_expected)
diff --git a/pandas/tests/methods/test_corr.py b/pandas/tests/methods/test_corr.py
new file mode 100644
index 0000000000000..7e301c0f06763
--- /dev/null
+++ b/pandas/tests/methods/test_corr.py
@@ -0,0 +1,132 @@
+"""
+Tests for core/methods/corr.py
+"""
+
+import numpy as np
+import pytest
+
+from pandas import (
+    Categorical,
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
+
+
+@pytest.mark.parametrize(
+    ("input_df_dict", "expected_df_dict"),
+    [
+        pytest.param(
+            # 1) Simple: two ordered categorical columns (with and without None)
+            {
+                "ord_cat": Categorical(
+                    ["low", "m", "h", "vh"],
+                    categories=["low", "m", "h", "vh"],
+                    ordered=True,
+                ),
+                "ord_cat_none": Categorical(
+                    ["low", "m", "h", None],
+                    categories=["low", "m", "h"],
+                    ordered=True,
+                ),
+            },
+            {
+                # codes: low=0, m=1, h=2, vh=3
+                "ord_cat": Series([0, 1, 2, 3], dtype="int8"),
+                # codes: low=0, m=1, h=2, None -> NaN
+                "ord_cat_none": [0, 1.0, 2.0, np.nan],
+            },
+            id="ordered-categoricals-basic",
+        ),
+        pytest.param(
+            # 2) Mixed dtypes: only the ordered categorical should change
+            {
+                "ordered": Categorical(
+                    ["a", "c", "b"],
+                    categories=["a", "b", "c"],
+                    ordered=True,
+                ),
+                "unordered": Categorical(["x", "y", "x"], ordered=False),
+                "num": [10, 20, 30],
+                "text": ["u", "v", "w"],
+            },
+            {
+                # codes: a=0, c=2, b=1
+                "ordered": Series([0, 2, 1], dtype="int8"),
+                # unordered categorical should be untouched (still categorical)
+                "unordered": Categorical(["x", "y", "x"], ordered=False),
+                "num": [10, 20, 30],
+                "text": ["u", "v", "w"],
+            },
+            id="mixed-types-only-ordered-changes",
+        ),
+    ],
+)
+def test_transform_ord_cat_cols_to_coded_cols(
+    input_df_dict: dict, expected_df_dict: dict
+) -> None:
+    # GH #60306
+    input_df = DataFrame(input_df_dict)
+    expected_df = DataFrame(expected_df_dict)
+    out_df = transform_ord_cat_cols_to_coded_cols(input_df)
+    assert list(out_df.columns) == list(expected_df.columns)
+    tm.assert_frame_equal(out_df, expected_df)
+
+
+@pytest.mark.parametrize(
+    ("input_df_dict", "expected_df_dict"),
+    [
+        pytest.param(
+            {
+                "dup_1": Categorical(
+                    ["low", "m", "h"],
+                    categories=["low", "m", "h"],
+                    ordered=True,
+                ),
+                "dup_2": [5, 6, 7],
+            },
+            {
+                # After transform: position 0 (ordered cat) becomes codes [0,1,2],
+                # position 1 remains untouched numbers [5,6,7].
+                "dup_1": Series([0, 1, 2], dtype="int8"),
+                "dup_2": [5, 6, 7],
+            },
+            id="duplicate-names-ordered-first",
+        ),
+        pytest.param(
+            {
+                "dup_1": ["a", "b", "c"],  # non-categorical
+                "dup_2": Categorical(
+                    ["p", "q", None],
+                    categories=["p", "q"],
+                    ordered=True,
+                ),
+                "dup_3": Categorical(
+                    ["low", "m", "h"],
+                    categories=["low", "m", "h"],
+                    ordered=True,
+                ),
+            },
+            {
+                # First stays object; second turns into codes [0, 1, NaN]
+                # and third changes into codes [0, 1, 2]
+                "dup_1": ["a", "b", "c"],
+                "dup_2": [0.0, 1.0, np.nan],
+                "dup_3": Series([0, 1, 2], dtype="int8"),
+            },
+            id="duplicate-names-ordered-and-non-categorical-and-none",
+        ),
+    ],
+)
+def test_transform_ord_cat_cols_to_coded_cols_duplicated_col(
+    input_df_dict: dict, expected_df_dict: dict
+) -> None:
+    # GH #60306
+    input_df = DataFrame(input_df_dict)
+    expected_df = DataFrame(expected_df_dict)
+    input_df.columns = ["dup" for _ in input_df.columns]
+    expected_df.columns = ["dup" for _ in expected_df.columns]
+
+    out_df = transform_ord_cat_cols_to_coded_cols(input_df)
+    tm.assert_frame_equal(out_df, expected_df)
diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py
index 7a4d48fb76940..fb5402bad3948 100644
--- a/pandas/tests/series/methods/test_cov_corr.py
+++ b/pandas/tests/series/methods/test_cov_corr.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Series,
@@ -184,3 +186,45 @@ def test_corr_callable_method(self, datetime_series):
         df = pd.DataFrame([s1, s2])
         expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
         tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)
+
+    @td.skip_if_no("scipy")
+    @pytest.mark.parametrize("method", ["kendall", "spearman"])
+    @pytest.mark.parametrize(
+        "cat_series_inpt",
+        [
+            pd.Categorical(  # ordered cat series
+                ["low", "medium", "high"],
+                categories=["low", "medium", "high"],
+                ordered=True,
+            ),
+            pd.Categorical(  # ordered cat series with NA
+                ["low", "medium", "high", None],
+                categories=["low", "medium", "high"],
+                ordered=True,
+            ),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "other_series_inpt",
+        [
+            pd.Categorical(  # other cat ordered series
+                ["m", "l", "h"],
+                categories=["l", "m", "h"],
+                ordered=True,
+            ),
+            # other non cat series
+            [2, 1, 3],
+        ],
+    )
+    def test_corr_rank_ordered_categorical(
+        self,
+        method,
+        cat_series_inpt,
+        other_series_inpt,
+    ):
+        # GH #60306
+        expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
+        cat_series = Series(cat_series_inpt)
+        other_series = Series(other_series_inpt)
+        corr_calc = cat_series.corr(other_series, method=method)
+        tm.assert_almost_equal(corr_calc, expected_corr[method])