-
-
Notifications
You must be signed in to change notification settings - Fork 19.4k
BUG: for ordered categorical data implements correct computation of kendall/spearman correlations #62880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
BUG: for ordered categorical data implements correct computation of kendall/spearman correlations #62880
Changes from all commits
f692989
e5352f3
06af533
6399dcf
e1f640e
1c837f2
04abdea
da66575
0404900
2318aa8
c57f494
e658a7a
dbfb702
33d2357
3bea67b
5dc3926
77d15e6
8238cb9
12378e7
353af37
24db452
29e889a
0ef7ce3
0fe8879
9b68e60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| """ | ||
| Module for correlation related implementation | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING | ||
|
|
||
| import numpy as np | ||
|
|
||
| from pandas.core.dtypes.dtypes import CategoricalDtype | ||
|
|
||
| if TYPE_CHECKING: | ||
| from pandas import DataFrame | ||
|
|
||
|
|
||
| def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame: | ||
| """ | ||
| Replace ordered categoricals with their codes, making a shallow copy if necessary. | ||
| Every column whose dtype is an ordered ``CategoricalDtype`` is replaced by its | ||
| category codes, with each ``-1`` code turned into ``np.nan``; all other columns | ||
| (including unordered categoricals) are skipped. When no column qualifies, | ||
| ``df`` itself is returned. | ||
| """ | ||
|
|
||
| result = df | ||
| made_copy = False | ||
| # Walk the dtypes by position; idx is reused below for positional get/set. | ||
| for idx, dtype in enumerate(df.dtypes): | ||
| if not isinstance(dtype, CategoricalDtype) or not dtype.ordered: | ||
| continue | ||
| # Positional column access (not label-based). | ||
| # NOTE(review): presumably so duplicate column labels are handled - confirm. | ||
| col = result._ixs(idx, axis=1) | ||
| # copy(deep=False) is guarded by made_copy, so it runs at most once. | ||
| if not made_copy: | ||
| made_copy = True | ||
| result = result.copy(deep=False) | ||
| # Store the codes in place of the categorical, with -1 codes replaced by NaN. | ||
| result._iset_item(idx, col.cat.codes.replace(-1, np.nan)) | ||
| return result |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -252,6 +252,59 @@ def test_corr_numeric_only(self, meth, numeric_only): | |
| with pytest.raises(ValueError, match="could not convert string to float"): | ||
| df.corr(meth, numeric_only=numeric_only) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| @pytest.mark.parametrize("col1", ["ord_cat", "ord_cat_none", "ord_cat_shuff"]) | ||
| @pytest.mark.parametrize("col2", ["ord_cat", "ord_cat_none", "ord_cat_shuff"]) | ||
| @td.skip_if_no("scipy") | ||
| def test_corr_rank_ordered_categorical(self, method, col1, col2): | ||
| # GH #60306 | ||
| df = DataFrame( | ||
| { | ||
| "ord_cat": pd.Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
| "ord_cat_none": pd.Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| "ord_cat_shuff": pd.Categorical( | ||
| ["m", "h", "vh", "low"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
|
Comment on lines
+263
to
+277
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you use 2
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
| } | ||
| ) | ||
| corr_calc = df.corr(method=method) | ||
| corr_expected = df[col1].corr(df[col2], method=method) | ||
| tm.assert_almost_equal(corr_calc[col1][col2], corr_expected) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| @pytest.mark.parametrize("col1_idx", [0, 1, 2, 3, 4]) | ||
| @pytest.mark.parametrize("col2_idx", [0, 1, 2, 3, 4]) | ||
| @td.skip_if_no("scipy") | ||
| def test_corr_rank_ordered_categorical_duplicate_columns( | ||
| self, method, col1_idx, col2_idx | ||
| ): | ||
| # GH #60306 | ||
| cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True) | ||
| df = DataFrame( | ||
| { | ||
| "a": pd.array([1, 2, 3, 4], dtype=cat), | ||
| "b": pd.array([4, 3, 2, 1], dtype=cat), | ||
| "c": [4, 3, 2, 1], | ||
| "d": [10, 20, 30, 40], | ||
| "e": [100, 200, 300, 400], | ||
|
Comment on lines
+295
to
+299
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
| } | ||
| ) | ||
| df.columns = ["a", "a", "c", "c", "e"] | ||
|
|
||
| corr_calc = df.corr(method=method) | ||
| corr_expected = df.iloc[:, col1_idx].corr(df.iloc[:, col2_idx], method=method) | ||
| tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected) | ||
|
|
||
|
|
||
| class TestDataFrameCorrWith: | ||
| @pytest.mark.parametrize( | ||
|
|
@@ -493,3 +546,42 @@ def test_cov_with_missing_values(self): | |
| result2 = df.dropna().cov() | ||
| tm.assert_frame_equal(result1, expected) | ||
| tm.assert_frame_equal(result2, expected) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| @pytest.mark.parametrize("col", ["a", "b", "c", "d"]) | ||
| def test_corr_rank_ordered_categorical(self, method, col): | ||
| # GH #60306 | ||
| # DataFrame.corrwith must give, for each column label, the same value as | ||
| # Series.corr on the matching pair of columns from the two frames. | ||
| pytest.importorskip("scipy") | ||
| # df1: "a" and "b" are ordered categoricals ("b" has a missing value), | ||
| # "c" is integer and "d" is float. | ||
| df1 = DataFrame( | ||
| { | ||
| "a": pd.Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
| "b": pd.Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| "c": [0, 1, 2, 3], | ||
| "d": [2.0, 3.0, 4.5, 6.5], | ||
| } | ||
| ) | ||
|
|
||
| # df2: same labels; "a" is float with a NaN, "b" is an ordered categorical | ||
| # in a different row order, "c" is integer and "d" is float. | ||
| df2 = DataFrame( | ||
| { | ||
| "a": [2.0, 3.0, 4.5, np.nan], | ||
| "b": pd.Categorical( | ||
| ["m", "h", "vh", "low"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
| "c": [2, 3, 0, 1], | ||
| "d": [2.0, 3.0, 4.5, 6.5], | ||
| } | ||
| ) | ||
|
|
||
| corr_calc = df1.corrwith(df2, method=method) | ||
| corr_expected = df1[col].corr(df2[col], method=method) | ||
| tm.assert_almost_equal(corr_calc.get(col), corr_expected) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| """ | ||
| Tests for core/methods/corr.py | ||
| """ | ||
|
|
||
| import numpy as np | ||
| import pytest | ||
|
|
||
| from pandas import ( | ||
| Categorical, | ||
| DataFrame, | ||
| Series, | ||
| ) | ||
| import pandas._testing as tm | ||
| from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| ("input_df_dict", "expected_df_dict"), | ||
| [ | ||
| pytest.param( | ||
| # 1) Simple: two ordered categorical columns (with and without None) | ||
| { | ||
| "ord_cat": Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ), | ||
| "ord_cat_none": Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| }, | ||
| { | ||
| # codes: low=0, m=1, h=2, vh=3 | ||
| "ord_cat": Series([0, 1, 2, 3], dtype="int8"), | ||
| # codes: low=0, m=1, h=2, None -> NaN | ||
| "ord_cat_none": [0, 1.0, 2.0, np.nan], | ||
| }, | ||
| id="ordered-categoricals-basic", | ||
| ), | ||
| pytest.param( | ||
| # 2) Mixed dtypes: only the ordered categorical should change | ||
| { | ||
| "ordered": Categorical( | ||
| ["a", "c", "b"], | ||
| categories=["a", "b", "c"], | ||
| ordered=True, | ||
| ), | ||
| "unordered": Categorical(["x", "y", "x"], ordered=False), | ||
| "num": [10, 20, 30], | ||
| "text": ["u", "v", "w"], | ||
| }, | ||
| { | ||
| # codes: a=0, c=2, b=1 | ||
| "ordered": Series([0, 2, 1], dtype="int8"), | ||
| # unordered categorical should be untouched (still categorical) | ||
| "unordered": Categorical(["x", "y", "x"], ordered=False), | ||
| "num": [10, 20, 30], | ||
| "text": ["u", "v", "w"], | ||
| }, | ||
| id="mixed-types-only-ordered-changes", | ||
| ), | ||
| ], | ||
| ) | ||
| def test_transform_ord_cat_cols_to_coded_cols( | ||
| input_df_dict: dict, expected_df_dict: dict | ||
| ) -> None: | ||
| # GH #60306 | ||
| # Build the input and expected frames from the parametrized dicts, run the | ||
| # transform on the input, then compare column labels and full frame contents. | ||
| input_df = DataFrame(input_df_dict) | ||
| expected_df = DataFrame(expected_df_dict) | ||
| out_df = transform_ord_cat_cols_to_coded_cols(input_df) | ||
| assert list(out_df.columns) == list(expected_df.columns) | ||
| tm.assert_frame_equal(out_df, expected_df) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| ("input_df_dict", "expected_df_dict"), | ||
| [ | ||
| pytest.param( | ||
| { | ||
| "dup_1": Categorical( | ||
| ["low", "m", "h"], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| "dup_2": [5, 6, 7], | ||
| }, | ||
| { | ||
| # After transform: position 0 (ordered cat) becomes codes [0,1,2], | ||
| # position 1 remains untouched numbers [5,6,7]. | ||
| "dup_1": Series([0, 1, 2], dtype="int8"), | ||
| "dup_2": [5, 6, 7], | ||
| }, | ||
| id="duplicate-names-ordered-first", | ||
| ), | ||
| pytest.param( | ||
| { | ||
| "dup_1": ["a", "b", "c"], # non-categorical | ||
| "dup_2": Categorical( | ||
| ["p", "q", None], | ||
| categories=["p", "q"], | ||
| ordered=True, | ||
| ), | ||
| "dup_3": Categorical( | ||
| ["low", "m", "h"], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| }, | ||
| { | ||
| # First stays object; second turns into codes [0, 1, NaN] | ||
| # and third changes into codes [0, 1, 2] | ||
| "dup_1": ["a", "b", "c"], | ||
| "dup_2": [0.0, 1.0, np.nan], | ||
| "dup_3": Series([0, 1, 2], dtype="int8"), | ||
| }, | ||
| id="duplicate-names-ordered-and-non-categorical-and-none", | ||
| ), | ||
| ], | ||
| ) | ||
| def test_transform_ord_cat_cols_to_coded_cols_duplicated_col( | ||
| input_df_dict: dict, expected_df_dict: dict | ||
| ) -> None: | ||
| # GH #60306 | ||
| # The dicts use distinct keys only so the frames can be built; every column | ||
| # is then relabelled "dup" so the transform runs on duplicate column names. | ||
| input_df = DataFrame(input_df_dict) | ||
| expected_df = DataFrame(expected_df_dict) | ||
| input_df.columns = ["dup" for _ in input_df.columns] | ||
| expected_df.columns = ["dup" for _ in expected_df.columns] | ||
|
|
||
| out_df = transform_ord_cat_cols_to_coded_cols(input_df) | ||
| tm.assert_frame_equal(out_df, expected_df) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,8 @@ | |
| import numpy as np | ||
| import pytest | ||
|
|
||
| import pandas.util._test_decorators as td | ||
|
|
||
| import pandas as pd | ||
| from pandas import ( | ||
| Series, | ||
|
|
@@ -184,3 +186,45 @@ def test_corr_callable_method(self, datetime_series): | |
| df = pd.DataFrame([s1, s2]) | ||
| expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) | ||
| tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) | ||
|
|
||
| @td.skip_if_no("scipy") | ||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| @pytest.mark.parametrize( | ||
| "cat_series_inpt", | ||
| [ | ||
| pd.Categorical( # ordered cat series | ||
| ["low", "medium", "high"], | ||
| categories=["low", "medium", "high"], | ||
| ordered=True, | ||
| ), | ||
| pd.Categorical( # ordered cat series with NA | ||
| ["low", "medium", "high", None], | ||
| categories=["low", "medium", "high"], | ||
| ordered=True, | ||
| ), | ||
| ], | ||
| ) | ||
| @pytest.mark.parametrize( | ||
| "other_series_inpt", | ||
| [ | ||
| pd.Categorical( # other cat ordered series | ||
| ["m", "l", "h"], | ||
| categories=["l", "m", "h"], | ||
| ordered=True, | ||
| ), | ||
| # other non cat series | ||
| [2, 1, 3], | ||
| ], | ||
| ) | ||
| def test_corr_rank_ordered_categorical( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking it up into a few tests, or adding parametrization?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
| self, | ||
| method, | ||
| cat_series_inpt, | ||
| other_series_inpt, | ||
| ): | ||
| # GH #60306 | ||
| expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5} | ||
| cat_series = Series(cat_series_inpt) | ||
| other_series = Series(other_series_inpt) | ||
| corr_calc = cat_series.corr(other_series, method=method) | ||
| tm.assert_almost_equal(corr_calc, expected_corr[method]) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add `# GH #60306` to the start of each test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done