diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d6af3c7b9917..29ab4d1f47170 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8458,6 +8458,28 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: # to avoid constructing two potentially large/sparse DataFrames join_columns = left.columns.join(right.columns, how="outer") + # GH#63288 Preserve ExtensionDtype (e.g. pyarrow) when reindexing + # introduces missing columns + from pandas.core.dtypes.base import ExtensionDtype + + missing_cols = [c for c in join_columns if c not in result.columns] + + if missing_cols: + for col in missing_cols: + src = left[col] if col in left.columns else right[col] + + if isinstance(src.dtype, ExtensionDtype): + # Create NA-filled Series with same ExtensionDtype + fill = src.iloc[:0].reindex(result.index) + else: + # Fallback to existing NumPy behavior + fill = self._constructor_sliced( + [np.nan] * len(result.index), + index=result.index, + ) + + result[col] = fill + if result.columns.has_duplicates: # Avoid reindexing with a duplicate axis. 
# https://github.com/pandas-dev/pandas/issues/35194 diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 3981b0fc4e23a..030afa05367ef 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2201,3 +2201,28 @@ def test_mixed_col_index_dtype(string_dtype_no_object): expected.columns = expected.columns.astype(string_dtype_no_object) tm.assert_frame_equal(result, expected)


def test_arith_reindex_with_pyarrow_dtype():
    # GH#63288 - adding two frames whose columns only partially overlap
    # must keep the pyarrow dtype on the columns that the reindex introduces
    pytest.importorskip("pyarrow")
    arrow_int = "int64[pyarrow]"

    base = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
    # left operand: every row, columns a and b
    left = base.iloc[:, :2].astype(arrow_int)
    # right operand: rows 1 and 2 only, columns b and c
    right = base.iloc[1:, 1:].astype(arrow_int)

    result = left + right

    # "a" and "c" each exist in only one operand, so they come back all-NA;
    # "b" is shared, but row 0 is absent from the right operand.
    all_na = [pd.NA, pd.NA, pd.NA]
    expected_values = {"a": all_na, "b": [pd.NA, 10, 12], "c": all_na}
    expected = DataFrame(
        {
            col: pd.array(values, dtype=arrow_int)
            for col, values in expected_values.items()
        }
    )
    tm.assert_frame_equal(result, expected)

    # Explicit per-column check that the extension dtype survived
    for col in ("a", "b", "c"):
        assert str(result[col].dtype) == arrow_int