Skip to content

Commit

Permalink
FIX Ensure that index is correct when global transform_output is pand…
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasjpfan committed Jun 14, 2023
1 parent e5df5fe commit 96878ba
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 9 deletions.
12 changes: 12 additions & 0 deletions doc/whats_new/v1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,9 @@ Changelog
- |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`.
:pr:`25232` by :user:`Thijs van Weezel <ValueInvestorThijs>`.

- |Fix| :class:`impute.IterativeImputer` now correctly preserves the Pandas
Index when `set_config(transform_output="pandas")` is set. :pr:`26454` by `Thomas Fan`_.

:mod:`sklearn.inspection`
.........................

Expand Down Expand Up @@ -444,6 +447,12 @@ Changelog
on linearly separable problems.
:pr:`25214` by `Tom Dupre la Tour`_.

:mod:`sklearn.manifold`
.......................

- |Fix| :class:`manifold.Isomap` now correctly preserves the Pandas
Index when `set_config(transform_output="pandas")` is set. :pr:`26454` by `Thomas Fan`_.

:mod:`sklearn.metrics`
......................

Expand Down Expand Up @@ -636,6 +645,9 @@ Changelog
The `sample_interval_` attribute is deprecated and will be removed in 1.5.
:pr:`25190` by :user:`Vincent Maladière <Vincent-Maladiere>`.

- |Fix| :class:`preprocessing.PowerTransformer` now correctly preserves the Pandas
Index when `set_config(transform_output="pandas")` is set. :pr:`26454` by `Thomas Fan`_.

- |Fix| :class:`preprocessing.PowerTransformer` now correctly raises an error when
using `method="box-cox"` on data with a constant `np.nan` column.
:pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
Expand Down
2 changes: 1 addition & 1 deletion sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def _initial_imputation(self, X, in_fit=False):
strategy=self.initial_strategy,
fill_value=self.fill_value,
keep_empty_features=self.keep_empty_features,
)
).set_output(transform="default")
X_filled = self.initial_imputer_.fit_transform(X)
else:
X_filled = self.initial_imputer_.transform(X)
Expand Down
2 changes: 1 addition & 1 deletion sklearn/manifold/_isomap.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _fit_transform(self, X):
tol=self.tol,
max_iter=self.max_iter,
n_jobs=self.n_jobs,
)
).set_output(transform="default")

if self.n_neighbors is not None:
nbg = kneighbors_graph(
Expand Down
2 changes: 1 addition & 1 deletion sklearn/preprocessing/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3180,7 +3180,7 @@ def _fit(self, X, y=None, force_transform=False):
X[:, i] = transform_function(X[:, i], self.lambdas_[i])

if self.standardize:
self._scaler = StandardScaler(copy=False)
self._scaler = StandardScaler(copy=False).set_output(transform="default")
if force_transform:
X = self._scaler.fit_transform(X)
else:
Expand Down
19 changes: 13 additions & 6 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4424,7 +4424,7 @@ def _output_from_fit_transform(transformer, name, X, df, y):
return outputs


def _check_generated_dataframe(name, case, outputs_default, outputs_pandas):
def _check_generated_dataframe(name, case, index, outputs_default, outputs_pandas):
import pandas as pd

X_trans, feature_names_default = outputs_default
Expand All @@ -4434,7 +4434,12 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas):
# We always rely on the output of `get_feature_names_out` of the
# transformer used to generate the dataframe as a ground-truth of the
# columns.
expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False)
# If a dataframe is passed into transform, then the output should have the same
# index
expected_index = index if case.endswith("df") else None
expected_dataframe = pd.DataFrame(
X_trans, columns=feature_names_pandas, copy=False, index=expected_index
)

try:
pd.testing.assert_frame_equal(df_trans, expected_dataframe)
Expand Down Expand Up @@ -4469,7 +4474,8 @@ def check_set_output_transform_pandas(name, transformer_orig):
set_random_state(transformer)

feature_names_in = [f"col{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names_in, copy=False)
index = [f"index{i}" for i in range(X.shape[0])]
df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index)

transformer_default = clone(transformer).set_output(transform="default")
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
Expand All @@ -4483,7 +4489,7 @@ def check_set_output_transform_pandas(name, transformer_orig):

for case in outputs_default:
_check_generated_dataframe(
name, case, outputs_default[case], outputs_pandas[case]
name, case, index, outputs_default[case], outputs_pandas[case]
)


Expand Down Expand Up @@ -4511,7 +4517,8 @@ def check_global_ouptut_transform_pandas(name, transformer_orig):
set_random_state(transformer)

feature_names_in = [f"col{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names_in, copy=False)
index = [f"index{i}" for i in range(X.shape[0])]
df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index)

transformer_default = clone(transformer).set_output(transform="default")
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
Expand All @@ -4528,5 +4535,5 @@ def check_global_ouptut_transform_pandas(name, transformer_orig):

for case in outputs_default:
_check_generated_dataframe(
name, case, outputs_default[case], outputs_pandas[case]
name, case, index, outputs_default[case], outputs_pandas[case]
)

0 comments on commit 96878ba

Please sign in to comment.