Skip to content

Commit

Permalink
FEA Add writeable parameter to check_array (scikit-learn#29018)
Browse files Browse the repository at this point in the history
Co-authored-by: Olivier Grisel <[email protected]>
  • Loading branch information
jeremiedbb and ogrisel committed Jun 20, 2024
1 parent ba2c93b commit ef6efef
Show file tree
Hide file tree
Showing 19 changed files with 293 additions and 42 deletions.
26 changes: 18 additions & 8 deletions doc/whats_new/v1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,19 @@ Version 1.5.1

**TODO**

Changelog
---------

Changes impacting many modules
------------------------------

- |Fix| Fixed a regression in the validation of the input data of all estimators where
an unexpected error was raised when passing a DataFrame backed by a read-only buffer.
:pr:`29018` by :user:`Jérémie du Boisberranger <jeremiedbb>`.

- |Fix| Fixed a regression causing a dead-lock at import time in some settings.
:pr:`29235` by :user:`Jérémie du Boisberranger <jeremiedbb>`.

Changelog
---------

:mod:`sklearn.metrics`
......................

Expand All @@ -37,6 +41,10 @@ Changes impacting many modules
instead of implicitly converting those inputs as regular NumPy arrays.
:pr:`29119` by :user:`Olivier Grisel`.

- |Fix| Fix a regression in :func:`metrics.zero_one_loss` causing an error
for Array API dispatch with multilabel inputs.
:pr:`29269` by :user:`Yaroslav Korobko <Tialo>`.

:mod:`sklearn.model_selection`
..............................

Expand All @@ -48,12 +56,14 @@ Changes impacting many modules
grids that have estimators as parameter values.
:pr:`29179` by :user:`Marco Gorelli <MarcoGorelli>`.

:mod:`sklearn.metrics`
..............................
:mod:`sklearn.utils`
....................

- |Fix| Fix a regression in :func:`metrics.zero_one_loss` causing an error
for Array API dispatch with multilabel inputs.
:pr:`29269` by :user:`Yaroslav Korobko <Tialo>`.
- |API| :func:`utils.validation.check_array` has a new parameter, `force_writeable`, to
control the writeability of the output array. If set to `True`, the output array will
be guaranteed to be writeable and a copy will be made if the input array is read-only.
If set to `False`, no guarantee is made about the writeability of the output array.
:pr:`29018` by :user:`Jérémie du Boisberranger <jeremiedbb>`.

.. _changes_1_5:

Expand Down
9 changes: 3 additions & 6 deletions sklearn/cluster/_affinity_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,13 +502,10 @@ def fit(self, X, y=None):
Returns the instance itself.
"""
if self.affinity == "precomputed":
accept_sparse = False
else:
accept_sparse = "csr"
X = self._validate_data(X, accept_sparse=accept_sparse)
if self.affinity == "precomputed":
self.affinity_matrix_ = X.copy() if self.copy else X
X = self._validate_data(X, copy=self.copy, force_writeable=True)
self.affinity_matrix_ = X
else: # self.affinity == "euclidean"
X = self._validate_data(X, accept_sparse="csr")
self.affinity_matrix_ = -euclidean_distances(X, squared=True)

if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
Expand Down
5 changes: 4 additions & 1 deletion sklearn/cluster/_hdbscan/hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,14 +770,17 @@ def fit(self, X, y=None):
X,
accept_sparse=["csr", "lil"],
dtype=np.float64,
force_writeable=True,
)
else:
# Only non-sparse, precomputed distance matrices are handled here
# and thereby allowed to contain numpy.inf for missing distances

# Perform data validation after removing infinite values (numpy.inf)
# from the given distance matrix.
X = self._validate_data(X, force_all_finite=False, dtype=np.float64)
X = self._validate_data(
X, force_all_finite=False, dtype=np.float64, force_writeable=True
)
if np.isnan(X).any():
# TODO: Support np.nan in Cython implementation for precomputed
# dense HDBSCAN
Expand Down
26 changes: 22 additions & 4 deletions sklearn/cross_decomposition/_pls.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,19 @@ def fit(self, X, y=None, Y=None):

check_consistent_length(X, y)
X = self._validate_data(
X, dtype=np.float64, copy=self.copy, ensure_min_samples=2
X,
dtype=np.float64,
force_writeable=True,
copy=self.copy,
ensure_min_samples=2,
)
y = check_array(
y, input_name="y", dtype=np.float64, copy=self.copy, ensure_2d=False
y,
input_name="y",
dtype=np.float64,
force_writeable=True,
copy=self.copy,
ensure_2d=False,
)
if y.ndim == 1:
self._predict_1d = True
Expand Down Expand Up @@ -1056,10 +1065,19 @@ def fit(self, X, y=None, Y=None):
y = _deprecate_Y_when_required(y, Y)
check_consistent_length(X, y)
X = self._validate_data(
X, dtype=np.float64, copy=self.copy, ensure_min_samples=2
X,
dtype=np.float64,
force_writeable=True,
copy=self.copy,
ensure_min_samples=2,
)
y = check_array(
y, input_name="y", dtype=np.float64, copy=self.copy, ensure_2d=False
y,
input_name="y",
dtype=np.float64,
force_writeable=True,
copy=self.copy,
ensure_2d=False,
)
if y.ndim == 1:
y = y.reshape(-1, 1)
Expand Down
4 changes: 3 additions & 1 deletion sklearn/decomposition/_factor_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,9 @@ def fit(self, X, y=None):
self : object
FactorAnalysis class instance.
"""
X = self._validate_data(X, copy=self.copy, dtype=np.float64)
X = self._validate_data(
X, copy=self.copy, dtype=np.float64, force_writeable=True
)

n_samples, n_features = X.shape
n_components = self.n_components
Expand Down
7 changes: 6 additions & 1 deletion sklearn/decomposition/_incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def fit(self, X, y=None):
accept_sparse=["csr", "csc", "lil"],
copy=self.copy,
dtype=[np.float64, np.float32],
force_writeable=True,
)
n_samples, n_features = X.shape

Expand Down Expand Up @@ -277,7 +278,11 @@ def partial_fit(self, X, y=None, check_input=True):
"or use IncrementalPCA.fit to do so in batches."
)
X = self._validate_data(
X, copy=self.copy, dtype=[np.float64, np.float32], reset=first_pass
X,
copy=self.copy,
dtype=[np.float64, np.float32],
force_writeable=True,
reset=first_pass,
)
n_samples, n_features = X.shape
if first_pass:
Expand Down
1 change: 1 addition & 0 deletions sklearn/decomposition/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ def _fit(self, X):
X = self._validate_data(
X,
dtype=[xp.float64, xp.float32],
force_writeable=True,
accept_sparse=("csr", "csc"),
ensure_2d=True,
copy=False,
Expand Down
1 change: 1 addition & 0 deletions sklearn/impute/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ def _validate_input(self, X, in_fit):
reset=in_fit,
accept_sparse="csc",
dtype=dtype,
force_writeable=True if not in_fit else None,
force_all_finite=force_all_finite,
copy=self.copy,
)
Expand Down
1 change: 1 addition & 0 deletions sklearn/impute/_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ def transform(self, X):
X,
accept_sparse=False,
dtype=FLOAT_DTYPES,
force_writeable=True,
force_all_finite=force_all_finite,
copy=self.copy,
reset=False,
Expand Down
7 changes: 6 additions & 1 deletion sklearn/linear_model/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,12 @@ def fit(self, X, y, sample_weight=None):
accept_sparse = False if self.positive else ["csr", "csc", "coo"]

X, y = self._validate_data(
X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
X,
y,
accept_sparse=accept_sparse,
y_numeric=True,
multi_output=True,
force_writeable=True,
)

has_sw = sample_weight is not None
Expand Down
11 changes: 9 additions & 2 deletions sklearn/linear_model/_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,9 @@ def fit(self, X, y, sample_weight=None):
self : object
Returns the instance itself.
"""
X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)
X, y = self._validate_data(
X, y, dtype=[np.float64, np.float32], force_writeable=True, y_numeric=True
)
dtype = X.dtype

if sample_weight is not None:
Expand Down Expand Up @@ -620,7 +622,12 @@ def fit(self, X, y):
Fitted estimator.
"""
X, y = self._validate_data(
X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
X,
y,
dtype=[np.float64, np.float32],
force_writeable=True,
y_numeric=True,
ensure_min_samples=2,
)
dtype = X.dtype

Expand Down
4 changes: 4 additions & 0 deletions sklearn/linear_model/_coordinate_descent.py
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
accept_sparse="csc",
order="F",
dtype=[np.float64, np.float32],
force_writeable=True,
accept_large_sparse=False,
copy=X_copied,
multi_output=True,
Expand Down Expand Up @@ -1607,6 +1608,7 @@ def fit(self, X, y, sample_weight=None, **params):
check_X_params = dict(
accept_sparse="csc",
dtype=[np.float64, np.float32],
force_writeable=True,
copy=False,
accept_large_sparse=False,
)
Expand All @@ -1632,6 +1634,7 @@ def fit(self, X, y, sample_weight=None, **params):
accept_sparse="csc",
dtype=[np.float64, np.float32],
order="F",
force_writeable=True,
copy=copy_X,
)
X, y = self._validate_data(
Expand Down Expand Up @@ -2508,6 +2511,7 @@ def fit(self, X, y):
check_X_params = dict(
dtype=[np.float64, np.float32],
order="F",
force_writeable=True,
copy=self.copy_X and self.fit_intercept,
)
check_y_params = dict(ensure_2d=False, order="F")
Expand Down
8 changes: 5 additions & 3 deletions sklearn/linear_model/_least_angle.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,7 +1177,9 @@ def fit(self, X, y, Xy=None):
self : object
Returns an instance of self.
"""
X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
X, y = self._validate_data(
X, y, force_writeable=True, y_numeric=True, multi_output=True
)

alpha = getattr(self, "alpha", 0.0)
if hasattr(self, "n_nonzero_coefs"):
Expand Down Expand Up @@ -1718,7 +1720,7 @@ def fit(self, X, y, **params):
"""
_raise_for_params(params, self, "fit")

X, y = self._validate_data(X, y, y_numeric=True)
X, y = self._validate_data(X, y, force_writeable=True, y_numeric=True)
X = as_float_array(X, copy=self.copy_X)
y = as_float_array(y, copy=self.copy_X)

Expand Down Expand Up @@ -2235,7 +2237,7 @@ def fit(self, X, y, copy_X=None):
"""
if copy_X is None:
copy_X = self.copy_X
X, y = self._validate_data(X, y, y_numeric=True)
X, y = self._validate_data(X, y, force_writeable=True, y_numeric=True)

X, y, Xmean, ymean, Xstd = _preprocess_data(
X, y, fit_intercept=self.fit_intercept, copy=copy_X
Expand Down
2 changes: 2 additions & 0 deletions sklearn/linear_model/_ridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,7 @@ def fit(self, X, y, sample_weight=None):
y,
accept_sparse=_accept_sparse,
dtype=[xp.float64, xp.float32],
force_writeable=True,
multi_output=True,
y_numeric=True,
)
Expand Down Expand Up @@ -1290,6 +1291,7 @@ def _prepare_data(self, X, y, sample_weight, solver):
accept_sparse=accept_sparse,
multi_output=True,
y_numeric=False,
force_writeable=True,
)

self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
Expand Down
Loading

0 comments on commit ef6efef

Please sign in to comment.