FEA Add force_writeable parameter to check_array (scikit-learn#29018)

Co-authored-by: Olivier Grisel <[email protected]>
neurodata · Jun 20, 2024 · ef6efef · ef6efef
1 parent ba2c93b
commit ef6efef
Show file tree

Hide file tree

Showing 19 changed files with 293 additions and 42 deletions.
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -20,15 +20,19 @@ Version 1.5.1
 
 **TODO**
 
-Changelog
----------
-
 Changes impacting many modules
 ------------------------------
 
+- |Fix| Fixed a regression in the validation of the input data of all estimators where
+ an unexpected error was raised when passing a DataFrame backed by a read-only buffer.
+ :pr:`29018` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 - |Fix| Fixed a regression causing a dead-lock at import time in some settings.
  :pr:`29235` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+Changelog
+---------
+
 :mod:`sklearn.metrics`
 ......................
 
@@ -37,6 +41,10 @@ Changes impacting many modules
  instead of implicitly converting those inputs as regular NumPy arrays.
  :pr:`29119` by :user:`Olivier Grisel`.
 
+- |Fix| Fix a regression in :func:`metrics.zero_one_loss` causing an error
+ for Array API dispatch with multilabel inputs.
+ :pr:`29269` by :user:`Yaroslav Korobko <Tialo>`.
+
 :mod:`sklearn.model_selection`
 ..............................
 
@@ -48,12 +56,14 @@ Changes impacting many modules
  grids that have estimators as parameter values.
  :pr:`29179` by :user:`Marco Gorelli<MarcoGorelli>`.
 
-:mod:`sklearn.metrics`
-..............................
+:mod:`sklearn.utils`
+....................
 
-- |Fix| Fix a regression in :func:`metrics.zero_one_loss` causing an error
- for Array API dispatch with multilabel inputs.
- :pr:`29269` by :user:`Yaroslav Korobko <Tialo>`.
+- |API| :func:`utils.validation.check_array` has a new parameter, `force_writeable`, to
+ control the writeability of the output array. If set to `True`, the output array will
+ be guaranteed to be writeable and a copy will be made if the input array is read-only.
+ If set to `False`, no guarantee is made about the writeability of the output array.
+ :pr:`29018` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
 .. _changes_1_5:
 

diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py
@@ -502,13 +502,10 @@ def fit(self, X, y=None):
  Returns the instance itself.
  """
  if self.affinity == "precomputed":
- accept_sparse = False
- else:
- accept_sparse = "csr"
- X = self._validate_data(X, accept_sparse=accept_sparse)
- if self.affinity == "precomputed":
- self.affinity_matrix_ = X.copy() if self.copy else X
+ X = self._validate_data(X, copy=self.copy, force_writeable=True)
+ self.affinity_matrix_ = X
  else: # self.affinity == "euclidean"
+ X = self._validate_data(X, accept_sparse="csr")
  self.affinity_matrix_ = -euclidean_distances(X, squared=True)
 
  if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:

diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -770,14 +770,17 @@ def fit(self, X, y=None):
  X,
  accept_sparse=["csr", "lil"],
  dtype=np.float64,
+ force_writeable=True,
  )
  else:
  # Only non-sparse, precomputed distance matrices are handled here
  # and thereby allowed to contain numpy.inf for missing distances
 
 # Validate the given distance matrix while still permitting infinite
 # values (force_all_finite=False); NaN entries are checked separately below.
- X = self._validate_data(X, force_all_finite=False, dtype=np.float64)
+ X = self._validate_data(
+ X, force_all_finite=False, dtype=np.float64, force_writeable=True
+ )
  if np.isnan(X).any():
  # TODO: Support np.nan in Cython implementation for precomputed
  # dense HDBSCAN

diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
@@ -263,10 +263,19 @@ def fit(self, X, y=None, Y=None):
 
  check_consistent_length(X, y)
  X = self._validate_data(
- X, dtype=np.float64, copy=self.copy, ensure_min_samples=2
+ X,
+ dtype=np.float64,
+ force_writeable=True,
+ copy=self.copy,
+ ensure_min_samples=2,
  )
  y = check_array(
- y, input_name="y", dtype=np.float64, copy=self.copy, ensure_2d=False
+ y,
+ input_name="y",
+ dtype=np.float64,
+ force_writeable=True,
+ copy=self.copy,
+ ensure_2d=False,
  )
  if y.ndim == 1:
  self._predict_1d = True
@@ -1056,10 +1065,19 @@ def fit(self, X, y=None, Y=None):
  y = _deprecate_Y_when_required(y, Y)
  check_consistent_length(X, y)
  X = self._validate_data(
- X, dtype=np.float64, copy=self.copy, ensure_min_samples=2
+ X,
+ dtype=np.float64,
+ force_writeable=True,
+ copy=self.copy,
+ ensure_min_samples=2,
  )
  y = check_array(
- y, input_name="y", dtype=np.float64, copy=self.copy, ensure_2d=False
+ y,
+ input_name="y",
+ dtype=np.float64,
+ force_writeable=True,
+ copy=self.copy,
+ ensure_2d=False,
  )
  if y.ndim == 1:
  y = y.reshape(-1, 1)

diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py
@@ -216,7 +216,9 @@ def fit(self, X, y=None):
  self : object
  FactorAnalysis class instance.
  """
- X = self._validate_data(X, copy=self.copy, dtype=np.float64)
+ X = self._validate_data(
+ X, copy=self.copy, dtype=np.float64, force_writeable=True
+ )
 
  n_samples, n_features = X.shape
  n_components = self.n_components

diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py
@@ -228,6 +228,7 @@ def fit(self, X, y=None):
  accept_sparse=["csr", "csc", "lil"],
  copy=self.copy,
  dtype=[np.float64, np.float32],
+ force_writeable=True,
  )
  n_samples, n_features = X.shape
 
@@ -277,7 +278,11 @@ def partial_fit(self, X, y=None, check_input=True):
  "or use IncrementalPCA.fit to do so in batches."
  )
  X = self._validate_data(
- X, copy=self.copy, dtype=[np.float64, np.float32], reset=first_pass
+ X,
+ copy=self.copy,
+ dtype=[np.float64, np.float32],
+ force_writeable=True,
+ reset=first_pass,
  )
  n_samples, n_features = X.shape
  if first_pass:

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
@@ -505,6 +505,7 @@ def _fit(self, X):
  X = self._validate_data(
  X,
  dtype=[xp.float64, xp.float32],
+ force_writeable=True,
  accept_sparse=("csr", "csc"),
  ensure_2d=True,
  copy=False,

diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
@@ -333,6 +333,7 @@ def _validate_input(self, X, in_fit):
  reset=in_fit,
  accept_sparse="csc",
  dtype=dtype,
+ force_writeable=True if not in_fit else None,
  force_all_finite=force_all_finite,
  copy=self.copy,
  )

diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
@@ -269,6 +269,7 @@ def transform(self, X):
  X,
  accept_sparse=False,
  dtype=FLOAT_DTYPES,
+ force_writeable=True,
  force_all_finite=force_all_finite,
  copy=self.copy,
  reset=False,

diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py
@@ -598,7 +598,12 @@ def fit(self, X, y, sample_weight=None):
  accept_sparse = False if self.positive else ["csr", "csc", "coo"]
 
  X, y = self._validate_data(
- X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
+ X,
+ y,
+ accept_sparse=accept_sparse,
+ y_numeric=True,
+ multi_output=True,
+ force_writeable=True,
  )
 
  has_sw = sample_weight is not None

diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py
@@ -235,7 +235,9 @@ def fit(self, X, y, sample_weight=None):
  self : object
  Returns the instance itself.
  """
- X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)
+ X, y = self._validate_data(
+ X, y, dtype=[np.float64, np.float32], force_writeable=True, y_numeric=True
+ )
  dtype = X.dtype
 
  if sample_weight is not None:
@@ -620,7 +622,12 @@ def fit(self, X, y):
  Fitted estimator.
  """
  X, y = self._validate_data(
- X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
+ X,
+ y,
+ dtype=[np.float64, np.float32],
+ force_writeable=True,
+ y_numeric=True,
+ ensure_min_samples=2,
  )
  dtype = X.dtype
 

diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py
@@ -979,6 +979,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
  accept_sparse="csc",
  order="F",
  dtype=[np.float64, np.float32],
+ force_writeable=True,
  accept_large_sparse=False,
  copy=X_copied,
  multi_output=True,
@@ -1607,6 +1608,7 @@ def fit(self, X, y, sample_weight=None, **params):
  check_X_params = dict(
  accept_sparse="csc",
  dtype=[np.float64, np.float32],
+ force_writeable=True,
  copy=False,
  accept_large_sparse=False,
  )
@@ -1632,6 +1634,7 @@ def fit(self, X, y, sample_weight=None, **params):
  accept_sparse="csc",
  dtype=[np.float64, np.float32],
  order="F",
+ force_writeable=True,
  copy=copy_X,
  )
  X, y = self._validate_data(
@@ -2508,6 +2511,7 @@ def fit(self, X, y):
  check_X_params = dict(
  dtype=[np.float64, np.float32],
  order="F",
+ force_writeable=True,
  copy=self.copy_X and self.fit_intercept,
  )
  check_y_params = dict(ensure_2d=False, order="F")

diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py
@@ -1177,7 +1177,9 @@ def fit(self, X, y, Xy=None):
  self : object
  Returns an instance of self.
  """
- X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
+ X, y = self._validate_data(
+ X, y, force_writeable=True, y_numeric=True, multi_output=True
+ )
 
  alpha = getattr(self, "alpha", 0.0)
  if hasattr(self, "n_nonzero_coefs"):
@@ -1718,7 +1720,7 @@ def fit(self, X, y, **params):
  """
  _raise_for_params(params, self, "fit")
 
- X, y = self._validate_data(X, y, y_numeric=True)
+ X, y = self._validate_data(X, y, force_writeable=True, y_numeric=True)
  X = as_float_array(X, copy=self.copy_X)
  y = as_float_array(y, copy=self.copy_X)
 
@@ -2235,7 +2237,7 @@ def fit(self, X, y, copy_X=None):
  """
  if copy_X is None:
  copy_X = self.copy_X
- X, y = self._validate_data(X, y, y_numeric=True)
+ X, y = self._validate_data(X, y, force_writeable=True, y_numeric=True)
 
  X, y, Xmean, ymean, Xstd = _preprocess_data(
  X, y, fit_intercept=self.fit_intercept, copy=copy_X

diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
@@ -1241,6 +1241,7 @@ def fit(self, X, y, sample_weight=None):
  y,
  accept_sparse=_accept_sparse,
  dtype=[xp.float64, xp.float32],
+ force_writeable=True,
  multi_output=True,
  y_numeric=True,
  )
@@ -1290,6 +1291,7 @@ def _prepare_data(self, X, y, sample_weight, solver):
  accept_sparse=accept_sparse,
  multi_output=True,
  y_numeric=False,
+ force_writeable=True,
  )
 
  self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)