[MRG] ENH faster sample_without_replacement for recent numpy (scikit-learn#7703)
neurodata · Oct 29, 2016 · edc9e7f · edc9e7f
1 parent 061803c
commit edc9e7f
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 9 deletions.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -90,6 +90,20 @@ Bug fixes
 Version 0.18.1
 ==============
 
+Enhancements
+............
+ - Improved ``sample_without_replacement`` speed by utilizing
+ numpy.random.permutation for most cases. As a result,
+ samples may differ in this release for a fixed random state.
+ Affected estimators:
+ - :class:`ensemble.BaggingClassifier`
+ - :class:`ensemble.BaggingRegressor`
+ - :class:`linear_model.RANSACRegressor`
+ - :class:`model_selection.RandomizedSearchCV`
+ - :class:`random_projection.SparseRandomProjection`
+ This also affects the :func:`datasets.make_classification`
+ function.
+
 Bug fixes
 .........
 

diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py
@@ -88,9 +88,9 @@ class TruncatedSVD(BaseEstimator, TransformerMixin):
  TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7,
  random_state=42, tol=0.0)
  >>> print(svd.explained_variance_ratio_) # doctest: +ELLIPSIS
- [ 0.0782... 0.0552... 0.0544... 0.0499... 0.0413...]
+ [ 0.0606... 0.0584... 0.0497... 0.0434... 0.0372...]
  >>> print(svd.explained_variance_ratio_.sum()) # doctest: +ELLIPSIS
- 0.279...
+ 0.249...
 
  See also
  --------

diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
@@ -136,13 +136,13 @@ class Pipeline(_BasePipeline):
  Pipeline(steps=[...])
  >>> prediction = anova_svm.predict(X)
  >>> anova_svm.score(X, y) # doctest: +ELLIPSIS
- 0.77...
+ 0.829...
  >>> # getting the selected features chosen by anova_filter
  >>> anova_svm.named_steps['anova'].get_support()
  ... # doctest: +NORMALIZE_WHITESPACE
- array([ True, True, True, False, False,  True, False, True,  True, True,
- False, False, True, False, True, False, False, False, False,
- True], dtype=bool)
+ array([False, False, True, True, False, False, True,  True, False,
+ True, False, True,  True, False, True, False, True, True,
+ False, False], dtype=bool)
  """
 
  # BaseEstimator interface

diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx
@@ -248,7 +248,11 @@ cpdef sample_without_replacement(np.int_t n_population,
  by `np.random`.
 
  method : "auto", "tracking_selection", "reservoir_sampling" or "pool"
- If method == "auto", an algorithm is automatically selected.
+ If method == "auto", the ratio of n_samples / n_population is used
+ to determine which algorithm to use:
+ If ratio is between 0 and 0.01, tracking selection is used.
+ If ratio is between 0.01 and 0.99, numpy.random.permutation is used.
+ If ratio is greater than 0.99, reservoir sampling is used.
  The order of the selected integers is undefined. If a random order is
  desired, the selected subset should be shuffled.
 
@@ -276,11 +280,17 @@ cpdef sample_without_replacement(np.int_t n_population,
 
  all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
 
+ ratio = n_samples / n_population if n_population != 0.0 else 1.0
+
+ # Check ratio and use permutation unless ratio <= 0.01 or ratio >= 0.99
+ if method == "auto" and ratio > 0.01 and ratio < 0.99:
+ rng = check_random_state(random_state)
+ return rng.permutation(n_population)[:n_samples]
+
  if method == "auto" or method == "tracking_selection":
  # TODO the pool based method can also be used.
  # however, it requires special benchmark to take into account
  # the memory requirement of the array vs the set.
- ratio = n_samples / n_population if n_population != 0.0 else 1.0
 
  # The value 0.2 has been determined through benchmarking.
  if ratio < 0.2:
@@ -296,7 +306,7 @@ cpdef sample_without_replacement(np.int_t n_population,
 
  elif method == "pool":
  return _sample_without_replacement_with_pool(n_population, n_samples,
- random_state)
+  random_state)
  else:
  raise ValueError('Expected a method name in %s, got %s. '
  % (all_methods, method))