Skip to content

Commit

Permalink
[MRG] ENH faster sample_without_replacement for recent numpy (scikit-…
Browse files Browse the repository at this point in the history
  • Loading branch information
jkarno authored and jnothman committed Oct 29, 2016
1 parent 061803c commit edc9e7f
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 9 deletions.
14 changes: 14 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,20 @@ Bug fixes
Version 0.18.1
==============

Enhancements
............
- Improved ``sample_without_replacement`` speed by utilizing
``numpy.random.permutation`` for most cases. As a result,
samples may differ in this release for a fixed random state.
Affected estimators:
- :class:`ensemble.BaggingClassifier`
- :class:`ensemble.BaggingRegressor`
- :class:`linear_model.RANSACRegressor`
- :class:`model_selection.RandomizedSearchCV`
- :class:`random_projection.SparseRandomProjection`
This also affects the :func:`datasets.make_classification`
function.

Bug fixes
.........

Expand Down
4 changes: 2 additions & 2 deletions sklearn/decomposition/truncated_svd.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ class TruncatedSVD(BaseEstimator, TransformerMixin):
TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7,
random_state=42, tol=0.0)
>>> print(svd.explained_variance_ratio_) # doctest: +ELLIPSIS
[ 0.0782... 0.0552... 0.0544... 0.0499... 0.0413...]
[ 0.0606... 0.0584... 0.0497... 0.0434... 0.0372...]
>>> print(svd.explained_variance_ratio_.sum()) # doctest: +ELLIPSIS
0.279...
0.249...
See also
--------
Expand Down
8 changes: 4 additions & 4 deletions sklearn/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,13 @@ class Pipeline(_BasePipeline):
Pipeline(steps=[...])
>>> prediction = anova_svm.predict(X)
>>> anova_svm.score(X, y) # doctest: +ELLIPSIS
0.77...
0.829...
>>> # getting the selected features chosen by anova_filter
>>> anova_svm.named_steps['anova'].get_support()
... # doctest: +NORMALIZE_WHITESPACE
array([ True, True, True, False, False, True, False, True, True, True,
False, False, True, False, True, False, False, False, False,
True], dtype=bool)
array([False, False, True, True, False, False, True, True, False,
True, False, True, True, False, True, False, True, True,
False, False], dtype=bool)
"""

# BaseEstimator interface
Expand Down
16 changes: 13 additions & 3 deletions sklearn/utils/_random.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,11 @@ cpdef sample_without_replacement(np.int_t n_population,
by `np.random`.
method : "auto", "tracking_selection", "reservoir_sampling" or "pool"
If method == "auto", an algorithm is automatically selected.
If method == "auto", the ratio of n_samples / n_population is used
to determine which algorithm to use:
If ratio is less than or equal to 0.01, tracking selection is used.
If ratio is strictly between 0.01 and 0.99, numpy.random.permutation
is used.
If ratio is greater than or equal to 0.99, reservoir sampling is used.
The order of the selected integers is undefined. If a random order is
desired, the selected subset should be shuffled.
Expand Down Expand Up @@ -276,11 +280,17 @@ cpdef sample_without_replacement(np.int_t n_population,

all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")

ratio = n_samples / n_population if n_population != 0.0 else 1.0

# Check ratio and use permutation unless ratio <= 0.01 or ratio >= 0.99
if method == "auto" and ratio > 0.01 and ratio < 0.99:
rng = check_random_state(random_state)
return rng.permutation(n_population)[:n_samples]

if method == "auto" or method == "tracking_selection":
# TODO the pool based method can also be used.
# however, it requires special benchmark to take into account
# the memory requirement of the array vs the set.
ratio = n_samples / n_population if n_population != 0.0 else 1.0

# The value 0.2 has been determined through benchmarking.
if ratio < 0.2:
Expand All @@ -296,7 +306,7 @@ cpdef sample_without_replacement(np.int_t n_population,

elif method == "pool":
return _sample_without_replacement_with_pool(n_population, n_samples,
random_state)
random_state)
else:
raise ValueError('Expected a method name in %s, got %s. '
% (all_methods, method))

0 comments on commit edc9e7f

Please sign in to comment.