From e0ebc7839153da72e091f385ee4e6d4df51f96ef Mon Sep 17 00:00:00 2001
From: genvalen
Date: Wed, 23 Feb 2022 08:44:25 -0500
Subject: [PATCH] DOC Update documentation of gradient boosting estimators w/
 ranges (#22153)

Co-authored-by: Thomas J. Fan
---
 sklearn/ensemble/_gb.py | 81 +++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py
index 0381537d2b48a..4818c8f7c9f1b 100644
--- a/sklearn/ensemble/_gb.py
+++ b/sklearn/ensemble/_gb.py
@@ -989,11 +989,13 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     learning_rate : float, default=0.1
         Learning rate shrinks the contribution of each tree by
         `learning_rate`. There is a trade-off between learning_rate and
         n_estimators.
+        Values must be in the range `(0.0, inf)`.
 
     n_estimators : int, default=100
         The number of boosting stages to perform. Gradient boosting
         is fairly robust to over-fitting so a large number usually
         results in better performance.
+        Values must be in the range `[1, inf)`.
 
     subsample : float, default=1.0
         The fraction of samples to be used for fitting the individual base
@@ -1001,6 +1003,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         Boosting. `subsample` interacts with the parameter `n_estimators`.
         Choosing `subsample < 1.0` leads to a reduction of variance
         and an increase in bias.
+        Values must be in the range `(0.0, 1.0]`.
 
     criterion : {'friedman_mse', 'squared_error', 'mse'}, \
             default='friedman_mse'
@@ -1019,10 +1022,9 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     min_samples_split : int or float, default=2
         The minimum number of samples required to split an internal node:
 
-        - If int, then consider `min_samples_split` as the minimum number.
-        - If float, then `min_samples_split` is a fraction and
-          `ceil(min_samples_split * n_samples)` are the minimum
-          number of samples for each split.
+        - If int, values must be in the range `[2, inf)`.
+        - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split`
+          will be `ceil(min_samples_split * n_samples)`.
 
         .. versionchanged:: 0.18
            Added float values for fractions.
@@ -1034,10 +1036,9 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         right branches. This may have the effect of smoothing the model,
         especially in regression.
 
-        - If int, then consider `min_samples_leaf` as the minimum number.
-        - If float, then `min_samples_leaf` is a fraction and
-          `ceil(min_samples_leaf * n_samples)` are the minimum
-          number of samples for each node.
+        - If int, values must be in the range `[1, inf)`.
+        - If float, values must be in the range `(0.0, 1.0]` and `min_samples_leaf`
+          will be `ceil(min_samples_leaf * n_samples)`.
 
         .. versionchanged:: 0.18
            Added float values for fractions.
@@ -1046,16 +1047,19 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         The minimum weighted fraction of the sum total of weights (of all
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
+        Values must be in the range `[0.0, 0.5]`.
 
     max_depth : int, default=3
         The maximum depth of the individual regression estimators. The maximum
         depth limits the number of nodes in the tree. Tune this parameter
         for best performance; the best value depends on the interaction
         of the input variables.
+        Values must be in the range `[1, inf)`.
 
     min_impurity_decrease : float, default=0.0
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
+        Values must be in the range `[0.0, inf)`.
 
         The weighted impurity decrease equation is the following::
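The float forms of `min_samples_split` and `min_samples_leaf` documented above are fractions of the training set, converted with `ceil`. A minimal sketch of that conversion; the `n_samples` value here is hypothetical, chosen only for illustration::

    from math import ceil

    n_samples = 1000          # hypothetical training-set size
    min_samples_split = 0.01  # float form, must lie in (0.0, 1.0]

    # Per the docstring, the effective minimum number of samples
    # required to split a node is ceil(min_samples_split * n_samples).
    print(ceil(min_samples_split * n_samples))  # -> 10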
@@ -1090,10 +1094,9 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None
         The number of features to consider when looking for the best split:
 
-        - If int, then consider `max_features` features at each split.
-        - If float, then `max_features` is a fraction and
-          `int(max_features * n_features)` features are considered at each
-          split.
+        - If int, values must be in the range `[1, inf)`.
+        - If float, values must be in the range `(0.0, 1.0]` and the features
+          considered at each split will be `int(max_features * n_features)`.
         - If 'auto', then `max_features=sqrt(n_features)`.
         - If 'sqrt', then `max_features=sqrt(n_features)`.
         - If 'log2', then `max_features=log2(n_features)`.
@@ -1110,11 +1113,13 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         Enable verbose output. If 1 then it prints progress and performance
         once in a while (the more trees the lower the frequency). If greater
         than 1 then it prints progress and performance for every tree.
+        Values must be in the range `[0, inf)`.
 
     max_leaf_nodes : int, default=None
         Grow trees with ``max_leaf_nodes`` in best-first fashion.
         Best nodes are defined as relative reduction in impurity.
-        If None then unlimited number of leaf nodes.
+        Values must be in the range `[2, inf)`.
+        If `None`, then unlimited number of leaf nodes.
 
     warm_start : bool, default=False
         When set to ``True``, reuse the solution of the previous call to fit
@@ -1123,7 +1128,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
 
     validation_fraction : float, default=0.1
         The proportion of training data to set aside as validation set for
-        early stopping. Must be between 0 and 1.
+        early stopping. Values must be in the range `(0.0, 1.0)`.
         Only used if ``n_iter_no_change`` is set to an integer.
 
         .. versionadded:: 0.20
@@ -1136,6 +1141,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         data as validation and terminate training when validation score is not
         improving in all of the previous ``n_iter_no_change`` numbers of
         iterations. The split is stratified.
+        Values must be in the range `[1, inf)`.
 
         .. versionadded:: 0.20
 
@@ -1143,14 +1149,16 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         Tolerance for the early stopping. When the loss is not improving
         by at least tol for ``n_iter_no_change`` iterations (if set to a
         number), the training stops.
+        Values must be in the range `(0.0, inf)`.
 
         .. versionadded:: 0.20
 
     ccp_alpha : non-negative float, default=0.0
         Complexity parameter used for Minimal Cost-Complexity Pruning. The
         subtree with the largest cost complexity that is smaller than
-        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
-        :ref:`minimal_cost_complexity_pruning` for details.
+        ``ccp_alpha`` will be chosen. By default, no pruning is performed.
+        Values must be in the range `[0.0, inf)`.
+        See :ref:`minimal_cost_complexity_pruning` for details.
 
         .. versionadded:: 0.22
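Taken together, `validation_fraction`, `n_iter_no_change`, and `tol` control early stopping for the classifier. A short usage sketch with every value inside the documented ranges; the dataset and settings are illustrative, not taken from the patch::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_classification(n_samples=1000, random_state=0)

    clf = GradientBoostingClassifier(
        n_estimators=500,         # [1, inf)
        learning_rate=0.1,        # (0.0, inf)
        subsample=0.8,            # (0.0, 1.0]
        validation_fraction=0.1,  # (0.0, 1.0)
        n_iter_no_change=5,       # [1, inf)
        tol=1e-4,                 # (0.0, inf)
        random_state=0,
    ).fit(X, y)

    # Early stopping usually halts well before the 500-stage budget;
    # n_estimators_ holds the number of stages actually fitted.
    print(clf.n_estimators_)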
@@ -1548,11 +1556,13 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
     learning_rate : float, default=0.1
         Learning rate shrinks the contribution of each tree by
         `learning_rate`. There is a trade-off between learning_rate and
         n_estimators.
+        Values must be in the range `(0.0, inf)`.
 
     n_estimators : int, default=100
         The number of boosting stages to perform. Gradient boosting
         is fairly robust to over-fitting so a large number usually
         results in better performance.
+        Values must be in the range `[1, inf)`.
 
     subsample : float, default=1.0
         The fraction of samples to be used for fitting the individual base
@@ -1560,6 +1570,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
         Boosting. `subsample` interacts with the parameter `n_estimators`.
         Choosing `subsample < 1.0` leads to a reduction of variance
         and an increase in bias.
+        Values must be in the range `(0.0, 1.0]`.
 
     criterion : {'friedman_mse', 'squared_error', 'mse'}, \
             default='friedman_mse'
@@ -1578,10 +1589,9 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
     min_samples_split : int or float, default=2
         The minimum number of samples required to split an internal node:
 
-        - If int, then consider `min_samples_split` as the minimum number.
-        - If float, then `min_samples_split` is a fraction and
-          `ceil(min_samples_split * n_samples)` are the minimum
-          number of samples for each split.
+        - If int, values must be in the range `[2, inf)`.
+        - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split`
+          will be `ceil(min_samples_split * n_samples)`.
 
         .. versionchanged:: 0.18
            Added float values for fractions.
@@ -1593,10 +1603,9 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
         right branches. This may have the effect of smoothing the model,
         especially in regression.
 
-        - If int, then consider `min_samples_leaf` as the minimum number.
-        - If float, then `min_samples_leaf` is a fraction and
-          `ceil(min_samples_leaf * n_samples)` are the minimum
-          number of samples for each node.
+        - If int, values must be in the range `[1, inf)`.
+        - If float, values must be in the range `(0.0, 1.0]` and `min_samples_leaf`
+          will be `ceil(min_samples_leaf * n_samples)`.
 
         .. versionchanged:: 0.18
            Added float values for fractions.
@@ -1605,16 +1614,19 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
         The minimum weighted fraction of the sum total of weights (of all
         the input samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided.
+        Values must be in the range `[0.0, 0.5]`.
 
     max_depth : int, default=3
         Maximum depth of the individual regression estimators. The maximum
         depth limits the number of nodes in the tree. Tune this parameter
         for best performance; the best value depends on the interaction
         of the input variables.
+        Values must be in the range `[1, inf)`.
 
     min_impurity_decrease : float, default=0.0
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
+        Values must be in the range `[0.0, inf)`.
 
         The weighted impurity decrease equation is the following::
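One motivation for spelling out these ranges is that values outside them are rejected. A sketch, assuming a scikit-learn release of this era, where the checks run when `fit` is called rather than at construction::

    from sklearn.datasets import make_regression
    from sklearn.ensemble import GradientBoostingRegressor

    X, y = make_regression(n_samples=100, random_state=0)

    # learning_rate=0.0 falls outside the documented range (0.0, inf),
    # so fit() raises a ValueError describing the violated constraint.
    try:
        GradientBoostingRegressor(learning_rate=0.0).fit(X, y)
    except ValueError as exc:
        print(exc)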
- If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. @@ -1669,16 +1680,19 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): alpha : float, default=0.9 The alpha-quantile of the huber loss function and the quantile loss function. Only if ``loss='huber'`` or ``loss='quantile'``. + Values must be in the range `(0.0, 1.0)`. verbose : int, default=0 Enable verbose output. If 1 then it prints progress and performance once in a while (the more trees the lower the frequency). If greater than 1 then it prints progress and performance for every tree. + Values must be in the range `[0, inf)`. max_leaf_nodes : int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. - If None then unlimited number of leaf nodes. + Values must be in the range `[2, inf)`. + If None, then unlimited number of leaf nodes. warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit @@ -1687,7 +1701,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for - early stopping. Must be between 0 and 1. + early stopping. Values must be in the range `(0.0, 1.0)`. Only used if ``n_iter_no_change`` is set to an integer. .. versionadded:: 0.20 @@ -1700,6 +1714,7 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): data as validation and terminate training when validation score is not improving in all of the previous ``n_iter_no_change`` numbers of iterations. + Values must be in the range `[1, inf)`. .. versionadded:: 0.20 @@ -1707,14 +1722,16 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): Tolerance for the early stopping. When the loss is not improving by at least tol for ``n_iter_no_change`` iterations (if set to a number), the training stops. + Values must be in the range `(0.0, inf)`. .. versionadded:: 0.20 ccp_alpha : non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than - ``ccp_alpha`` will be chosen. By default, no pruning is performed. See - :ref:`minimal_cost_complexity_pruning` for details. + ``ccp_alpha`` will be chosen. By default, no pruning is performed. + Values must be in the range `[0.0, inf)`. + See :ref:`minimal_cost_complexity_pruning` for details. .. versionadded:: 0.22