diff --git a/rllib/BUILD b/rllib/BUILD index 221c2362b56cf..c036da9e9b8ef 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3693,7 +3693,7 @@ py_test( ) # Taking out this test for now: Mixed torch- and tf- policies within the same -# Trainer never really worked. +# Algorithm never really worked. # py_test( # name = "examples/multi_agent_two_trainers_mixed_torch_tf", # main = "examples/multi_agent_two_trainers.py", diff --git a/rllib/algorithms/a3c/tests/test_a3c.py b/rllib/algorithms/a3c/tests/test_a3c.py index 897dab6a959b0..4d34d327d7623 100644 --- a/rllib/algorithms/a3c/tests/test_a3c.py +++ b/rllib/algorithms/a3c/tests/test_a3c.py @@ -64,14 +64,14 @@ def test_a3c_entropy_coeff_schedule(self): min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=20 ) - def _step_n_times(trainer, n: int): - """Step trainer n times. + def _step_n_times(algo, n: int): + """Step Algorithm n times. Returns: learning rate at the end of the execution. """ for _ in range(n): - results = trainer.train() + results = algo.train() return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ "entropy_coeff" ] diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 29de0b01a3be5..7b26dc1139bf9 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -362,7 +362,9 @@ def __init__( # Last resort: Create core AlgorithmConfig from merged dicts. if isinstance(default_config, dict): config = AlgorithmConfig.from_dict( - config_dict=self.merge_trainer_configs(default_config, config, True) + config_dict=self.merge_algorithm_configs( + default_config, config, True + ) ) # Default config is an AlgorithmConfig -> update its properties # from the given config dict. @@ -569,17 +571,17 @@ def setup(self, config: AlgorithmConfig) -> None: ) self.config.off_policy_estimation_methods = ope_dict - # Deprecated way of implementing Trainer sub-classes (or "templates" + # Deprecated way of implementing Algorithm sub-classes (or "templates" # via the `build_trainer` utility function). # Instead, sub-classes should override the Trainable's `setup()` # method and call super().setup() from within that override at some # point. - # Old design: Override `Trainer._init`. + # Old design: Override `Algorithm._init`. _init = False try: self._init(self.config, self.env_creator) _init = True - # New design: Override `Trainable.setup()` (as indented by tune.Trainable) + # New design: Override `Algorithm.setup()` (as intended by tune.Trainable) # and do or don't call `super().setup()` from within your override. # By default, `super().setup()` will create both worker sets: # "rollout workers" for collecting samples for training and - if @@ -731,7 +733,7 @@ def setup(self, config: AlgorithmConfig) -> None: # Run `on_algorithm_init` callback after initialization is done. self.callbacks.on_algorithm_init(algorithm=self) - # TODO: Deprecated: In your sub-classes of Trainer, override `setup()` + # TODO: Deprecated: In your sub-classes of Algorithm, override `setup()` # directly and call super().setup() from within it if you would like the # default setup behavior plus some own setup logic. # If you don't need the env/workers/config/etc.. setup for you by super, @@ -755,13 +757,13 @@ def get_default_policy_class( @override(Trainable) def step(self) -> ResultDict: - """Implements the main `Trainer.train()` logic. + """Implements the main `Algorithm.train()` logic. Takes n attempts to perform a single training step. Thereby catches RayErrors resulting from worker failures.
After n attempts, fails gracefully. - Override this method in your Trainer sub-classes if you would like to + Override this method in your Algorithm sub-classes if you would like to handle worker failures yourself. Otherwise, override only `training_step()` to implement the core algorithm logic. @@ -803,7 +805,7 @@ def step(self) -> ResultDict: if not evaluate_this_iter and self.config.always_attach_evaluation_results: assert isinstance( self.evaluation_metrics, dict - ), "Trainer.evaluate() needs to return a dict." + ), "Algorithm.evaluate() needs to return a dict." results.update(self.evaluation_metrics) if hasattr(self, "workers") and isinstance(self.workers, WorkerSet): @@ -853,9 +855,6 @@ def evaluate( ) -> dict: """Evaluates current policy under `evaluation_config` settings. - Note that this default implementation does not do anything beyond - merging evaluation_config with the normal trainer config. - Args: duration_fn: An optional callable taking the already run num episodes as only arg and returning the number of @@ -902,7 +901,7 @@ def evaluate( ): raise ValueError( "Cannot evaluate w/o an evaluation worker set in " - "the Trainer or w/o an env on the local worker!\n" + "the Algorithm or w/o an env on the local worker!\n" "Try one of the following:\n1) Set " "`evaluation_interval` >= 0 to force creating a " "separate evaluation worker set.\n2) Set " @@ -1093,7 +1092,7 @@ def duration_fn(num_units_done): metrics["off_policy_estimator"][name] = avg_estimate # Evaluation does not run for every step. - # Save evaluation metrics on trainer, so it can be attached to + # Save evaluation metrics on Algorithm, so it can be attached to # subsequent step results as latest evaluation result. self.evaluation_metrics = {"evaluation": metrics} @@ -1286,7 +1285,7 @@ def remote_fn(worker): metrics["off_policy_estimator"][name] = estimates # Evaluation does not run for every step. - # Save evaluation metrics on trainer, so it can be attached to + # Save evaluation metrics on Algorithm, so it can be attached to # subsequent step results as latest evaluation result. self.evaluation_metrics = {"evaluation": metrics} @@ -1348,7 +1347,7 @@ def training_step(self) -> ResultDict: """Default single iteration logic of an algorithm. - Collect on-policy samples (SampleBatches) in parallel using the - Trainer's RolloutWorkers (@ray.remote). + Algorithm's RolloutWorkers (@ray.remote). - Concatenate collected SampleBatches into one train batch. - Note that we may have more than one policy in the multi-agent case: Call the different policies' `learn_on_batch` (simple optimizer) OR @@ -1419,10 +1418,10 @@ def training_step(self) -> ResultDict: @staticmethod def execution_plan(workers, config, **kwargs): raise NotImplementedError( - "It is not longer recommended to use Trainer's `execution_plan` method/API." + "It is no longer supported to use the `Algorithm.execution_plan()` API!" " Set `_disable_execution_plan_api=True` in your config and override the " - "`Trainer.training_step()` method with your algo's custom " - "execution logic." + "`Algorithm.training_step()` method with your algo's custom " + "execution logic instead." ) @PublicAPI @@ -1442,9 +1441,6 @@ def compute_single_action( episode: Optional[Episode] = None, unsquash_action: Optional[bool] = None, clip_action: Optional[bool] = None, - # Deprecated args. - unsquash_actions=DEPRECATED_VALUE, - clip_actions=DEPRECATED_VALUE, # Kwargs placeholder for future compatibility. 
**kwargs, ) -> Union[ @@ -1494,24 +1490,9 @@ def compute_single_action( or we have an RNN-based Policy. Raises: - KeyError: If the `policy_id` cannot be found in this Trainer's - local worker. + KeyError: If the `policy_id` cannot be found in this Algorithm's local + worker. """ - if clip_actions != DEPRECATED_VALUE: - deprecation_warning( - old="Trainer.compute_single_action(`clip_actions`=...)", - new="Trainer.compute_single_action(`clip_action`=...)", - error=True, - ) - clip_action = clip_actions - if unsquash_actions != DEPRECATED_VALUE: - deprecation_warning( - old="Trainer.compute_single_action(`unsquash_actions`=...)", - new="Trainer.compute_single_action(`unsquash_action`=...)", - error=True, - ) - unsquash_action = unsquash_actions - # `unsquash_action` is None: Use value of config['normalize_actions']. if unsquash_action is None: unsquash_action = self.config.normalize_actions @@ -1523,7 +1504,7 @@ def compute_single_action( # are all None. err_msg = ( "Provide either `input_dict` OR [`observation`, ...] as " - "args to Trainer.compute_single_action!" + "args to `Algorithm.compute_single_action()`!" ) if input_dict is not None: assert ( @@ -1537,12 +1518,12 @@ def compute_single_action( assert observation is not None, err_msg # Get the policy to compute the action for (in the multi-agent case, - # Trainer may hold >1 policies). + # Algorithm may hold >1 policies). policy = self.get_policy(policy_id) if policy is None: raise KeyError( f"PolicyID '{policy_id}' not found in PolicyMap of the " - f"Trainer's local worker!" + f"Algorithm's local worker!" ) local_worker = self.workers.local_worker() @@ -1645,8 +1626,6 @@ def compute_actions( episodes: Optional[List[Episode]] = None, unsquash_actions: Optional[bool] = None, clip_actions: Optional[bool] = None, - # Deprecated. - normalize_actions=None, **kwargs, ): """Computes an action for the specified policy on the local Worker. @@ -1688,14 +1667,6 @@ def compute_actions( the full output of policy.compute_actions_from_input_dict() if full_fetch=True or we have an RNN-based Policy. """ - if normalize_actions is not None: - deprecation_warning( - old="Trainer.compute_actions(`normalize_actions`=...)", - new="Trainer.compute_actions(`unsquash_actions`=...)", - error=True, - ) - unsquash_actions = normalize_actions - # `unsquash_actions` is None: Use value of config['normalize_actions']. if unsquash_actions is None: unsquash_actions = self.config.normalize_actions @@ -1822,8 +1793,6 @@ def add_policy( ] = None, evaluation_workers: bool = True, module_spec: Optional[SingleAgentRLModuleSpec] = None, - # Deprecated. - workers: Optional[List[Union[RolloutWorker, ActorHandle]]] = DEPRECATED_VALUE, ) -> Optional[Policy]: """Adds a new policy to this Algorithm. @@ -1861,10 +1830,6 @@ def add_policy( module_spec: In the new RLModule API we need to pass in the module_spec for the new module that is supposed to be added. Knowing the policy spec is not sufficient. - workers: A list of RolloutWorker/ActorHandles (remote - RolloutWorkers) to add this policy to. If defined, will only - add the given policy to these workers. - Returns: The newly added policy (the copy that got added to the local @@ -1872,16 +1837,6 @@ def add_policy( """ validate_policy_id(policy_id, error=True) - if workers is not DEPRECATED_VALUE: - deprecation_warning( - old="Algorithm.add_policy(.., workers=..)", - help=( - "The `workers` argument to `Algorithm.add_policy()` is deprecated! " - "Please do not use it anymore." 
- ), - error=True, - ) - self.workers.add_policy( policy_id, policy_cls, @@ -2004,7 +1959,6 @@ def export_policy_model( def export_policy_checkpoint( self, export_dir: str, - filename_prefix=DEPRECATED_VALUE, # deprecated arg, do not use anymore policy_id: PolicyID = DEFAULT_POLICY_ID, ) -> None: """Exports Policy checkpoint to a local directory and returns an AIR Checkpoint. @@ -2027,14 +1981,6 @@ def export_policy_checkpoint( >>> algo.train() # doctest: +SKIP >>> algo.export_policy_checkpoint("/tmp/export_dir") # doctest: +SKIP """ - # `filename_prefix` should not longer be used as new Policy checkpoints - # contain more than one file with a fixed filename structure. - if filename_prefix != DEPRECATED_VALUE: - deprecation_warning( - old="Algorithm.export_policy_checkpoint(filename_prefix=...)", - error=True, - ) - policy = self.get_policy(policy_id) if policy is None: raise KeyError(f"Policy with ID {policy_id} not found in Algorithm!") @@ -2161,7 +2107,8 @@ def load_checkpoint(self, checkpoint: str) -> None: def log_result(self, result: ResultDict) -> None: # Log after the callback is invoked, so that the user has a chance # to mutate the result. - # TODO: Remove `trainer` arg at some point to fully deprecate the old signature. + # TODO: Remove `algorithm` arg at some point to fully deprecate the old + # signature. self.callbacks.on_train_result(algorithm=self, result=result) # Then log according to Trainable's logging logic. Trainable.log_result(self, result) @@ -2465,7 +2412,7 @@ def get_auto_filled_metrics( return auto_filled @classmethod - def merge_trainer_configs( + def merge_algorithm_configs( cls, config1: AlgorithmConfigDict, config2: PartialAlgorithmConfigDict, @@ -2742,7 +2689,7 @@ def _checkpoint_info_to_algorithm_state( if isinstance(default_config, AlgorithmConfig): new_config = default_config.update_from_dict(state["config"]) else: - new_config = Algorithm.merge_trainer_configs( + new_config = Algorithm.merge_algorithm_configs( default_config, state["config"] ) @@ -3134,21 +3081,8 @@ def _record_usage(self, config): alg = "USER_DEFINED" record_extra_usage_tag(TagKey.RLLIB_ALGORITHM, alg) - @Deprecated(new="Algorithm.compute_single_action()", error=True) - def compute_action(self, *args, **kwargs): - return self.compute_single_action(*args, **kwargs) - - @Deprecated(new="construct WorkerSet(...) instance directly", error=True) - def _make_workers(self, *args, **kwargs): - pass - - @Deprecated(new="AlgorithmConfig.validate()", error=False) - def validate_config(self, config): - pass - - @staticmethod @Deprecated(new="AlgorithmConfig.validate()", error=True) - def _validate_config(config, trainer_or_none): + def validate_config(self, config): pass diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index ae916a8186f35..9bb08df1f7053 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -127,7 +127,7 @@ class AlgorithmConfig(_Config): ... .resources(num_gpus=0) ... .rollouts(num_rollout_workers=4) ... .callbacks(MemoryTrackingCallbacks) - >>> # A config object can be used to construct the respective Trainer. + >>> # A config object can be used to construct the respective Algorithm. >>> rllib_algo = config.build() # doctest: +SKIP Example: @@ -139,7 +139,7 @@ class AlgorithmConfig(_Config): >>> # Use `to_dict()` method to get the legacy plain python config dict >>> # for usage with `tune.Tuner().fit()`. >>> tune.Tuner( # doctest: +SKIP - ... 
"[registered trainer class]", param_space=config.to_dict() + ... "[registered Algorithm class]", param_space=config.to_dict() ... ).fit() """ @@ -234,7 +234,7 @@ def overrides(cls, **kwargs): def __init__(self, algo_class=None): # Define all settings and their default values. - # Define the default RLlib Trainer class that this AlgorithmConfig will be + # Define the default RLlib Algorithm class that this AlgorithmConfig will be # applied to. self.algo_class = algo_class @@ -1154,7 +1154,7 @@ def resources( `num_gpus_per_learner_worker` accordingly (e.g. 4 GPUs total, and model needs 2 GPUs: `num_learner_workers = 2` and `num_gpus_per_learner_worker = 2`) - num_cpus_per_learner_worker: Number of CPUs allocated per trainer worker. + num_cpus_per_learner_worker: Number of CPUs allocated per Learner worker. Only necessary for custom processing pipeline inside each Learner requiring multiple CPU cores. Ignored if `num_learner_workers = 0`. num_gpus_per_learner_worker: Number of GPUs allocated per worker. If @@ -3094,7 +3094,7 @@ def get_default_learner_class(self) -> Union[Type["Learner"], str]: Returns: The Learner class to use for this algorithm either as a class type or as - a string (e.g. ray.rllib.core.learner.testing.torch.BCTrainer). + a string (e.g. ray.rllib.core.learner.testing.torch.BC). """ raise NotImplementedError diff --git a/rllib/algorithms/alpha_star/distributed_learners.py b/rllib/algorithms/alpha_star/distributed_learners.py index e2889d4063fe8..dd6fcb03e4561 100644 --- a/rllib/algorithms/alpha_star/distributed_learners.py +++ b/rllib/algorithms/alpha_star/distributed_learners.py @@ -166,7 +166,7 @@ def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec): # Merge the policies config overrides with the main config. # Also, adjust `num_gpus` (to indicate an individual policy's # num_gpus, not the total number of GPUs). - cfg = Algorithm.merge_trainer_configs( + cfg = Algorithm.merge_algorithm_configs( self.config, dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}), ) diff --git a/rllib/algorithms/alpha_star/league_builder.py b/rllib/algorithms/alpha_star/league_builder.py index 9451110c17e92..7eb316c8a7104 100644 --- a/rllib/algorithms/alpha_star/league_builder.py +++ b/rllib/algorithms/alpha_star/league_builder.py @@ -131,7 +131,7 @@ def __init__( "these evenly amongst league- and main-exploiters)!" ) - # Build trainer's multiagent config. + # Build Algorithm's multiagent config. self.config._is_frozen = False # Make sure the multiagent config dict has no policies defined: assert self.config.policies is None, ( diff --git a/rllib/algorithms/alpha_zero/README.md b/rllib/algorithms/alpha_zero/README.md index 7486877a545a2..66f726f7602ea 100644 --- a/rllib/algorithms/alpha_zero/README.md +++ b/rllib/algorithms/alpha_zero/README.md @@ -10,7 +10,7 @@ The code is Pytorch based. It assumes that the environment is a gym environment, It should also implement a `get_state`and a `set_state` function. - The model used in AlphaZero trainer should extend `ActorCriticModel` and implement the method `compute_priors_and_value`. + The model used in the AlphaZero Algorithm should extend `ActorCriticModel` and implement the method `compute_priors_and_value`. 
## Example on CartPole diff --git a/rllib/algorithms/apex_ddpg/apex_ddpg.py b/rllib/algorithms/apex_ddpg/apex_ddpg.py index 794329bdb447c..86d251d006f55 100644 --- a/rllib/algorithms/apex_ddpg/apex_ddpg.py +++ b/rllib/algorithms/apex_ddpg/apex_ddpg.py @@ -11,7 +11,7 @@ class ApexDDPGConfig(DDPGConfig): - """Defines a configuration class from which an ApexDDPG Trainer can be built. + """Defines a configuration class from which an ApexDDPG can be built. Example: @@ -20,7 +20,7 @@ class ApexDDPGConfig(DDPGConfig): from ray.rllib.algorithms.apex_ddpg.apex_ddpg import ApexDDPGConfig config = ApexDDPGConfig().training(lr=0.01).resources(num_gpus=1) print(config.to_dict()) - # Build a Trainer object from the config and run one training iteration. + # Build an Algorithm object from the config and run one training iteration. algo = config.build(env="Pendulum-v1") algo.train() @@ -63,7 +63,8 @@ def __init__(self, algo_class=None): self.timeout_s_sampler_manager = 0.0 self.timeout_s_replay_manager = 0.0 - # Override some of Trainer/DDPG's default values with ApexDDPG-specific values. + # Override some of Algorithm/DDPG's default values with ApexDDPG-specific + # values. self.n_step = 3 self.exploration_config = {"type": "PerWorkerOrnsteinUhlenbeckNoise"} self.num_gpus = 0 diff --git a/rllib/algorithms/apex_dqn/tests/test_apex_dqn.py b/rllib/algorithms/apex_dqn/tests/test_apex_dqn.py index 22bbf9204e91f..31bbc4e10b922 100644 --- a/rllib/algorithms/apex_dqn/tests/test_apex_dqn.py +++ b/rllib/algorithms/apex_dqn/tests/test_apex_dqn.py @@ -130,7 +130,7 @@ def test_apex_lr_schedule(self): ) def _step_n_times(algo, n: int): - """Step trainer n times. + """Step Algorithm n times. Returns: learning rate at the end of the execution. diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index 1bc1bd1b0a087..342458d428c5d 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -143,7 +143,7 @@ def test_kl_coeff_changes(self): for _ in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() # Call train while results aren't returned because this is - # a asynchronous trainer and results are returned asynchronously. + # a asynchronous algorithm and results are returned asynchronously. while True: results = algo.train() if results.get("info", {}).get(LEARNER_INFO, {}).get(DEFAULT_POLICY_ID): diff --git a/rllib/algorithms/bc/bc.py b/rllib/algorithms/bc/bc.py index 97e546b5a62cf..06d4cca2759f9 100644 --- a/rllib/algorithms/bc/bc.py +++ b/rllib/algorithms/bc/bc.py @@ -4,7 +4,7 @@ class BCConfig(MARWILConfig): - """Defines a configuration class from which a new BC Trainer can be built + """Defines a configuration class from which a new BC Algorithm can be built Example: >>> from ray.rllib.algorithms.bc import BCConfig @@ -13,7 +13,7 @@ class BCConfig(MARWILConfig): >>> config = config.offline_data( # doctest: +SKIP ... input_="./rllib/tests/data/cartpole/large.json") >>> print(config.to_dict()) # doctest:+SKIP - >>> # Build a Trainer object from the config and run 1 training iteration. + >>> # Build an Algorithm object from the config and run 1 training iteration. 
>>> algo = config.build() # doctest: +SKIP >>> algo.train() # doctest: +SKIP diff --git a/rllib/algorithms/callbacks.py b/rllib/algorithms/callbacks.py index 898a6072c2f32..b74e958b0c238 100644 --- a/rllib/algorithms/callbacks.py +++ b/rllib/algorithms/callbacks.py @@ -70,7 +70,7 @@ def on_algorithm_init( the initialization is done, and before actually training starts. Args: - algorithm: Reference to the trainer instance. + algorithm: Reference to the Algorithm instance. kwargs: Forward compatibility placeholder. """ pass @@ -382,14 +382,6 @@ def on_train_result( """ pass - @Deprecated( - old="on_trainer_init(trainer, **kwargs)", - new="on_algorithm_init(algorithm, **kwargs)", - error=True, - ) - def on_trainer_init(self, *args, **kwargs): - raise DeprecationWarning - class MemoryTrackingCallbacks(DefaultCallbacks): """MemoryTrackingCallbacks can be used to trace and track memory usage @@ -668,8 +660,6 @@ def on_learn_on_batch( def on_train_result(self, *, algorithm=None, result: dict, **kwargs) -> None: for callback in self._callback_list: - # TODO: Remove `trainer` arg at some point to fully deprecate the old - # term. callback.on_train_result(algorithm=algorithm, result=result, **kwargs) return _MultiCallbacks @@ -735,9 +725,8 @@ def on_learn_on_batch( @override(DefaultCallbacks) def on_train_result(self, *, result: dict, algorithm=None, **kwargs) -> None: # TODO(gjoliver): Remove explicit _step tracking and pass - # trainer._iteration as a parameter to on_learn_on_batch() call. + # Algorithm._iteration as a parameter to on_learn_on_batch() call. RE3UpdateCallbacks._step = result["training_iteration"] - # TODO: Remove `trainer` arg at some point to fully deprecate the old term. super().on_train_result(algorithm=algorithm, result=result, **kwargs) diff --git a/rllib/algorithms/cql/cql.py b/rllib/algorithms/cql/cql.py index c4aeaae91aad5..0e06aa3d2bbbb 100644 --- a/rllib/algorithms/cql/cql.py +++ b/rllib/algorithms/cql/cql.py @@ -41,7 +41,7 @@ class CQLConfig(SACConfig): - """Defines a configuration class from which a CQL Trainer can be built. + """Defines a configuration class from which a CQL can be built. Example: >>> from ray.rllib.algorithms.cql import CQLConfig @@ -49,7 +49,7 @@ class CQLConfig(SACConfig): >>> config = config.resources(num_gpus=0) >>> config = config.rollouts(num_rollout_workers=4) >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Trainer object from the config and run 1 training iteration. + >>> # Build a Algorithm object from the config and run 1 training iteration. >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP >>> algo.train() # doctest: +SKIP """ @@ -67,7 +67,7 @@ def __init__(self, algo_class=None): self.lagrangian_thresh = 5.0 self.min_q_weight = 5.0 - # Changes to Trainer's/SACConfig's default: + # Changes to Algorithm's/SACConfig's default: # .reporting() self.min_sample_timesteps_per_iteration = 0 diff --git a/rllib/algorithms/cql/tests/test_cql.py b/rllib/algorithms/cql/tests/test_cql.py index 22130f1209584..f884c19065000 100644 --- a/rllib/algorithms/cql/tests/test_cql.py +++ b/rllib/algorithms/cql/tests/test_cql.py @@ -90,7 +90,7 @@ def test_cql_compilation(self): if fw == "tf": pol.get_session().__enter__() - # Example on how to do evaluation on the trained Trainer + # Example on how to do evaluation on the trained Algorithm # using the data from CQL's global replay buffer. # Get a sample (MultiAgentBatch). 
@@ -119,7 +119,7 @@ def test_cql_compilation(self): ) # The estimated Q-values for the new actions computed - # by our trainer policy. + # by our policy. actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0] if fw == "torch": q_values_new = cql_model.get_q_values( diff --git a/rllib/algorithms/crr/crr.py b/rllib/algorithms/crr/crr.py index 1c892ba36bdbf..d300c5b94475d 100644 --- a/rllib/algorithms/crr/crr.py +++ b/rllib/algorithms/crr/crr.py @@ -200,7 +200,7 @@ def validate(self) -> None: class CRR(Algorithm): # TODO: we have a circular dependency for get - # default config. config -> Trainer -> config + # default config. config -> Algorithm -> config # defining Config class in the same file for now as a workaround. def setup(self, config: AlgorithmConfig): diff --git a/rllib/algorithms/crr/torch/crr_torch_policy.py b/rllib/algorithms/crr/torch/crr_torch_policy.py index bb8cb769e3961..7bb018a7b06d2 100644 --- a/rllib/algorithms/crr/torch/crr_torch_policy.py +++ b/rllib/algorithms/crr/torch/crr_torch_policy.py @@ -30,7 +30,7 @@ l2_loss, ) from ray.rllib.utils.typing import ( - TrainerConfigDict, + AlgorithmConfigDict, TensorType, ) @@ -42,7 +42,7 @@ def __init__( self, observation_space: gym.spaces.Space, action_space: gym.spaces.Space, - config: TrainerConfigDict, + config: AlgorithmConfigDict, ): self.target_model = None # assign it in self.make_model diff --git a/rllib/algorithms/ddpg/ddpg.py b/rllib/algorithms/ddpg/ddpg.py index 4c82041dbaca7..d2ffb84558e83 100644 --- a/rllib/algorithms/ddpg/ddpg.py +++ b/rllib/algorithms/ddpg/ddpg.py @@ -11,13 +11,13 @@ class DDPGConfig(SimpleQConfig): - """Defines a configuration class from which a DDPG Trainer can be built. + """Defines a configuration class from which a DDPG can be built. Example: >>> from ray.rllib.algorithms.ddpg.ddpg import DDPGConfig >>> config = DDPGConfig().training(lr=0.01).resources(num_gpus=1) >>> print(config.to_dict()) # doctest: +SKIP - >>> # Build a Trainer object from the config and run one training iteration. + >>> # Build an Algorithm object from the config and run one training iteration. >>> algo = config.build(env="Pendulum-v1") # doctest: +SKIP >>> algo.train() # doctest: +SKIP diff --git a/rllib/algorithms/ddppo/ddppo.py b/rllib/algorithms/ddppo/ddppo.py index cb24f9d3979dc..4df6bc87fb78a 100644 --- a/rllib/algorithms/ddppo/ddppo.py +++ b/rllib/algorithms/ddppo/ddppo.py @@ -2,7 +2,7 @@ Decentralized Distributed PPO (DD-PPO) ====================================== -Unlike APPO or PPO, learning is no longer done centralized in the trainer +Unlike APPO or PPO, learning is no longer done centralized in the Algorithm process. Instead, gradients are computed remotely on each rollout worker and all-reduced to sync them at each mini-batch. This allows each worker's GPU to be used both for sampling and for training. @@ -147,7 +147,7 @@ def training( Args: keep_local_weights_in_sync: Download weights between each training step. This adds a bit of overhead but allows the user to access the weights - from the trainer. + from the Algorithm. torch_distributed_backend: The communication backend for PyTorch distributed.
diff --git a/rllib/algorithms/dqn/dqn.py b/rllib/algorithms/dqn/dqn.py index cef8eb27814e6..0fc557cdca2ae 100644 --- a/rllib/algorithms/dqn/dqn.py +++ b/rllib/algorithms/dqn/dqn.py @@ -37,9 +37,6 @@ NUM_AGENT_STEPS_SAMPLED, SAMPLE_TIMER, ) -from ray.rllib.utils.deprecation import ( - Deprecated, -) from ray.rllib.utils.metrics import SYNCH_WORKER_WEIGHTS_TIMER from ray.rllib.execution.common import ( LAST_TARGET_UPDATE_TS, @@ -70,8 +67,8 @@ class DQNConfig(SimpleQConfig): >>> config = config.resources(num_gpus=1) # doctest: +SKIP >>> config = config.rollouts(num_rollout_workers=3) # doctest: +SKIP >>> config = config.environment("CartPole-v1") # doctest: +SKIP - >>> trainer = DQN(config=config) # doctest: +SKIP - >>> trainer.train() # doctest: +SKIP + >>> algo = DQN(config=config) # doctest: +SKIP + >>> algo.train() # doctest: +SKIP Example: >>> from ray.rllib.algorithms.dqn.dqn import DQNConfig @@ -475,8 +472,3 @@ def training_step(self) -> ResultDict: # Return all collected metrics for the iteration. return train_results - - -@Deprecated(new="Sub-class directly from `DQN` and override its methods", error=True) -class GenericOffPolicyTrainer(SimpleQ): - pass diff --git a/rllib/algorithms/dqn/tests/test_dqn.py b/rllib/algorithms/dqn/tests/test_dqn.py index 6fa11db3f8c7c..2a8c86fb19f24 100644 --- a/rllib/algorithms/dqn/tests/test_dqn.py +++ b/rllib/algorithms/dqn/tests/test_dqn.py @@ -34,29 +34,29 @@ def test_dqn_compilation(self): for _ in framework_iterator(config, with_eager_tracing=True): # Double-dueling DQN. print("Double-dueling") - trainer = config.build() + algo = config.build() for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() # Rainbow. print("Rainbow") rainbow_config = deepcopy(config).training( num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5 ) - trainer = rainbow_config.build() + algo = rainbow_config.build() for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) + check_compute_single_action(algo) - trainer.stop() + algo.stop() def test_dqn_compilation_integer_rewards(self): """Test whether DQN can be built on all frameworks. @@ -73,29 +73,29 @@ def test_dqn_compilation_integer_rewards(self): for _ in framework_iterator(config, with_eager_tracing=True): # Double-dueling DQN. print("Double-dueling") - trainer = config.build() + algo = config.build() for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) - trainer.stop() + check_compute_single_action(algo) + algo.stop() # Rainbow. print("Rainbow") rainbow_config = deepcopy(config).training( num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5 ) - trainer = rainbow_config.build() + algo = rainbow_config.build() for i in range(num_iterations): - results = trainer.train() + results = algo.train() check_train_results(results) print(results) - check_compute_single_action(trainer) + check_compute_single_action(algo) - trainer.stop() + algo.stop() def test_dqn_exploration_and_soft_q_config(self): """Tests, whether a DQN Agent outputs exploration/softmaxed actions.""" @@ -111,59 +111,59 @@ def test_dqn_exploration_and_soft_q_config(self): # Test against all frameworks. 
for _ in framework_iterator(config): # Default EpsilonGreedy setup. - trainer = config.build() + algo = config.build() # Setting explore=False should always return the same action. - a_ = trainer.compute_single_action(obs, explore=False) + a_ = algo.compute_single_action(obs, explore=False) for _ in range(50): - a = trainer.compute_single_action(obs, explore=False) + a = algo.compute_single_action(obs, explore=False) check(a, a_) # explore=None (default: explore) should return different actions. actions = [] for _ in range(50): - actions.append(trainer.compute_single_action(obs)) + actions.append(algo.compute_single_action(obs)) check(np.std(actions), 0.0, false=True) - trainer.stop() + algo.stop() # Low softmax temperature. Behaves like argmax # (but no epsilon exploration). config.exploration( exploration_config={"type": "SoftQ", "temperature": 0.000001} ) - trainer = config.build() + algo = config.build() # Due to the low temp, always expect the same action. - actions = [trainer.compute_single_action(obs)] + actions = [algo.compute_single_action(obs)] for _ in range(50): - actions.append(trainer.compute_single_action(obs)) + actions.append(algo.compute_single_action(obs)) check(np.std(actions), 0.0, decimals=3) - trainer.stop() + algo.stop() # Higher softmax temperature. config.exploration_config["temperature"] = 1.0 - trainer = config.build() + algo = config.build() # Even with the higher temperature, if we set explore=False, we # should expect the same actions always. - a_ = trainer.compute_single_action(obs, explore=False) + a_ = algo.compute_single_action(obs, explore=False) for _ in range(50): - a = trainer.compute_single_action(obs, explore=False) + a = algo.compute_single_action(obs, explore=False) check(a, a_) # Due to the higher temp, expect different actions avg'ing # around 1.5. actions = [] for _ in range(300): - actions.append(trainer.compute_single_action(obs)) + actions.append(algo.compute_single_action(obs)) check(np.std(actions), 0.0, false=True) - trainer.stop() + algo.stop() # With Random exploration. config.exploration(exploration_config={"type": "Random"}, explore=True) - trainer = config.build() + algo = config.build() actions = [] for _ in range(300): - actions.append(trainer.compute_single_action(obs)) + actions.append(algo.compute_single_action(obs)) check(np.std(actions), 0.0, false=True) - trainer.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/dt/dt.py b/rllib/algorithms/dt/dt.py index 1ad1be3a73cc7..2c858ca99036a 100644 --- a/rllib/algorithms/dt/dt.py +++ b/rllib/algorithms/dt/dt.py @@ -81,7 +81,7 @@ def __init__(self, algo_class=None): # __sphinx_doc_end__ # fmt: on - # Overwriting the trainer config default + # Overwriting the Algorithm config default # Number of training_step calls between evaluation rollouts. 
self.min_train_timesteps_per_iteration = 5000 diff --git a/rllib/algorithms/dt/dt_torch_policy.py b/rllib/algorithms/dt/dt_torch_policy.py index b7f06e299ebef..08268317209b6 100644 --- a/rllib/algorithms/dt/dt_torch_policy.py +++ b/rllib/algorithms/dt/dt_torch_policy.py @@ -33,7 +33,7 @@ from ray.rllib.utils.threading import with_lock from ray.rllib.utils.torch_utils import apply_grad_clipping from ray.rllib.utils.typing import ( - TrainerConfigDict, + AlgorithmConfigDict, TensorType, TensorStructType, TensorShape, @@ -51,7 +51,7 @@ def __init__( self, observation_space: gym.spaces.Space, action_space: gym.spaces.Space, - config: TrainerConfigDict, + config: AlgorithmConfigDict, ): LearningRateSchedule.__init__( self, diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 4403936aab156..9bd59b19dbd92 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -840,7 +840,7 @@ def default_resource_request( # TODO(avnishn): Remove this once we have a way to extend placement group # factories. if cf._enable_learner_api: - # resources for the trainer + # Resources for the Algorithm. if cf.num_learner_workers == 0: # if num_learner_workers is 0, then we need to allocate one gpu if # num_gpus_per_learner_worker is greater than 0. diff --git a/rllib/algorithms/leela_chess_zero/README.md b/rllib/algorithms/leela_chess_zero/README.md index 1ec9d7b724a7f..0902c1f23c2c8 100644 --- a/rllib/algorithms/leela_chess_zero/README.md +++ b/rllib/algorithms/leela_chess_zero/README.md @@ -11,7 +11,7 @@ It assumes that the environment is a MultiAgent Chess environment, that has a di It should also implement a `get_state`and a `set_state` function, used in the MCTS implementation. - The model used in AlphaZero trainer should extend `TorchModelV2` and implement the method `compute_priors_and_value`. + The model used in the AlphaZero Algorithm should extend `TorchModelV2` and implement the method `compute_priors_and_value`. ## Example on Chess diff --git a/rllib/algorithms/marwil/marwil.py b/rllib/algorithms/marwil/marwil.py index 24ff8b9532ebc..d0e8f42b2f531 100644 --- a/rllib/algorithms/marwil/marwil.py +++ b/rllib/algorithms/marwil/marwil.py @@ -83,7 +83,7 @@ def __init__(self, algo_class=None): # Override some of AlgorithmConfig's default values with MARWIL-specific values. # You should override input_ to point to an offline dataset - # (see trainer.py and trainer_config.py). + # (see algorithm.py and algorithm_config.py). # The dataset may have an arbitrary number of timesteps # (and even episodes) per line. # However, each line must only contain consecutive timesteps in diff --git a/rllib/algorithms/marwil/tests/test_marwil.py b/rllib/algorithms/marwil/tests/test_marwil.py index e514b1001f20f..ac69b5f0bbb90 100644 --- a/rllib/algorithms/marwil/tests/test_marwil.py +++ b/rllib/algorithms/marwil/tests/test_marwil.py @@ -93,7 +93,7 @@ def test_marwil_compilation_and_learning_from_offline_file(self): algo.stop() def test_marwil_cont_actions_from_offline_file(self): - """Test whether MARWILTrainer runs with cont. actions. + """Test whether MARWIL runs with cont. actions. Learns from a historic-data file. 
To generate this data, first run: diff --git a/rllib/algorithms/mbmpo/mbmpo_torch_policy.py b/rllib/algorithms/mbmpo/mbmpo_torch_policy.py index 112e65cd24609..891f33a581309 100644 --- a/rllib/algorithms/mbmpo/mbmpo_torch_policy.py +++ b/rllib/algorithms/mbmpo/mbmpo_torch_policy.py @@ -43,7 +43,7 @@ def make_model_and_action_dist( Args: obs_space (gym.spaces.Space): The observation space. action_space (gym.spaces.Space): The action space. - config: The SAC trainer's config dict. + config: The SACConfig object. Returns: ModelV2: The ModelV2 to be used by the Policy. Note: An additional diff --git a/rllib/algorithms/mock.py b/rllib/algorithms/mock.py index ae885d96679bb..ad4a9867f9c86 100644 --- a/rllib/algorithms/mock.py +++ b/rllib/algorithms/mock.py @@ -10,7 +10,7 @@ class _MockTrainer(Algorithm): - """Mock trainer for use in tests.""" + """Mock Algorithm for use in tests.""" @classmethod @override(Algorithm) @@ -88,7 +88,7 @@ def get_info(self, sess=None): class _SigmoidFakeData(_MockTrainer): - """Trainer that returns sigmoid learning curves. + """Algorithm that returns sigmoid learning curves. This can be helpful for evaluating early stopping algorithms.""" @@ -145,10 +145,10 @@ def step(self): def _algorithm_import_failed(trace): """Returns dummy Algorithm class for if PyTorch etc. is not installed.""" - class _TrainerImportFailed(Algorithm): - _name = "TrainerImportFailed" + class _AlgorithmImportFailed(Algorithm): + _name = "AlgorithmImportFailed" def setup(self, config): raise ImportError(trace) - return _TrainerImportFailed + return _AlgorithmImportFailed diff --git a/rllib/algorithms/pg/pg.py b/rllib/algorithms/pg/pg.py index ac24e8af26951..b897b092406af 100644 --- a/rllib/algorithms/pg/pg.py +++ b/rllib/algorithms/pg/pg.py @@ -104,9 +104,9 @@ def validate(self) -> None: class PG(Algorithm): - """Policy Gradient (PG) Trainer. + """Policy Gradient (PG) Algorithm. - Defines the distributed Trainer class for policy gradients. + Defines the distributed Algorithm class for policy gradients. See `pg_[tf|torch]_policy.py` for the definition of the policy losses for TensorFlow and PyTorch. diff --git a/rllib/algorithms/pg/pg_tf_policy.py b/rllib/algorithms/pg/pg_tf_policy.py index 06881a8a72cb6..7e967b821b16b 100644 --- a/rllib/algorithms/pg/pg_tf_policy.py +++ b/rllib/algorithms/pg/pg_tf_policy.py @@ -37,7 +37,7 @@ def get_pg_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type: base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. Returns: - A TF Policy to be used with PGTrainer. + A TF Policy to be used with PG. """ class PGTFPolicy( diff --git a/rllib/algorithms/pg/pg_torch_policy.py b/rllib/algorithms/pg/pg_torch_policy.py index a04f4834ca4ed..84ccc10a0b827 100644 --- a/rllib/algorithms/pg/pg_torch_policy.py +++ b/rllib/algorithms/pg/pg_torch_policy.py @@ -26,7 +26,7 @@ class PGTorchPolicy(LearningRateSchedule, TorchPolicyV2): - """PyTorch policy class used with PGTrainer.""" + """PyTorch policy class used with PG.""" def __init__(self, observation_space, action_space, config: PGConfig): diff --git a/rllib/algorithms/pg/tests/test_pg.py b/rllib/algorithms/pg/tests/test_pg.py index 5acdbbe6b7a3f..477525fa1731d 100644 --- a/rllib/algorithms/pg/tests/test_pg.py +++ b/rllib/algorithms/pg/tests/test_pg.py @@ -203,7 +203,7 @@ def test_pg_lr(self): ) def _step_n_times(algo, n: int): - """Step trainer n times. + """Step Algorithm n times. Returns: learning rate at the end of the execution. 
diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index d435e469b23ce..76e0e2c2996e1 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -386,7 +386,7 @@ def update(pi, pi_id): else: logger.warning("No data for {}, not updating kl".format(pi_id)) - # Update KL on all trainable policies within the local (trainer) + # Update KL on all trainable policies within the local (training) # Worker. self.workers.local_worker().foreach_policy_to_train(update) @@ -439,11 +439,10 @@ def training_step(self) -> ResultDict: # Train if self.config._enable_learner_api: # TODO (Kourosh) Clearly define what train_batch_size - # vs. sgd_minibatch_size and num_sgd_iter is in the config. - # TODO (Kourosh) Do this inside the RL Trainer so - # that we don't have to do this back and forth - # communication between driver and the remote - # trainer workers + # vs. sgd_minibatch_size and num_sgd_iter is in the config. + # TODO (Kourosh) Do this inside the Learner so that we don't have to do + # this back and forth communication between driver and the remote + # learner actors. is_module_trainable = self.workers.local_worker().is_policy_to_train self.learner_group.set_is_module_trainable(is_module_trainable) train_results = self.learner_group.update( diff --git a/rllib/algorithms/ppo/tests/test_ppo.py b/rllib/algorithms/ppo/tests/test_ppo.py index 3ee15f78229b5..51fc37fd346f6 100644 --- a/rllib/algorithms/ppo/tests/test_ppo.py +++ b/rllib/algorithms/ppo/tests/test_ppo.py @@ -283,9 +283,9 @@ def test_ppo_exploration_setup(self): # Test against all frameworks. for fw in framework_iterator(config): # Default Agent should be setup with StochasticSampling. - trainer = config.build() + algo = config.build() # explore=False, always expect the same (deterministic) action. - a_ = trainer.compute_single_action( + a_ = algo.compute_single_action( obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0) ) @@ -293,14 +293,14 @@ def test_ppo_exploration_setup(self): # TODO (Kourosh): Only meaningful in the ModelV2 stack. config.validate() if not config._enable_rl_module_api and fw != "tf": - last_out = trainer.get_policy().model.last_output() + last_out = algo.get_policy().model.last_output() if fw == "torch": check(a_, np.argmax(last_out.detach().cpu().numpy(), 1)[0]) else: check(a_, np.argmax(last_out.numpy(), 1)[0]) for _ in range(50): - a = trainer.compute_single_action( + a = algo.compute_single_action( obs, explore=False, prev_action=np.array(2), @@ -312,12 +312,12 @@ def test_ppo_exploration_setup(self): actions = [] for _ in range(300): actions.append( - trainer.compute_single_action( + algo.compute_single_action( obs, prev_action=np.array(2), prev_reward=np.array(1.0) ) ) check(np.mean(actions), 1.5, atol=0.2) - trainer.stop() + algo.stop() def test_ppo_free_log_std(self): """Tests the free log std option works. @@ -347,8 +347,8 @@ def test_ppo_free_log_std(self): ) for fw, sess in framework_iterator(config, session=True): - trainer = config.build() - policy = trainer.get_policy() + algo = config.build() + policy = algo.get_policy() # Check the free log std var is created. if fw == "torch": @@ -384,7 +384,7 @@ def get_value(fw=fw, policy=policy, log_std_var=log_std_var): # Check the variable is updated. post_std = get_value() assert post_std != 0.0, post_std - trainer.stop() + algo.stop() def test_ppo_loss_function(self): """Tests the PPO loss function math. 
@@ -412,8 +412,8 @@ def test_ppo_loss_function(self): ) for fw, sess in framework_iterator(config, session=True): - trainer = config.build() - policy = trainer.get_policy() + algo = config.build() + policy = algo.get_policy() # Check no free log std var by default. if fw == "torch": @@ -512,7 +512,7 @@ def test_ppo_loss_function(self): check(policy._mean_policy_loss, np.mean(-pg_loss)) check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4) check(policy._total_loss, overall_loss, decimals=4) - trainer.stop() + algo.stop() def _ppo_loss_helper( self, policy, model, dist_class, train_batch, logits, vf_outs, sess=None diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 1eb8e082c1025..ebfb9eb7e7949 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -200,7 +200,7 @@ def test_kl_coeff_changes(self): for _ in framework_iterator(config, ("torch", "tf2"), with_eager_tracing=True): algo = config.build() # Call train while results aren't returned because this is - # a asynchronous trainer and results are returned asynchronously. + # an asynchronous Algorithm and results are returned asynchronously. curr_kl_coeff_1 = None curr_kl_coeff_2 = None while not curr_kl_coeff_1 or not curr_kl_coeff_2: diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py index 938e285b1eb32..de6dbe61aa6fe 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py @@ -50,7 +50,7 @@ def get_expected_module_config( def dummy_torch_ppo_loss(module, batch, fwd_out): """Dummy PPO loss function for testing purposes. - Will eventually use the actual PPO loss function implemented in the PPOTfTrainer. + Will eventually use the actual PPO loss function implemented in PPO. Args: batch: SampleBatch used for training. @@ -78,7 +78,7 @@ def dummy_torch_ppo_loss(module, batch, fwd_out): def dummy_tf_ppo_loss(module, batch, fwd_out): """Dummy PPO loss function for testing purposes. - Will eventually use the actual PPO loss function implemented in the PPOTfTrainer. + Will eventually use the actual PPO loss function implemented in PPO. Args: module: PPOTfRLModule diff --git a/rllib/algorithms/registry.py b/rllib/algorithms/registry.py index 5387420cc5230..aa39512a827a3 100644 --- a/rllib/algorithms/registry.py +++ b/rllib/algorithms/registry.py @@ -309,7 +309,7 @@ def get_algorithm_class( alg: str, return_config=False, ) -> Union[Type["Algorithm"], Tuple[Type["Algorithm"], "AlgorithmConfig"]]: - """Returns the class of a known Trainer given its name.""" + """Returns the class of a known Algorithm given its name.""" try: return _get_algorithm_class(alg, return_config=return_config) @@ -323,10 +323,6 @@ def get_algorithm_class( return class_ -# Backward compat alias. -get_trainer_class = get_algorithm_class - - def _get_algorithm_class(alg: str) -> type: # This helps us get around a circular import (tune calls rllib._register_all when # checking if a rllib Trainable is registered) diff --git a/rllib/algorithms/sac/sac_tf_policy.py b/rllib/algorithms/sac/sac_tf_policy.py index a2a72cd96f563..3a3072986e159 100644 --- a/rllib/algorithms/sac/sac_tf_policy.py +++ b/rllib/algorithms/sac/sac_tf_policy.py @@ -61,7 +61,7 @@ def build_sac_model( policy: The TFPolicy that will use the models. obs_space (gym.spaces.Space): The observation space. action_space (gym.spaces.Space): The action space.
- config: The SAC trainer's config dict. + config: The SACConfig object. Returns: ModelV2: The ModelV2 to be used by the Policy. Note: An additional diff --git a/rllib/algorithms/sac/sac_torch_policy.py b/rllib/algorithms/sac/sac_torch_policy.py index aa79c7b7bd50b..eebcc18d3a223 100644 --- a/rllib/algorithms/sac/sac_torch_policy.py +++ b/rllib/algorithms/sac/sac_torch_policy.py @@ -99,7 +99,7 @@ def build_sac_model_and_action_dist( policy: The TFPolicy that will use the models. obs_space (gym.spaces.Space): The observation space. action_space (gym.spaces.Space): The action space. - config: The SAC trainer's config dict. + config: The SACConfig object. Returns: ModelV2: The ModelV2 to be used by the Policy. Note: An additional diff --git a/rllib/algorithms/simple_q/simple_q_tf_policy.py b/rllib/algorithms/simple_q/simple_q_tf_policy.py index e892d5ebc959f..e899ec9c3c3ab 100644 --- a/rllib/algorithms/simple_q/simple_q_tf_policy.py +++ b/rllib/algorithms/simple_q/simple_q_tf_policy.py @@ -39,7 +39,7 @@ def get_simple_q_tf_policy( base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. Returns: - A TF Policy to be used with MAMLTrainer. + A TF Policy to be used with MAML. """ class SimpleQTFPolicy(LearningRateSchedule, TargetNetworkMixin, base): diff --git a/rllib/algorithms/simple_q/simple_q_torch_policy.py b/rllib/algorithms/simple_q/simple_q_torch_policy.py index 091d346f8344c..a0d9921230bf8 100644 --- a/rllib/algorithms/simple_q/simple_q_torch_policy.py +++ b/rllib/algorithms/simple_q/simple_q_torch_policy.py @@ -31,7 +31,7 @@ class SimpleQTorchPolicy( TargetNetworkMixin, TorchPolicyV2, ): - """PyTorch policy class used with SimpleQTrainer.""" + """PyTorch policy class used with SimpleQ.""" def __init__(self, observation_space, action_space, config): TorchPolicyV2.__init__( diff --git a/rllib/algorithms/simple_q/tests/test_simple_q.py b/rllib/algorithms/simple_q/tests/test_simple_q.py index 18ba913bd6ebe..b44c188c96c88 100644 --- a/rllib/algorithms/simple_q/tests/test_simple_q.py +++ b/rllib/algorithms/simple_q/tests/test_simple_q.py @@ -70,8 +70,8 @@ def test_simple_q_loss_function(self): for fw in framework_iterator(config): # Generate Algorithm and get its default Policy object. - trainer = config.build() - policy = trainer.get_policy() + algo = config.build() + policy = algo.get_policy() # Batch of size=2. input_ = SampleBatch( { @@ -168,7 +168,7 @@ def test_simple_q_lr_schedule(self): config.training(lr=0.2, lr_schedule=[[0, 0.2], [500, 0.001]]) def _step_n_times(algo, n: int): - """Step trainer n times. + """Step Algorithm n times. Returns: learning rate at the end of the execution. diff --git a/rllib/connectors/util.py b/rllib/connectors/util.py index 565995aaea144..faa39d7982f1e 100644 --- a/rllib/connectors/util.py +++ b/rllib/connectors/util.py @@ -17,17 +17,17 @@ MeanStdObservationFilterAgentConnector, ConcurrentMeanStdObservationFilterAgentConnector, ) -from ray.rllib.utils.typing import TrainerConfigDict from ray.util.annotations import PublicAPI, DeveloperAPI from ray.rllib.connectors.agent.synced_filter import SyncedFilterAgentConnector if TYPE_CHECKING: + from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.policy.policy import Policy logger = logging.getLogger(__name__) -def __preprocessing_enabled(config: TrainerConfigDict): +def __preprocessing_enabled(config: "AlgorithmConfig"): if config._disable_preprocessor_api: return False # Same conditions as in RolloutWorker.__init__. 
@@ -38,7 +38,7 @@ def __preprocessing_enabled(config: TrainerConfigDict): return True -def __clip_rewards(config: TrainerConfigDict): +def __clip_rewards(config: "AlgorithmConfig"): # Same logic as in RolloutWorker.__init__. # We always clip rewards for Atari games. return config.clip_rewards or config.is_atari @@ -47,7 +47,7 @@ def __clip_rewards(config: TrainerConfigDict): @PublicAPI(stability="alpha") def get_agent_connectors_from_config( ctx: ConnectorContext, - config: TrainerConfigDict, + config: "AlgorithmConfig", ) -> AgentConnectorPipeline: connectors = [] @@ -81,13 +81,13 @@ def get_agent_connectors_from_config( @PublicAPI(stability="alpha") def get_action_connectors_from_config( ctx: ConnectorContext, - config: TrainerConfigDict, + config: "AlgorithmConfig", ) -> ActionConnectorPipeline: """Default list of action connectors to use for a new policy. Args: ctx: context used to create connectors. - config: trainer config. + config: The AlgorithmConfig object. """ connectors = [ConvertToNumpyConnector(ctx)] if config.get("normalize_actions", False): @@ -99,12 +99,12 @@ def get_action_connectors_from_config( @PublicAPI(stability="alpha") -def create_connectors_for_policy(policy: "Policy", config: TrainerConfigDict): +def create_connectors_for_policy(policy: "Policy", config: "AlgorithmConfig"): """Util to create agent and action connectors for a Policy. Args: policy: Policy instance. - config: Trainer config dict. + config: Algorithm config dict. """ ctx: ConnectorContext = ConnectorContext.from_policy(policy) diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 1a329d2a978e5..a240ad7df83cb 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -987,7 +987,7 @@ def additional_update( timestep: int, **kwargs, ) -> Mapping[ModuleID, Any]: - """Apply additional non-gradient based updates to this Trainer. + """Apply additional non-gradient based updates to this Algorithm. For example, this could be used to do a polyak averaging update of a target network in off policy algorithms like SAC or DQN. diff --git a/rllib/core/learner/learner_group_config.py b/rllib/core/learner/learner_group_config.py index 350ba282950e3..69d74a4ffd9b9 100644 --- a/rllib/core/learner/learner_group_config.py +++ b/rllib/core/learner/learner_group_config.py @@ -69,7 +69,7 @@ def validate(self) -> None: if self.learner_class is None: raise ValueError( "Cannot initialize an Learner without an Learner class. Please provide " - "the Learner class with .learner(learner_class=MyTrainerClass)." + "the Learner class with .learner(learner_class=MyLearnerClass)." ) def build(self) -> LearnerGroup: diff --git a/rllib/env/env_context.py b/rllib/env/env_context.py index 922f19bada25a..a25d8e9e213a3 100644 --- a/rllib/env/env_context.py +++ b/rllib/env/env_context.py @@ -29,7 +29,7 @@ def __init__( Args: env_config: The env's configuration defined under the - "env_config" key in the Trainer's config. + "env_config" key in the Algorithm's config. worker_index: When there are multiple workers created, this uniquely identifies the worker the env is created in. 0 for local worker, >0 for remote workers. @@ -42,7 +42,7 @@ def __init__( 0 if only a local worker exists. recreated_worker: Whether the worker that holds this env is a recreated one. This means that it replaced a previous (failed) worker when - `recreate_failed_workers=True` in the Trainer's config. + `recreate_failed_workers=True` in the Algorithm's config. """ # Store the env_config in the (super) dict. 
dict.__init__(self, env_config) @@ -78,7 +78,7 @@ def copy_with_overrides( the one from the source (self). recreated_worker: Optional flag, indicating, whether the worker that holds the env is a recreated one. This means that it replaced a previous - (failed) worker when `recreate_failed_workers=True` in the Trainer's + (failed) worker when `recreate_failed_workers=True` in the Algorithm's config. Returns: diff --git a/rllib/env/external_env.py b/rllib/env/external_env.py index 11dbdd2d21fdf..5a2b9c618f0d1 100644 --- a/rllib/env/external_env.py +++ b/rllib/env/external_env.py @@ -44,9 +44,9 @@ class ExternalEnv(threading.Thread): >>> YourExternalEnv = ... # doctest: +SKIP >>> register_env("my_env", # doctest: +SKIP ... lambda config: YourExternalEnv(config)) - >>> trainer = DQN(env="my_env") # doctest: +SKIP + >>> algo = DQN(env="my_env") # doctest: +SKIP >>> while True: # doctest: +SKIP - >>> print(trainer.train()) # doctest: +SKIP + >>> print(algo.train()) # doctest: +SKIP """ @PublicAPI diff --git a/rllib/env/tests/test_policy_client_server_setup.sh b/rllib/env/tests/test_policy_client_server_setup.sh index a3d97cb5ca721..5ee25921f54a8 100755 --- a/rllib/env/tests/test_policy_client_server_setup.sh +++ b/rllib/env/tests/test_policy_client_server_setup.sh @@ -17,20 +17,20 @@ if [ "$2" == "cartpole" ]; then server_script=cartpole_server.py client_script=cartpole_client.py stop_criterion="--stop-reward=150.0" - trainer_cls="PPO" + algo_cls="PPO" use_lstm="" elif [ "$2" == "cartpole_lstm" ]; then server_script=cartpole_server.py client_script=cartpole_client.py stop_criterion="--stop-reward=150.0" - trainer_cls="IMPALA" + algo_cls="IMPALA" use_lstm="--use-lstm" # Unity3D dummy setup. elif [ "$2" == "unity3d" ]; then server_script=unity3d_server.py client_script=unity3d_dummy_client.py stop_criterion="--num-episodes=10" - trainer_cls="PPO" + algo_cls="PPO" use_lstm="" # CartPole dummy test using 2 simultaneous episodes on the client. # One episode has training_enabled=False (its data should NOT arrive at server). @@ -38,7 +38,7 @@ else server_script=cartpole_server.py client_script=dummy_client_with_two_episodes.py stop_criterion="--dummy-arg=dummy" # no stop criterion: client script terminates either way - trainer_cls="PPO" + algo_cls="PPO" use_lstm="" fi @@ -61,7 +61,7 @@ fi # connections). # Do not attempt to restore from checkpoint; leads to errors on travis. # shellcheck disable=SC2086 -(python $basedir/$server_script --run="$trainer_cls" --num-workers=2 $use_lstm --no-restore --port=$worker_1_port 2>&1 | grep -v 200) & +(python $basedir/$server_script --run="$algo_cls" --num-workers=2 $use_lstm --no-restore --port=$worker_1_port 2>&1 | grep -v 200) & server_pid=$! echo "Waiting for server to start ..." diff --git a/rllib/env/wrappers/recsim.py b/rllib/env/wrappers/recsim.py index 988e3e46cc610..5da8288321c92 100644 --- a/rllib/env/wrappers/recsim.py +++ b/rllib/env/wrappers/recsim.py @@ -225,7 +225,7 @@ def make_recsim_env( reward). Returns: - An RLlib-ready gym.Env class to use inside a Trainer. + An RLlib-ready gym.Env class to use inside an Algorithm. 
""" class _RecSimEnv(gym.Wrapper): diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index d6a4ca253d3cf..4244f95b1d506 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -68,11 +68,7 @@ from ray.rllib.utils import check_env, force_list from ray.rllib.utils.annotations import DeveloperAPI, override from ray.rllib.utils.debug import summarize, update_global_seed_if_necessary -from ray.rllib.utils.deprecation import ( - DEPRECATED_VALUE, - Deprecated, - deprecation_warning, -) +from ray.rllib.utils.deprecation import DEPRECATED_VALUE, deprecation_warning from ray.rllib.utils.error import ERR_MSG_NO_GPUS, HOWTO_CHANGE_CONFIG from ray.rllib.utils.filter import Filter, NoFilter, get_filter from ray.rllib.utils.framework import try_import_tf, try_import_torch @@ -407,7 +403,7 @@ def gen_rollouts(): if not self.config.disable_env_checking: check_env(self.env, self.config) # Custom validation function given, typically a function attribute of the - # algorithm trainer. + # Algorithm. if validate_env is not None: validate_env(self.env, self.env_context) @@ -2048,34 +2044,3 @@ def _make_sub_env_remote(vector_index): else: return _make_sub_env_local - - @Deprecated( - new="Trainer.get_policy().export_model([export_dir], [onnx]?)", error=True - ) - def export_policy_model(self, *args, **kwargs): - pass - - @Deprecated( - new="Trainer.get_policy().import_model_from_h5([import_file])", error=True - ) - def import_policy_model_from_h5(self, *args, **kwargs): - pass - - @Deprecated( - new="Trainer.get_policy().export_checkpoint([export_dir], [filename]?)", - error=True, - ) - def export_policy_checkpoint(self, *args, **kwargs): - pass - - @Deprecated(new="RolloutWorker.foreach_policy_to_train", error=True) - def foreach_trainable_policy(self, func, **kwargs): - pass - - @Deprecated(new="state_dict = RolloutWorker.get_state()", error=True) - def save(self): - pass - - @Deprecated(new="RolloutWorker.set_state([state_dict])", error=True) - def restore(self, objs): - pass diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 100b815d2b621..dbb28bb3a5d04 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -94,9 +94,6 @@ def __init__( local_worker: bool = True, logdir: Optional[str] = None, _setup: bool = True, - # deprecated args. - policy_class=DEPRECATED_VALUE, - trainer_config=DEPRECATED_VALUE, ): """Initializes a WorkerSet instance. @@ -118,19 +115,6 @@ def __init__( logdir: Optional logging directory for workers. _setup: Whether to setup workers. This is only for testing. """ - if policy_class != DEPRECATED_VALUE: - deprecation_warning( - old="WorkerSet(policy_class=..)", - new="WorkerSet(default_policy_class=..)", - error=True, - ) - if trainer_config != DEPRECATED_VALUE: - deprecation_warning( - old="WorkerSet(trainer_config=..)", - new="WorkerSet(config=..)", - error=True, - ) - from ray.rllib.algorithms.algorithm_config import AlgorithmConfig # Make sure `config` is an AlgorithmConfig object. diff --git a/rllib/examples/action_masking.py b/rllib/examples/action_masking.py index b9df2c5f84ec3..622b280e3f1e4 100644 --- a/rllib/examples/action_masking.py +++ b/rllib/examples/action_masking.py @@ -181,7 +181,7 @@ def get_cli_args(): print(f"Obs: {obs}, Action: {action}") obs = next_obs - # run with tune for auto trainer creation, stopping, TensorBoard, etc. + # Run with tune for auto Algorithm creation, stopping, TensorBoard, etc. 
else: tuner = tune.Tuner( args.run, diff --git a/rllib/examples/bandit/tune_lin_ts_train_wheel_env.py b/rllib/examples/bandit/tune_lin_ts_train_wheel_env.py index 4f8ac7b20e5af..d4d71ce29afa0 100644 --- a/rllib/examples/bandit/tune_lin_ts_train_wheel_env.py +++ b/rllib/examples/bandit/tune_lin_ts_train_wheel_env.py @@ -87,14 +87,14 @@ def plot_model_weights(means, covs, ax): ax1.set_title("Episode reward mean") ax1.set_xlabel("Training steps") - # Restore trainer from checkpoint + # Restore Algorithm from checkpoint checkpoint = results.get_best_result().checkpoint - trainer = config.build() + algo = config.build() with checkpoint.as_directory() as checkpoint_dir: - trainer.restore(checkpoint_dir) + algo.restore(checkpoint_dir) # Get model to plot arm weights distribution - model = trainer.get_policy().model + model = algo.get_policy().model means = [model.arms[i].theta.numpy() for i in range(5)] covs = [model.arms[i].covariance.numpy() for i in range(5)] diff --git a/rllib/examples/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/connectors/self_play_with_policy_checkpoint.py index 7692846ba297c..abe7ab9f38290 100644 --- a/rllib/examples/connectors/self_play_with_policy_checkpoint.py +++ b/rllib/examples/connectors/self_play_with_policy_checkpoint.py @@ -51,7 +51,7 @@ def on_algorithm_init(self, *, algorithm, **kwargs): self._checkpoint_dir, policy_ids=[OPPONENT_POLICY_ID] ) - # Add restored policy to trainer. + # Add restored policy to Algorithm. # Note that this policy doesn't have to be trained with the same algorithm # of the training stack. You can even mix up TF policies with a Torch stack. algorithm.add_policy( diff --git a/rllib/examples/custom_logger.py b/rllib/examples/custom_logger.py index b02c3a30197bc..3c2a017c25bf8 100644 --- a/rllib/examples/custom_logger.py +++ b/rllib/examples/custom_logger.py @@ -1,7 +1,6 @@ """ This example script demonstrates how one can define a custom logger -object for any RLlib Trainer via the Trainer's config dict's -"logger_config" key. +object for any RLlib Algorithm via the Algorithm's config's `logger_config` property. By default (logger_config=None), RLlib will construct a tune UnifiedLogger object, which logs JSON, CSV, and TBX output. diff --git a/rllib/examples/env/bandit_envs_recommender_system.py b/rllib/examples/env/bandit_envs_recommender_system.py index 584f29b55f394..05a29082a0d33 100644 --- a/rllib/examples/env/bandit_envs_recommender_system.py +++ b/rllib/examples/env/bandit_envs_recommender_system.py @@ -1,6 +1,6 @@ -"""Examples for recommender system simulating envs ready to be used by - RLlib Trainers. - This env follows RecSim obs and action APIs. +"""Examples for recommender system simulating envs ready to be used by RLlib Algorithms. + +This env follows RecSim obs and action APIs. """ import gymnasium as gym import numpy as np diff --git a/rllib/examples/env/recommender_system_envs_with_recsim.py b/rllib/examples/env/recommender_system_envs_with_recsim.py index 8b97057a8d963..f2f7a28e4b39c 100644 --- a/rllib/examples/env/recommender_system_envs_with_recsim.py +++ b/rllib/examples/env/recommender_system_envs_with_recsim.py @@ -1,4 +1,4 @@ -"""Examples for RecSim envs ready to be used by RLlib Trainers +"""Examples for RecSim envs ready to be used by RLlib Algorithms. RecSim is a configurable recommender systems simulation platform. 
Source: https://github.com/google-research/recsim diff --git a/rllib/examples/export/onnx_tf.py b/rllib/examples/export/onnx_tf.py index 78f3efa725d89..44c35001aac40 100644 --- a/rllib/examples/export/onnx_tf.py +++ b/rllib/examples/export/onnx_tf.py @@ -21,7 +21,7 @@ args = parser.parse_args() - # Configure our PPO trainer + # Configure our PPO Algorithm. config = ( ppo.PPOConfig() .rollouts(num_rollout_workers=1) diff --git a/rllib/examples/export/onnx_torch.py b/rllib/examples/export/onnx_torch.py index 3438d51840b76..b3d99a1495d18 100644 --- a/rllib/examples/export/onnx_torch.py +++ b/rllib/examples/export/onnx_torch.py @@ -8,7 +8,7 @@ import torch if __name__ == "__main__": - # Configure our PPO trainer + # Configure our PPO Algorithm. config = ( ppo.PPOConfig() .rollouts(num_rollout_workers=1) diff --git a/rllib/examples/inference_and_serving/policy_inference_after_training.py b/rllib/examples/inference_and_serving/policy_inference_after_training.py index 17f033847ec1c..51a83fc53154d 100644 --- a/rllib/examples/inference_and_serving/policy_inference_after_training.py +++ b/rllib/examples/inference_and_serving/policy_inference_after_training.py @@ -92,7 +92,7 @@ ) results = tuner.fit() - print("Training completed. Restoring new Trainer for action inference.") + print("Training completed. Restoring new Algorithm for action inference.") # Get the last checkpoint from the above training run. checkpoint = results.get_best_result().checkpoint # Create new Algorithm and restore its state from the last checkpoint. diff --git a/rllib/examples/inference_and_serving/policy_inference_after_training_with_attention.py b/rllib/examples/inference_and_serving/policy_inference_after_training_with_attention.py index 61ef4dd4f8e1f..1ac6b652c0764 100644 --- a/rllib/examples/inference_and_serving/policy_inference_after_training_with_attention.py +++ b/rllib/examples/inference_and_serving/policy_inference_after_training_with_attention.py @@ -125,10 +125,10 @@ ) results = tuner.fit() - print("Training completed. Restoring new Trainer for action inference.") + print("Training completed. Restoring new Algorithm for action inference.") # Get the last checkpoint from the above training run. checkpoint = results.get_best_result().checkpoint - # Create new Trainer and restore its state from the last checkpoint. + # Create new Algorithm and restore its state from the last checkpoint. algo = Algorithm.from_checkpoint(checkpoint) # Create the env to do inference in. diff --git a/rllib/examples/inference_and_serving/serve_and_rllib.py b/rllib/examples/inference_and_serving/serve_and_rllib.py index bc05efe3d1283..500bdfc252889 100644 --- a/rllib/examples/inference_and_serving/serve_and_rllib.py +++ b/rllib/examples/inference_and_serving/serve_and_rllib.py @@ -34,7 +34,7 @@ class ServeRLlibPolicy: """Callable class used by Ray Serve to handle async requests. All the necessary serving logic is implemented in here: - - Creation and restoring of the (already trained) RLlib Trainer. + - Creation and restoring of the (already trained) RLlib Algorithm. - Calls to algo.compute_action upon receiving an action request (with a current observation). """ @@ -56,7 +56,7 @@ async def __call__(self, request: Request): def train_rllib_policy(config: AlgorithmConfig): """Trains a DQN on ALE/MsPacman-v5 for n iterations. - Saves the trained Trainer to disk and returns the checkpoint path. + Saves the trained Algorithm to disk and returns the checkpoint path. Args: config: The algo config object for the Algorithm. 
@@ -77,7 +77,7 @@ def train_rllib_policy(config: AlgorithmConfig): if __name__ == "__main__": - # Config for the served RLlib Policy/Trainer. + # Config for the served RLlib Policy/Algorithm. config = DQNConfig().environment("ALE/MsPacman-v5").framework(args.framework) # Train the Algorithm for some time, then save it and get the checkpoint path. diff --git a/rllib/examples/self_play_league_based_with_open_spiel.py b/rllib/examples/self_play_league_based_with_open_spiel.py index e0d80551f5a38..a78024a19bdd4 100644 --- a/rllib/examples/self_play_league_based_with_open_spiel.py +++ b/rllib/examples/self_play_league_based_with_open_spiel.py @@ -69,7 +69,7 @@ def get_cli_args(): type=str, default=None, help="Full path to a checkpoint file for restoring a previously saved " - "Trainer state.", + "Algorithm state.", ) parser.add_argument( "--env", @@ -357,7 +357,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ), ).fit() - # Restore trained trainer (set to non-explore behavior) and play against + # Restore trained Algorithm (set to non-explore behavior) and play against # human on command line. if args.num_episodes_human_play > 0: num_episodes = 0 diff --git a/rllib/examples/self_play_with_open_spiel.py b/rllib/examples/self_play_with_open_spiel.py index f611cac7d1556..b0823699b83db 100644 --- a/rllib/examples/self_play_with_open_spiel.py +++ b/rllib/examples/self_play_with_open_spiel.py @@ -301,7 +301,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): ), ).fit() - # Restore trained trainer (set to non-explore behavior) and play against + # Restore trained Algorithm (set to non-explore behavior) and play against # human on command line. if args.num_episodes_human_play > 0: num_episodes = 0 diff --git a/rllib/examples/serving/unity3d_server.py b/rllib/examples/serving/unity3d_server.py index b04a2397f5241..b66291be11b51 100755 --- a/rllib/examples/serving/unity3d_server.py +++ b/rllib/examples/serving/unity3d_server.py @@ -156,7 +156,7 @@ def _input(ioctx): config.rl_module(_enable_rl_module_api=False) config._enable_learner_api = False - # Create the Trainer used for Policy serving. + # Create the Algorithm used for Policy serving. algo = config.build() # Attempt to restore from checkpoint if possible. @@ -169,7 +169,7 @@ def _input(ioctx): # Serving and training loop. count = 0 while True: - # Calls to train() will block on the configured `input` in the Trainer + # Calls to train() will block on the configured `input` in the Algorithm # config above (PolicyServerInput). 
print(algo.train()) if count % args.checkpoint_freq == 0: diff --git a/rllib/examples/sumo_env_local.py b/rllib/examples/sumo_env_local.py index 47f888e631fd0..f9435dfb9d6eb 100644 --- a/rllib/examples/sumo_env_local.py +++ b/rllib/examples/sumo_env_local.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -""" Example Trainer for RLLIB + SUMO Utlis +""" Example Algorithm for RLLIB + SUMO Utlis Author: Lara CODECA lara.codeca@gmail.com @@ -50,7 +50,7 @@ type=str, default=None, help="Full path to a checkpoint file for restoring a previously saved " - "Trainer state.", + "Algorithm state.", ) parser.add_argument("--num-workers", type=int, default=0) parser.add_argument( diff --git a/rllib/examples/unity3d_env_local.py b/rllib/examples/unity3d_env_local.py index 93569a19e4c50..b83e0cba8abd8 100644 --- a/rllib/examples/unity3d_env_local.py +++ b/rllib/examples/unity3d_env_local.py @@ -1,5 +1,5 @@ """ -Example of running an RLlib Trainer against a locally running Unity3D editor +Example of running an RLlib Algorithm against a locally running Unity3D editor instance (available as Unity3DEnv inside RLlib). For a distributed cloud setup example with Unity, see `examples/serving/unity3d_[server|client].py` @@ -64,7 +64,7 @@ type=str, default=None, help="Full path to a checkpoint file for restoring a previously saved " - "Trainer state.", + "Algorithm state.", ) parser.add_argument("--num-workers", type=int, default=0) parser.add_argument( diff --git a/rllib/execution/train_ops.py b/rllib/execution/train_ops.py index 5f60c04c07a32..94190147bf668 100644 --- a/rllib/execution/train_ops.py +++ b/rllib/execution/train_ops.py @@ -172,7 +172,7 @@ def multi_gpu_train_one_step(algorithm, train_batch) -> Dict: load_timer.push_units_processed(train_batch.count) learn_timer.push_units_processed(train_batch.count) - # TODO: Move this into Trainer's `training_iteration` method for + # TODO: Move this into Algorithm's `training_step` method for # better transparency. algorithm._counters[NUM_ENV_STEPS_TRAINED] += train_batch.count algorithm._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps() diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 83c942d1a2779..9d4c9aca522b7 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -436,7 +436,7 @@ def get_model_v2( action_space: Action space of the target gym env. num_outputs: The size of the output vector of the model. model_config: The "model" sub-config dict - within the Trainer's config dict. + within the Algorithm's config dict. framework: One of "tf2", "tf", "torch", or "jax". name: Name (scope) for the model. model_interface: Interface required for the model @@ -944,7 +944,7 @@ def _validate_config( Args: config: The "model" sub-config dict - within the Trainer's config dict. + within the Algorithm's config dict. action_space: The action space of the model, whose config are validated. framework: One of "jax", "tf2", "tf", or "torch". diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index daa2f2cfb8418..b63f0d9a8d97e 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -284,10 +284,10 @@ def import_from_h5(self, h5_file: str) -> None: Example: >>> from ray.rllib.algorithms.ppo import PPO - >>> trainer = PPO(...) # doctest: +SKIP - >>> trainer.import_policy_model_from_h5("/tmp/weights.h5") # doctest: +SKIP + >>> algo = PPO(...) 
# doctest: +SKIP + >>> algo.import_policy_model_from_h5("/tmp/weights.h5") # doctest: +SKIP >>> for _ in range(10): # doctest: +SKIP - >>> trainer.train() # doctest: +SKIP + >>> algo.train() # doctest: +SKIP """ raise NotImplementedError diff --git a/rllib/models/tf/attention_net.py b/rllib/models/tf/attention_net.py index b15dc2d14c61d..ebdfded9cb1b6 100644 --- a/rllib/models/tf/attention_net.py +++ b/rllib/models/tf/attention_net.py @@ -171,7 +171,7 @@ class GTrXLNet(RecurrentNetwork): Can be used as a drop-in replacement for LSTMs in PPO and IMPALA. For an example script, see: `ray/rllib/examples/attention_net.py`. - To use this network as a replacement for an RNN, configure your Trainer + To use this network as a replacement for an RNN, configure your Algorithm as follows: Examples: diff --git a/rllib/models/torch/attention_net.py b/rllib/models/torch/attention_net.py index 454c0a555c977..f72bec839e36c 100644 --- a/rllib/models/torch/attention_net.py +++ b/rllib/models/torch/attention_net.py @@ -41,7 +41,7 @@ class GTrXLNet(RecurrentNetwork, nn.Module): Can be used as a drop-in replacement for LSTMs in PPO and IMPALA. For an example script, see: `ray/rllib/examples/attention_net.py`. - To use this network as a replacement for an RNN, configure your Trainer + To use this network as a replacement for an RNN, configure your Algorithm as follows: Examples: diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index a519020f6343d..cd56b12e40edd 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -209,7 +209,7 @@ def __init__( observation_space: Observation space of the policy. action_space: Action space of the policy. config: A complete Algorithm/Policy config dict. For the default - config keys and values, see rllib/trainer/trainer.py. + config keys and values, see rllib/algorithm/algorithm.py. """ self.observation_space: gym.Space = observation_space self.action_space: gym.Space = action_space @@ -313,7 +313,7 @@ def from_checkpoint( for pid, policy_state in policy_states.items(): # Get spec and config, merge config with serialized_policy_spec = worker_state["policy_specs"][pid] - policy_config = Algorithm.merge_trainer_configs( + policy_config = Algorithm.merge_algorithm_configs( worker_state["policy_config"], serialized_policy_spec["config"] ) serialized_policy_spec.update({"config": policy_config}) @@ -1269,8 +1269,8 @@ def _get_num_gpus_for_policy(self) -> int: # If in local debugging mode, and _fake_gpus is not on. num_gpus = 0 elif worker_idx == 0: - # if we are in the new rl trainer world num_gpus is deprecated. - # so use num_gpus_per_worker for policy sampling + # If we are on the new RLModule/Learner stack, `num_gpus` is deprecated. + # so use `num_gpus_per_worker` for policy sampling # we need this .get() syntax here to ensure backwards compatibility. if self.config.get("_enable_learner_api", False): num_gpus = self.config["num_gpus_per_worker"] @@ -1297,7 +1297,7 @@ def _get_num_gpus_for_policy(self) -> int: def _create_exploration(self) -> Exploration: """Creates the Policy's Exploration object. - This method only exists b/c some Trainers do not use TfPolicy nor + This method only exists b/c some Algorithms do not use TfPolicy nor TorchPolicy, but inherit directly from Policy. Others inherit from TfPolicy w/o using DynamicTFPolicy. TODO(sven): unify these cases. 
diff --git a/rllib/policy/tests/test_policy_checkpoint_restore.py b/rllib/policy/tests/test_policy_checkpoint_restore.py index 9f77cc5100baf..7fc72dc59e15d 100644 --- a/rllib/policy/tests/test_policy_checkpoint_restore.py +++ b/rllib/policy/tests/test_policy_checkpoint_restore.py @@ -73,15 +73,15 @@ def test_add_policy_connector_enabled(self): self.assertIsNotNone(policy) - # Add this policy to a trainer. - trainer = APPOConfig().framework(framework="torch").build("CartPole-v0") + # Add this policy to an Algorithm. + algo = APPOConfig().framework(framework="torch").build("CartPole-v0") # Add the entire policy. - self.assertIsNotNone(trainer.add_policy("test_policy", policy=policy)) + self.assertIsNotNone(algo.add_policy("test_policy", policy=policy)) # Add the same policy, but using individual parameter API. self.assertIsNotNone( - trainer.add_policy( + algo.add_policy( "test_policy_2", policy_cls=type(policy), observation_space=policy.observation_space, diff --git a/rllib/tests/test_dependency_tf.py b/rllib/tests/test_dependency_tf.py index f161e32f34785..dda0ae282722c 100644 --- a/rllib/tests/test_dependency_tf.py +++ b/rllib/tests/test_dependency_tf.py @@ -7,7 +7,7 @@ # Do not import tf for testing purposes. os.environ["RLLIB_TEST_NO_TF_IMPORT"] = "1" - # Test registering (includes importing) all Trainers. + # Test registering (includes importing) all Algorithms. from ray.rllib import _register_all # This should surface any dependency on tf, e.g. inside function diff --git a/rllib/tests/test_gpus.py b/rllib/tests/test_gpus.py index 3d01901a5db63..e5179a901ea60 100644 --- a/rllib/tests/test_gpus.py +++ b/rllib/tests/test_gpus.py @@ -46,12 +46,12 @@ def test_gpus_in_non_local_mode(self): ("tf", "torch") if num_gpus > 1 else ("tf2", "tf", "torch") ) for _ in framework_iterator(config, frameworks=frameworks): - # Expect that trainer creation causes a num_gpu error. + # Expect that Algorithm creation causes a num_gpu error. if ( actual_gpus < num_gpus + 2 * num_gpus_per_worker and not fake_gpus ): - # "Direct" RLlib (create Trainer on the driver). + # "Direct" RLlib (create Algorithm on the driver). # Cannot run through ray.tune.Tuner().fit() as it would # simply wait infinitely for the resources to # become available. diff --git a/rllib/utils/debug/memory.py b/rllib/utils/debug/memory.py index 6b1a9b8929e1c..d9d99a279ce90 100644 --- a/rllib/utils/debug/memory.py +++ b/rllib/utils/debug/memory.py @@ -15,9 +15,9 @@ def check_memory_leaks( repeats: Optional[int] = None, max_num_trials: int = 3, ) -> DefaultDict[str, List[Suspect]]: - """Diagnoses the given trainer for possible memory leaks. + """Diagnoses the given Algorithm for possible memory leaks. - Isolates single components inside the trainer's local worker, e.g. the env, + Isolates single components inside the Algorithm's local worker, e.g. the env, policy, etc.. and calls some of their methods repeatedly, while checking the memory footprints and keeping track of which lines in the code add un-GC'd items to memory. @@ -46,7 +46,7 @@ def check_memory_leaks( # Test a single sub-env (first in the VectorEnv)? if "env" in to_check: assert local_worker.async_env is not None, ( - "ERROR: Cannot test 'env' since given trainer does not have one " + "ERROR: Cannot test 'env' since given Algorithm does not have one " "in its local worker. Try setting `create_env_on_driver=True`." 
) diff --git a/rllib/utils/tests/run_memory_leak_tests.py b/rllib/utils/tests/run_memory_leak_tests.py index bcba5d4fada2a..2a25bd1591eb4 100644 --- a/rllib/utils/tests/run_memory_leak_tests.py +++ b/rllib/utils/tests/run_memory_leak_tests.py @@ -122,7 +122,7 @@ print("== Test config ==") print(yaml.dump(experiment)) - # Construct the trainer instance based on the given config. + # Construct the Algorithm instance based on the given config. leaking = True try: ray.init(num_cpus=5, local_mode=args.local_mode) diff --git a/rllib/utils/typing.py b/rllib/utils/typing.py index 79128ad3d34d7..b61b5298bb411 100644 --- a/rllib/utils/typing.py +++ b/rllib/utils/typing.py @@ -50,11 +50,11 @@ # Note: Policy config dicts are usually the same as AlgorithmConfigDict, but # parts of it may sometimes be altered in e.g. a multi-agent setup, # where we have >1 Policies in the same Algorithm. -AlgorithmConfigDict = TrainerConfigDict = dict +AlgorithmConfigDict = dict # An algorithm config dict that only has overrides. It needs to be combined with # the default algorithm config to be used. -PartialAlgorithmConfigDict = PartialTrainerConfigDict = dict +PartialAlgorithmConfigDict = dict # Represents the model config sub-dict of the algo config that is passed to # the model catalog.
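To make the user-facing effect of the Trainer -> Algorithm rename concrete, here is a minimal sketch (not part of the patch) of the workflow the updated examples in this diff follow; CartPole-v1, the single rollout worker, and the two training iterations are placeholder choices:

from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.ppo import PPOConfig

# Build the Algorithm (formerly Trainer) directly from its config object.
config = PPOConfig().environment("CartPole-v1").rollouts(num_rollout_workers=1)
algo = config.build()

# `algo.train()` replaces the old `trainer.train()` call sites touched above.
for _ in range(2):
    results = algo.train()

# Save a checkpoint, then restore a fresh Algorithm for inference, mirroring
# the policy_inference_after_training examples updated in this diff.
checkpoint_dir = algo.save()
restored = Algorithm.from_checkpoint(checkpoint_dir)
policy = restored.get_policy()
algo.stop()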