o
    ci\                     @   sV  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
mZ ddlmZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 e	rddl2m3Z3 e4e5Z6dZ7dZ8dZ9dZ:dZ;G dd deZ<G dd deZ=dS )a3  
Proximal Policy Optimization (PPO)
==================================

This file defines the distributed Algorithm class for proximal policy
optimization.
See `ppo_[tf|torch]_policy.py` for the definition of the policy loss.

Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#ppo
    N)AnyDictListOptionalTypeUnionTYPE_CHECKING)	Algorithm)AlgorithmConfigNotProvided)RLModuleSpec)standardize_fieldssynchronous_parallel_sample)train_one_stepmulti_gpu_train_one_step)Policy)OldAPIStackoverride)DEPRECATED_VALUE)ENV_RUNNER_RESULTSENV_RUNNER_SAMPLING_TIMERLEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIMESYNCH_WORKER_WEIGHTS_TIMERSAMPLE_TIMERTIMERSALL_MODULES)LEARNER_STATS_KEY)	Scheduler)
ResultDict)log_once)Learnervf_loss_unclippedvf_explained_varmean_kl_losscurr_kl_coeffcurr_entropy_coeffc                !       sX  e Zd ZdZd fdd	ZeedefddZeede	e
d ef fd	d
Zeeeeeeeeeeeeeeeeddee dee dee dee dee dee dee dee deeee	eef    dee dee dee deeee	eef    dd f fddZeed  fddZeeedeeef f fddZ  ZS )!	PPOConfigaT  Defines a configuration class from which a PPO Algorithm can be built.

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig

        config = PPOConfig()
        config.environment("CartPole-v1")
        config.env_runners(num_env_runners=1)
        config.training(
            gamma=0.9, lr=0.01, kl_coeff=0.3, train_batch_size_per_learner=256
        )

        # Build a Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig
        from ray import tune

        config = (
            PPOConfig()
            # Set the config object's env.
            .environment(env="CartPole-v1")
            # Update the config object's training parameters.
            .training(
                lr=0.001, clip_param=0.2
            )
        )

        tune.Tuner(
            "PPO",
            run_config=tune.RunConfig(stop={"training_iteration": 1}),
            param_space=config,
        ).fit()

    .. testoutput::
        :hide:

        ...
    Nc                    s   ddi| _ t j|ptd d| _d| _d| _d| _d| _d| _	d	| _
d| _d
| _d| _d| _d| _d
| _d| _d| _d| _d| _d| _d| jd< d| _d| _t| _t| _dS )z!Initializes a PPOConfig instance.typeStochasticSampling)
algo_classg-C6
?autoi  T      g      ?g?g{Gz?        g333333?g      $@N   Fvf_share_layers)exploration_configsuper__init__PPOlrrollout_fragment_lengthtrain_batch_size
use_criticuse_gae
num_epochsminibatch_sizeshuffle_batch_per_epochlambda_use_kl_losskl_coeff	kl_targetvf_loss_coeffentropy_coeff
clip_paramvf_clip_param	grad_clipnum_env_runnersmodelentropy_coeff_schedulelr_scheduler   sgd_minibatch_sizer3   )selfr-   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/algorithms/ppo/ppo.pyr6   i   s4   


zPPOConfig.__init__returnc                 C   s2   | j dkrddlm} t|dS td| j  d)Ntorchr   )DefaultPPOTorchRLModule)module_classThe framework z/ is not supported. Use either 'torch' or 'tf2'.)framework_str:ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_modulerU   r   
ValueError)rN   rU   rQ   rQ   rR   get_default_rl_module_spec   s   

z$PPOConfig.get_default_rl_module_specr$   c                 C   s>   | j dkrddlm} |S | j dv rtdtd| j  d)NrT   r   )PPOTorchLearner)tf2tfzPTensorFlow is no longer supported on the new API stack! Use `framework='torch'`.rW   z+ is not supported. Use `framework='torch'`.)rX   0ray.rllib.algorithms.ppo.torch.ppo_torch_learnerr\   rZ   )rN   r\   rQ   rQ   rR   get_default_learner_class   s   

z#PPOConfig.get_default_learner_class)r;   r<   r@   rA   rB   rC   rD   rE   rK   rF   rG   rH   rL   r3   r;   r<   r@   rA   rB   rC   rD   rE   rK   rF   rG   rH   rL   c                   s   t  jdi | |tur|| _|tur|| _|tur|| _|tur%|| _|tur,|| _|tur3|| _|tur:|| _	|turA|| _
|
turH|
| _|turO|| _|turV|| _|tur]|| _|	turd|	| _| S )a	  Sets the training related configuration.

        Args:
            use_critic: Should use a critic as a baseline (otherwise don't use value
                baseline; required for using GAE).
            use_gae: If true, use the Generalized Advantage Estimator (GAE)
                with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
            lambda_: The lambda parameter for General Advantage Estimation (GAE).
                Defines the exponential weight used between actually measured rewards
                vs value function estimates over multiple time steps. Specifically,
                `lambda_` balances short-term, low-variance estimates against long-term,
                high-variance returns. A `lambda_` of 0.0 makes the GAE rely only on
                immediate rewards (and vf predictions from there on, reducing variance,
                but increasing bias), while a `lambda_` of 1.0 only incorporates vf
                predictions at the truncation points of the given episodes or episode
                chunks (reducing bias but increasing variance).
            use_kl_loss: Whether to use the KL-term in the loss function.
            kl_coeff: Initial coefficient for KL divergence.
            kl_target: Target value for KL divergence.
            vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must
                tune this if you set vf_share_layers=True inside your model's config.
            entropy_coeff: The entropy coefficient (float) or entropy coefficient
                schedule in the format of
                [[timestep, coeff-value], [timestep, coeff-value], ...]
                In case of a schedule, intermediary timesteps will be assigned to
                linearly interpolated coefficient values. A schedule config's first
                entry must start with timestep 0, i.e.: [[0, initial_value], [...]].
            clip_param: The PPO clip parameter.
            vf_clip_param: Clip param for the value function. Note that this is
                sensitive to the scale of the rewards. If your expected V is large,
                increase this.
            grad_clip: If specified, clip the global norm of gradients by this amount.

        Returns:
            This updated AlgorithmConfig object.
        NrQ   )r5   trainingr   r;   r<   r@   rA   rB   rC   rD   rE   rF   rG   rH   rL   rK   )rN   r;   r<   r@   rA   rB   rC   rD   rE   rK   rF   rG   rH   rL   r3   kwargsrO   rQ   rR   ra      s8   ;zPPOConfig.trainingc                    s8  t    |   | js)| j| jkr)| d| j d| j d| j d| j d	 n-| jrV| j}| jp4| j}t	|t
rVt	|t
rV||krV| d| d| d| d| j d	 | jsf| jdkrf| jsf| d | jr| jd urs| d	 | jd ur}| d
 tj| jddd t	| jtr| jdk r| d d S d S d S )Nz`minibatch_size` (z!) must be <= `train_batch_size` (z.). In PPO, the train batch will be split into zG chunks, each of which is iterated over (used for updating the policy) z times.z-) must be <= `train_batch_size_per_learner` (truncate_episodeszEpisode truncation is not supported without a value function (to estimate the return at the end of the truncated trajectory). Consider setting batch_mode=complete_episodes.zW`lr_schedule` is deprecated and must be None! Use the `lr` setting to setup a schedule.zm`entropy_coeff_schedule` is deprecated and must be None! Use the `entropy_coeff` setting to setup a schedule.rE   zentropy coefficient)fixed_value_or_schedulesetting_namedescriptionr1   z`entropy_coeff` must be >= 0.0)r5   validate4validate_train_batch_size_vs_rollout_fragment_lengthenable_rl_module_and_learnerr>   r:   _value_errorr=   train_batch_size_per_learner
isinstanceintin_evaluation
batch_moder<   rL   rK   r!   rE   float)rN   mbstbsrO   rQ   rR   rg     sh   




zPPOConfig.validatec                    s   t  jddiB S )Nr3   F)r5   _model_config_auto_includesrN   rO   rQ   rR   rs   d  s   z%PPOConfig._model_config_auto_includesNrS   N)__name__
__module____qualname____doc__r6   r   r
   r   r[   r   r   strr`   r   r   r   boolrp   r   rm   ra   rg   propertyr   r   rs   __classcell__rQ   rQ   rO   rR   r*   <   sp    ,1	
]J&r*   c                   @   sp   e Zd ZeeedefddZeeededee	e
  fddZeeddd	Zedefd
dZdS )r7   rS   c                 C   s   t  S ru   )r*   )clsrQ   rQ   rR   get_default_configk  s   zPPO.get_default_configconfigc                 C   sH   |d dkrddl m} |S |d dkrddlm} |S ddlm} |S )N	frameworkrT   r   )PPOTorchPolicyr^   )PPOTF1Policy)PPOTF2Policy))ray.rllib.algorithms.ppo.ppo_torch_policyr   &ray.rllib.algorithms.ppo.ppo_tf_policyr   r   )r   r   r   r   r   rQ   rQ   rR   get_default_policy_classp  s   zPPO.get_default_policy_classNc                 C   s  | j js|  S | jttfF | j jdkr*t| j	| j j
| j j| j jdd\}}nt| j	| j j
| j j| j jdd\}}|sG	 W d    d S | jj|td W d    n1 sYw   Y  | jttf) | jj|t| jttfi| j j| j j| j jd}| jj|td W d    n1 sw   Y  | jttf t|d  th }| j	j| j|dd W d    d S 1 sw   Y  d S )	Nagent_stepsT)
worker_setmax_agent_stepssample_timeout_s_uses_new_env_runners_return_metrics)r   max_env_stepsr   r   r   )key)episodes	timestepsr=   r>   r?   r   )from_worker_or_learner_grouppoliciesinference_only)r   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer   r   count_steps_byr   env_runner_grouptotal_train_batch_sizer   	aggregater   r   learner_groupupdater   peekr=   r>   r?   r   r   setkeysr   sync_weights)rN   r   env_runner_resultslearner_resultsmodules_to_updaterQ   rQ   rR   training_step  sZ   


	"zPPO.training_stepc              
      sP   j t R  jjdkrt j jj jjd}nt j jj jjd}|s0i W  d    S | } j	t
  | 7  <  j	t  | 7  < t|dg}W d    n1 sZw   Y   jjrit |}nt |}t| } j	t
  fdd|D d} j t   j dkrd } jj|||d	 W d    n1 sw   Y  | D ]q\}}|t d
} ||  jj|t d  }	|t d }
tdr jdi dr|	dkrtd ||	|
 |j!| "d  |j!| d # }tdr| jj$krd _%td| d jd  d| d q j&'| |S )Nr   )r   r   r   )r   r   r   
advantagesc                    s   i | ]
}| j j| jqS rQ   )
env_runner
policy_mapnum_grad_updates).0pidrt   rQ   rR   
<dictcomp>  s    z4PPO._training_step_old_api_stack.<locals>.<dictcomp>)timestepnum_grad_updates_per_policyr   )r   r   global_varsklvf_losspolicy_lossppo_warned_lr_ratiorJ   r3   d   zThe magnitude of your value function loss for policy: {} is extremely large ({}) compared to the policy loss ({}). This can prevent the policy from learning. Consider scaling down the VF loss by reducing vf_loss_coeff, or disabling vf_share_layers.rewardsppo_warned_vf_clipTz1The mean reward returned from the environment is z! but the vf_clip_param is set to rG   z%. Consider increasing it for policy: z' to improve value function convergence.)(_timersr   r   r   r   r   r   r   as_multi_agent	_countersr   r   r   	env_stepsr   simple_optimizerr   r   listr   r   num_remote_workersr   itemsr    get
get_policy	update_klrD   r#   loggerwarningformatpolicy_batchesset_get_interceptormeanrG   warned_vf_clipr   set_global_vars)rN   train_batchtrain_resultspolicies_to_updater   r   	policy_idpolicy_infokl_divergencescaled_vf_lossr   mean_rewardrQ   rt   rR   r     s   



z PPO._training_step_old_api_stackrv   )rw   rx   ry   classmethodr   r	   r
   r   r   r   r   r   r   r   r"   r   rQ   rQ   rQ   rR   r7   j  s    
Gr7   )>rz   loggingtypingr   r   r   r   r   r   r   ray.rllib.algorithms.algorithmr	   %ray.rllib.algorithms.algorithm_configr
   r   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   r   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.deprecationr   ray.rllib.utils.metricsr   r   r   r   r   r   r   r   r   r   r   $ray.rllib.utils.metrics.learner_infor    #ray.rllib.utils.schedules.schedulerr!   ray.rllib.utils.typingr"   ray.util.debugr#   ray.rllib.core.learner.learnerr$   	getLoggerrw   r   %LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY$LEARNER_RESULTS_VF_EXPLAINED_VAR_KEYLEARNER_RESULTS_KL_KEY!LEARNER_RESULTS_CURR_KL_COEFF_KEY&LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEYr*   r7   rQ   rQ   rQ   rR   <module>   s8    $4
  0