o
    $i]                     @   sf  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 erddl5m6Z6 e7e8Z9dZ:dZ;dZ<dZ=dZ>G dd deZ?G dd deZ@dS )a3  
Proximal Policy Optimization (PPO)
==================================

This file defines the distributed Algorithm class for proximal policy
optimization.
See `ppo_[tf|torch]_policy.py` for the definition of the policy loss.

Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#ppo
    N)TYPE_CHECKINGAnyDictListOptionalTypeUnion)Self)DEPRECATED_VALUE)	Algorithm)AlgorithmConfigNotProvided)RLModuleSpec)standardize_fieldssynchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)OldAPIStackoverride)ALL_MODULESENV_RUNNER_RESULTSENV_RUNNER_SAMPLING_TIMERLEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIME!NUM_MODULE_STEPS_TRAINED_LIFETIMESAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTIMERS)LEARNER_STATS_KEY)	Scheduler)
ResultDict)log_once)Learnervf_loss_unclippedvf_explained_varmean_kl_losscurr_kl_coeffcurr_entropy_coeffc                !       sX  e Zd ZdZd fdd	ZeedefddZeede	e
d ef fd	d
Zeeeeeeeeeeeeeeeeddee dee dee dee dee dee dee dee deeee	eef    dee dee dee deeee	eef    def fddZeed  fddZeeedeeef f fddZ  ZS )!	PPOConfigaT  Defines a configuration class from which a PPO Algorithm can be built.

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig

        config = PPOConfig()
        config.environment("CartPole-v1")
        config.env_runners(num_env_runners=1)
        config.training(
            gamma=0.9, lr=0.01, kl_coeff=0.3, train_batch_size_per_learner=256
        )

        # Build a Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()

    .. testcode::

        from ray.rllib.algorithms.ppo import PPOConfig
        from ray import tune

        config = (
            PPOConfig()
            # Set the config object's env.
            .environment(env="CartPole-v1")
            # Update the config object's training parameters.
            .training(
                lr=0.001, clip_param=0.2
            )
        )

        tune.Tuner(
            "PPO",
            run_config=tune.RunConfig(stop={"training_iteration": 1}),
            param_space=config,
        ).fit()

    .. testoutput::
        :hide:

        ...
    Nc                    s   ddi| _ t j|ptd d| _d| _d| _d| _d| _d| _	d	| _
d| _d
| _d| _d| _d| _d
| _d| _d| _d| _d| _d| _d| jd< d| _d| _t| _t| _dS )z!Initializes a PPOConfig instance.typeStochasticSampling)
algo_classg-C6
?autoi  T      g      ?g?g{Gz?        g333333?g      $@N   Fvf_share_layers)exploration_configsuper__init__PPOlrrollout_fragment_lengthtrain_batch_size
use_criticuse_gae
num_epochsminibatch_sizeshuffle_batch_per_epochlambda_use_kl_losskl_coeff	kl_targetvf_loss_coeffentropy_coeff
clip_paramvf_clip_param	grad_clipnum_env_runnersmodelentropy_coeff_schedulelr_scheduler
   sgd_minibatch_sizer5   )selfr/   	__class__ Y/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/rllib/algorithms/ppo/ppo.pyr8   l   s4   


zPPOConfig.__init__returnc                 C   s2   | j dkrddlm} t|dS td| j  d)Ntorchr   )DefaultPPOTorchRLModule)module_classThe framework z/ is not supported. Use either 'torch' or 'tf2'.)framework_str:ray.rllib.algorithms.ppo.torch.default_ppo_torch_rl_modulerW   r   
ValueError)rP   rW   rS   rS   rT   get_default_rl_module_spec   s   

z$PPOConfig.get_default_rl_module_specr&   c                 C   s>   | j dkrddlm} |S | j dv rtdtd| j  d)NrV   r   )PPOTorchLearner)tf2tfzPTensorFlow is no longer supported on the new API stack! Use `framework='torch'`.rY   z+ is not supported. Use `framework='torch'`.)rZ   0ray.rllib.algorithms.ppo.torch.ppo_torch_learnerr^   r\   )rP   r^   rS   rS   rT   get_default_learner_class   s   

z#PPOConfig.get_default_learner_class)r=   r>   rB   rC   rD   rE   rF   rG   rM   rH   rI   rJ   rN   r5   r=   r>   rB   rC   rD   rE   rF   rG   rM   rH   rI   rJ   rN   c                   s   t  jdi | |tur|| _|tur|| _|tur|| _|tur%|| _|tur,|| _|tur3|| _|tur:|| _	|turA|| _
|
turH|
| _|turO|| _|turV|| _|tur]|| _|	turd|	| _| S )a	  Sets the training related configuration.

        Args:
            use_critic: Should use a critic as a baseline (otherwise don't use value
                baseline; required for using GAE).
            use_gae: If true, use the Generalized Advantage Estimator (GAE)
                with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
            lambda_: The lambda parameter for General Advantage Estimation (GAE).
                Defines the exponential weight used between actually measured rewards
                vs value function estimates over multiple time steps. Specifically,
                `lambda_` balances short-term, low-variance estimates against long-term,
                high-variance returns. A `lambda_` of 0.0 makes the GAE rely only on
                immediate rewards (and vf predictions from there on, reducing variance,
                but increasing bias), while a `lambda_` of 1.0 only incorporates vf
                predictions at the truncation points of the given episodes or episode
                chunks (reducing bias but increasing variance).
            use_kl_loss: Whether to use the KL-term in the loss function.
            kl_coeff: Initial coefficient for KL divergence.
            kl_target: Target value for KL divergence.
            vf_loss_coeff: Coefficient of the value function loss. IMPORTANT: you must
                tune this if you set vf_share_layers=True inside your model's config.
            entropy_coeff: The entropy coefficient (float) or entropy coefficient
                schedule in the format of
                [[timestep, coeff-value], [timestep, coeff-value], ...]
                In case of a schedule, intermediary timesteps will be assigned to
                linearly interpolated coefficient values. A schedule config's first
                entry must start with timestep 0, i.e.: [[0, initial_value], [...]].
            clip_param: The PPO clip parameter.
            vf_clip_param: Clip param for the value function. Note that this is
                sensitive to the scale of the rewards. If your expected V is large,
                increase this.
            grad_clip: If specified, clip the global norm of gradients by this amount.

        Returns:
            This updated AlgorithmConfig object.
        NrS   )r7   trainingr   r=   r>   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rN   rM   )rP   r=   r>   rB   rC   rD   rE   rF   rG   rM   rH   rI   rJ   rN   r5   kwargsrQ   rS   rT   rc      s8   ;zPPOConfig.trainingc                    s8  t    |   | js)| j| jkr)| d| j d| j d| j d| j d	 n-| jrV| j}| jp4| j}t	|t
rVt	|t
rV||krV| d| d| d| d| j d	 | jsf| jdkrf| jsf| d | jr| jd urs| d	 | jd ur}| d
 tj| jddd t	| jtr| jdk r| d d S d S d S )Nz`minibatch_size` (z!) must be <= `train_batch_size` (z.). In PPO, the train batch will be split into zG chunks, each of which is iterated over (used for updating the policy) z times.z-) must be <= `train_batch_size_per_learner` (truncate_episodeszEpisode truncation is not supported without a value function (to estimate the return at the end of the truncated trajectory). Consider setting batch_mode=complete_episodes.zW`lr_schedule` is deprecated and must be None! Use the `lr` setting to setup a schedule.zm`entropy_coeff_schedule` is deprecated and must be None! Use the `entropy_coeff` setting to setup a schedule.rG   zentropy coefficient)fixed_value_or_schedulesetting_namedescriptionr3   z`entropy_coeff` must be >= 0.0)r7   validate4validate_train_batch_size_vs_rollout_fragment_lengthenable_rl_module_and_learnerr@   r<   _value_errorr?   train_batch_size_per_learner
isinstanceintin_evaluation
batch_moder>   rN   rM   r#   rG   float)rP   mbstbsrQ   rS   rT   ri     sh   




zPPOConfig.validatec                    s   t  jddiB S )Nr5   F)r7   _model_config_auto_includesrP   rQ   rS   rT   ru   g  s   z%PPOConfig._model_config_auto_includesNrU   N)__name__
__module____qualname____doc__r8   r   r   r   r]   r   r   strrb   r   r
   r   boolrr   r   ro   r	   rc   ri   propertyr   r   ru   __classcell__rS   rS   rQ   rT   r,   ?   sp    ,1	
]J&r,   c                   @   sp   e Zd ZeeedefddZeeedede	e
e  fddZeeddd	Zedefd
dZdS )r9   rU   c                 C   s   t  S rw   )r,   )clsrS   rS   rT   get_default_confign  s   zPPO.get_default_configconfigc                 C   sH   |d dkrddl m} |S |d dkrddlm} |S ddlm} |S )N	frameworkrV   r   )PPOTorchPolicyr`   )PPOTF1Policy)PPOTF2Policy))ray.rllib.algorithms.ppo.ppo_torch_policyr   &ray.rllib.algorithms.ppo.ppo_tf_policyr   r   )r   r   r   r   r   rS   rS   rT   get_default_policy_classs  s   zPPO.get_default_policy_classNc              
   C   s  | j js|  S | jttfF | j jdkr*t| j	| j j
| j j| j jdd\}}nt| j	| j j
| j j| j jdd\}}|sG	 W d    d S | jj|td W d    n1 sYw   Y  | jttf4 | jj|t| jttft| jjtttfddi| j j| j j| j jd}| jj|td W d    n1 sw   Y  | jttf t|d  th }| j	j| j|dd	 W d    d S 1 sw   Y  d S )
Nagent_stepsT)
worker_setmax_agent_stepssample_timeout_s_uses_new_env_runners_return_metrics)r   max_env_stepsr   r   r   )keyr   )default)episodes	timestepsr?   r@   rA   )from_worker_or_learner_grouppoliciesinference_only)r   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer!   r   count_steps_byr   env_runner_grouptotal_train_batch_sizer   	aggregater   r   learner_groupupdater   peekr   r   r   r?   r@   rA   r    setkeyssync_weights)rP   r   env_runner_resultslearner_resultsmodules_to_updaterS   rS   rT   training_step  sj   


	"zPPO.training_stepc              
      sP   j t R  jjdkrt j jj jjd}nt j jj jjd}|s0i W  d    S | } j	t
  | 7  <  j	t  | 7  < t|dg}W d    n1 sZw   Y   jjrit |}nt |}t| } j	t
  fdd|D d} j t   j dkrd } jj|||d	 W d    n1 sw   Y  | D ]q\}}|t d
} ||  jj|t d  }	|t d }
tdr jdi dr|	dkrtd ||	|
 |j!| "d  |j!| d # }tdr| jj$krd _%td| d jd  d| d q j&'| |S )Nr   )r   r   r   )r   r   r   
advantagesc                    s   i | ]
}| j j| jqS rS   )
env_runner
policy_mapnum_grad_updates).0pidrv   rS   rT   
<dictcomp>  s    z4PPO._training_step_old_api_stack.<locals>.<dictcomp>)timestepnum_grad_updates_per_policyr   )r   r   global_varsklvf_losspolicy_lossppo_warned_lr_ratiorL   r5   d   zThe magnitude of your value function loss for policy: {} is extremely large ({}) compared to the policy loss ({}). This can prevent the policy from learning. Consider scaling down the VF loss by reducing vf_loss_coeff, or disabling vf_share_layers.rewardsppo_warned_vf_clipTz1The mean reward returned from the environment is z! but the vf_clip_param is set to rI   z%. Consider increasing it for policy: z' to improve value function convergence.)(_timersr   r   r   r   r   r   r   as_multi_agent	_countersr   r   r   	env_stepsr   simple_optimizerr   r   listr   r    num_remote_workersr   itemsr"   get
get_policy	update_klrF   r%   loggerwarningformatpolicy_batchesset_get_interceptormeanrI   warned_vf_clipr   set_global_vars)rP   train_batchtrain_resultspolicies_to_updater   r   	policy_idpolicy_infokl_divergencescaled_vf_lossr   mean_rewardrS   rv   rT   r     s   



z PPO._training_step_old_api_stackrx   )ry   rz   r{   classmethodr   r   r,   r   r   r   r   r   r   r   r   r$   r   rS   rS   rS   rT   r9   m  s    
Qr9   )Ar|   loggingtypingr   r   r   r   r   r   r   typing_extensionsr	   ray._common.deprecationr
   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   r   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   r   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.metricsr   r   r   r   r   r   r   r   r   r   r    r!   $ray.rllib.utils.metrics.learner_infor"   #ray.rllib.utils.schedules.schedulerr#   ray.rllib.utils.typingr$   ray.util.debugr%   ray.rllib.core.learner.learnerr&   	getLoggerry   r   %LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY$LEARNER_RESULTS_VF_EXPLAINED_VAR_KEYLEARNER_RESULTS_KL_KEY!LEARNER_RESULTS_CURR_KL_COEFF_KEY&LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEYr,   r9   rS   rS   rS   rT   <module>   s:    $8
  0