o
    ci+                     @   s0  d Z ddlZddlmZ ddlZddlZddlZddlmZ ddl	m
Z
 ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 e" \Z1Z2Z3e# \Z4Z5dZ6dZ7e8 Z9e9j:dg dddd e9j:dddd e9j:d e;d!d"d# e9j:d$e;d%d&d# e9j:d'e<d(d)d# G d*d+ d+Z=	dOd,d-Z>d.d/ Z?d0d1 Z@d2d3 ZAeAeZBeAeZCG d4d5 d5e=eZDG d6d7 d7eZEeFd8krejGd9d: e9H ZIeJd;eIjKdkrene e jLd<d<d=MeKeIjKjNd>dd?jOd@d;idAjPdedBejQejReIjKdCfdedBejQejReIjKdCfdDdEdF dGjSe;ejTUdHdIdJZVe
eIjWe'eIjXe% dKe& eIjYiZZej[eEeV\ ej]eZdLdMdNZ^e^_ Z`eIjare+e`eIjY dS dS dS )Pah  An example of customizing PPO to leverage a centralized critic.

Here the model and policy are hard-coded to implement a centralized critic
for TwoStepGame, but you can adapt this for your own use cases.

Compared to simply running `rllib/examples/two_step_game.py --run=PPO`,
this centralized critic version reaches vf_explained_variance=1.0 more stably
since it takes into account the opponent actions as well as the policy's.
Note that this is also using two independent policies instead of weight-sharing
with one.

See also: centralized_critic_2.py for a simpler approach that instead
modifies the environment.
    N)Discrete)tune)TRAINING_ITERATION)PPO	PPOConfig)PPOTF1PolicyPPOTF2Policy)PPOTorchPolicy)compute_advantagesPostprocessing)TwoStepGame)CentralizedCriticModelTorchCentralizedCriticModel)ModelCatalog)SampleBatch)override)try_import_tftry_import_torch)ENV_RUNNER_RESULTSEPISODE_RETURN_MEANNUM_ENV_STEPS_SAMPLED_LIFETIME)convert_to_numpy)check_learning_achieved)explained_variancemake_tf_callable)convert_to_torch_tensoropponent_obsopponent_actionz--framework)tftf2torchr    zThe DL framework specifier.)choicesdefaulthelpz	--as-test
store_truezuWhether this script should be run as a test: --stop-reward must be achieved within --stop-timesteps AND --stop-iters.)actionr#   z--stop-itersd   zNumber of iterations to train.)typer"   r#   z--stop-timestepsi zNumber of timesteps to train.z--stop-rewardg(\@z!Reward at which we stop training.c                   @   s   e Zd ZdZdd ZdS )CentralizedValueMixinzAAdd method to evaluate the central value function from the model.c                 C   s6   | j d dkrt|  | jj| _d S | jj| _d S )N	frameworkr    )configr   get_sessionmodelcentral_value_functioncompute_central_vf)self r0   Y/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/examples/centralized_critic.py__init__Z   s
   

zCentralizedValueMixin.__init__N)__name__
__module____qualname____doc__r2   r0   r0   r0   r1   r(   W   s    r(   c           
      C   sp  | j d dk}|rt| ds|so|  ro|d usJ t| \\}}}|tj |t< |tj |t	< t
jdkr[| t|tj | jt|t | jt|t	 | j   |tj< n6t| |tj |t |t	 |tj< n"t|tj |t< t|tj |t	< tj|tj tjd|tj< |tj d }|rd}n|tj d }t||| j d | j d | j d	 d
}	|	S )Nr)   r    r.   )dtypeg        gammalambdause_gae)r;   )r*   hasattrloss_initializedlistvaluesr   CUR_OBSOPPONENT_OBSACTIONSOPPONENT_ACTIONargsr)   r.   r   devicecpudetachnumpyVF_PREDSr   np
zeros_likeREWARDSfloat32TERMINATEDSr
   )
policysample_batchother_agent_batchesepisodepytorch_opponent_batch	completedlast_rtrain_batchr0   r0   r1   !centralized_critic_postprocessinge   s`   

	
	rY   c                    s8   |j } fdd|_ |   _|||}||_ |S )Nc                      s    j tj t t S N)r,   r-   r   r@   rA   rC   r0   rO   rX   r0   r1   <lambda>   s
    z*loss_with_central_critic.<locals>.<lambda>)value_function_central_value_outloss)rO   base_policyr,   
dist_classrX   vf_savedr_   r0   r[   r1   loss_with_central_critic   s   
rc   c                 C   s   dt |tj | jiS )Nvf_explained_var)r   r   VALUE_TARGETSr^   r[   r0   r0   r1   central_vf_stats   s   rf   c                    s   G  fdddt  }|S )Nc                       s^   e Zd ZfddZe fddZe	dddZedef fd	d
Z  Z	S )z'get_ccppo_policy.<locals>.CCPPOTFPolicyc                    s     | ||| t |  d S rZ   )r2   r(   r/   observation_spaceaction_spacer*   baser0   r1   r2         z0get_ccppo_policy.<locals>.CCPPOTFPolicy.__init__c                       t | t |||S rZ   rc   superr/   r,   ra   rX   	__class__r0   r1   r_      s   z,get_ccppo_policy.<locals>.CCPPOTFPolicy.lossNc                 S      t | |||S rZ   rY   r/   rP   rQ   rR   r0   r0   r1   postprocess_trajectory      z>get_ccppo_policy.<locals>.CCPPOTFPolicy.postprocess_trajectoryrX   c                    s    t  |}|t| | |S rZ   )ro   stats_fnupdaterf   )r/   rX   statsrq   r0   r1   rx      s   z0get_ccppo_policy.<locals>.CCPPOTFPolicy.stats_fnNN)
r3   r4   r5   r2   r   r_   rv   r   rx   __classcell__r0   rj   rq   r1   CCPPOTFPolicy   s    	r}   )r(   )rk   r}   r0   rj   r1   get_ccppo_policy   s   r~   c                       s@   e Zd Zdd Zee fddZee	dddZ  ZS )	CCPPOTorchPolicyc                 C   s   t | ||| t|  d S rZ   )r	   r2   r(   rg   r0   r0   r1   r2      rl   zCCPPOTorchPolicy.__init__c                    rm   rZ   rn   rp   rq   r0   r1   r_      s   zCCPPOTorchPolicy.lossNc                 C   rs   rZ   rt   ru   r0   r0   r1   rv      rw   z'CCPPOTorchPolicy.postprocess_trajectoryr{   )	r3   r4   r5   r2   r   r	   r_   rv   r|   r0   r0   rq   r1   r      s    r   c                   @   s    e Zd Zeeedd ZdS )CentralizedCriticc                 C   s$   |d dkrt S |d dkrtS tS )Nr)   r    r   )r   CCPPOStaticGraphTFPolicyCCPPOEagerTFPolicy)clsr*   r0   r0   r1   get_default_policy_class   s
   z*CentralizedCritic.get_default_policy_classN)r3   r4   r5   classmethodr   r   r   r0   r0   r0   r1   r      s    r   __main__T)
local_modecc_modelF)"enable_env_runner_and_connector_v2enable_rl_module_and_learnercomplete_episodes)
batch_modenum_env_runnerscustom_model)r,      )framework_str)pol1pol2c                 K   s   | dkrdS dS )Nr   r   r   r0   )agent_idrR   workerkwargsr0   r0   r1   r\   )  s   r\   )policiespolicy_mapping_fnRLLIB_NUM_GPUS0)num_gpus/   )stopverbose)param_space
run_configr{   )br6   argparsegymnasium.spacesr   rH   rJ   osrayr   ray.tune.resultr   ray.rllib.algorithms.ppo.ppor   r   &ray.rllib.algorithms.ppo.ppo_tf_policyr   r   )ray.rllib.algorithms.ppo.ppo_torch_policyr	   #ray.rllib.evaluation.postprocessingr
   r   9ray.rllib.examples.envs.classes.multi_agent.two_step_gamer   Bray.rllib.examples._old_api_stack.models.centralized_critic_modelsr   r   ray.rllib.modelsr   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   r   ray.rllib.utils.metricsr   r   r   ray.rllib.utils.numpyr   ray.rllib.utils.test_utilsr   ray.rllib.utils.tf_utilsr   r   ray.rllib.utils.torch_utilsr   tf1r   tfvr    nnrA   rC   ArgumentParserparseradd_argumentintfloatr(   rY   rc   rf   r~   r   r   r   r   r3   init
parse_argsrD   register_custom_modelr)   	api_stackenvironmentenv_runnerstrainingmulti_agentri   	overrides	resourcesenvirongetr*   
stop_itersstop_timestepsstop_rewardr   Tunerto_dict	RunConfigtunerfitresultsas_testr0   r0   r0   r1   <module>   s   

?	!
	$;