o
    `۷i#                     @   sT  d dl Z d dlZd dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZ er:d dlmZ eeZee dg d	Zd d i i i i i d
i f	ej_ededefddZe			
ddddeee  dededef
ddZe		ddddeee  dedee fddZe		
ddee dee dedefddZ dS )     N)TYPE_CHECKINGListOptional)DEFAULT_POLICY_ID)OldAPIStack)LEARNER_STATS_KEY)GradInfoDictLearnerStatsDict
ResultDict)EnvRunnerGroupRolloutMetrics)	episode_lengthepisode_rewardagent_rewardscustom_metrics
perf_stats	hist_datamediaepisode_faultyconnector_metricsF	grad_inforeturnc                 C   sJ   t | v r| t  S i }|  D ]\}}t|tu r"t |v r"|t  ||< q|S )aD  Return optimization stats reported from the policy.

    .. testcode::
        :skipif: True

        grad_info = worker.learn_on_batch(samples)

        # {"td_error": [...], "learner_stats": {"vf_loss": ..., ...}}

        print(get_stats(grad_info))

    .. testoutput::

        {"vf_loss": ..., "policy_loss": ...}
    )r   itemstypedict)r   multiagent_statskv r   R/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/evaluation/metrics.pyget_learner_stats$   s   r       workersr   remote_worker_idstimeout_secondskeep_custom_metricsc                 C   s    t | ||d}t|||d}|S )a  Gathers episode metrics from rollout worker set.

    Args:
        workers: EnvRunnerGroup.
        remote_worker_ids: Optional list of IDs of remote workers to collect
            metrics from.
        timeout_seconds: Timeout in seconds for collecting metrics from remote workers.
        keep_custom_metrics: Whether to keep custom metrics in the result dict as
            they are (True) or to aggregate them (False).

    Returns:
        A result dict of metrics.
    )r$   )r%   )collect_episodessummarize_episodes)r"   r#   r$   r%   episodesmetricsr   r   r   collect_metricsA   s   r*   c                 C   sH   | j dd d||d}t|dkrtd g }|D ]}|| q|S )a`  Gathers new episodes metrics tuples from the given RolloutWorkers.

    Args:
        workers: EnvRunnerGroup.
        remote_worker_ids: Optional list of IDs of remote workers to collect
            metrics from.
        timeout_seconds: Timeout in seconds for collecting metrics from remote workers.

    Returns:
        List of RolloutMetrics.
    c                 S   s   |   S )N)get_metrics)wr   r   r   <lambda>s   s    z"collect_episodes.<locals>.<lambda>T)local_env_runnerr#   r$   r   zWARNING: collected no metrics.)foreach_env_runnerlenloggerwarningextend)r"   r#   r$   metric_listsr(   r)   r   r   r   r&   ^   s   
r&   r(   new_episodesc           "      C   s  |du r| }g }g }t t}t t}t t}t t}t t}	t t}
d}| D ]}|j D ]\}}|| | q3|jrG|d7 }q,||j ||j |j	 D ]\}}|| | qXt
|jdkpot|jv}|r|j D ]\\}}}|| | qw|j D ]\}}||  |7  < q|j D ]\}}|	| | qt|dr|j D ]}| D ]}| D ]\}}|
| | qqqq,|rt|}t|}t|}ntd}td}td}|rt|}ntd}||d< ||d< i }i }i }|  D ]!\}}t|||< t|||< t|||< ||d|< q	|  D ]G\}}d	d
 |D } |rE| ||< q1t| ||d < | rdt| ||d < t| ||d < ntd||d < td||d < ||= q1|  D ]\}}t|||< qt }!|
 D ]\}}t||!|< qtd i d|d|d|d|dt|	dt|d|d|d|dt|dt|dt|d|d|!dt
|d|d|d|dt
|S )!a  Summarizes a set of episode metrics tuples.

    Args:
        episodes: List of most recent n episodes. This may include historical ones
            (not newly collected in this iteration) in order to achieve the size of
            the smoothing window.
        new_episodes: All the episodes that were completed in this iteration.
        keep_custom_metrics: Whether to keep custom metrics in the result dict as
            they are (True) or to aggregate them (False).

    Returns:
        A result dict of metrics.
    Nr      r   nanr   episode_lengthszpolicy_{}_rewardc                 S   s    g | ]}t t |s|qS r   )npanyisnan).0r   r   r   r   
<listcomp>   s     z&summarize_episodes.<locals>.<listcomp>_mean_min_maxepisode_reward_maxepisode_reward_minepisode_reward_meanepisode_len_meanepisode_mediaepisodes_timesteps_totalpolicy_reward_minpolicy_reward_maxpolicy_reward_meanr   
hist_statssampler_perfnum_faulty_episodesnum_episodesepisode_return_maxepisode_return_minepisode_return_meanepisodes_this_iterr   )collectionsdefaultdictlistr   r   appendr   r   r   r   r0   r   r   r   r   hasattrr   valuesminmaxr9   meanfloatcopyformatr   sum)"r(   r5   r%   episode_rewardsr8   policy_rewardsr   r   rJ   rE   r   rL   episoder   r   is_multi_agent_	policy_idrewardper_pipeline_metricsper_connector_metricsconnector_metric_nameval
min_reward
max_reward
avg_reward
avg_lengthrG   rI   rH   rewardsv_listfiltmean_connector_metricsr   r   r   r'      s   







	
r'   )Nr!   F)Nr!   )NF)!rR   loggingtypingr   r   r   numpyr9   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   $ray.rllib.utils.metrics.learner_infor   ray.rllib.utils.typingr   r	   r
   ray.rllib.env.env_runner_groupr   	getLogger__name__r1   
namedtupler   __new____defaults__r    intboolr*   r&   r'   r   r   r   r   <module>   sv    


#