o
    ciZ                     @   s   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ e  Z eG dd deZ!dS )    N)DictAnyOptionalList)Dataset)OffPolicyEstimator)compute_q_and_v_values)OfflineEvaluator)FQETorchModel)Policy) convert_ma_batch_to_sample_batch)SampleBatch)DeveloperAPIoverride)SampleBatchType)convert_to_numpyc                       s   e Zd ZdZee		ddedededee	 f fdd	Z
eed
ede	eef fddZeedede	eee f fddZdd Zeedede	eef fddZeedddedede	eef fddZ  ZS )DirectMethoda  The Direct Method estimator.

    Let s_t, a_t, and r_t be the state, action, and reward at timestep t.

    This method trains a Q-model for the evaluation policy \pi_e on behavior
    data generated by \pi_b. Currently, RLlib implements this using
    Fitted-Q Evaluation (FQE). You can also implement your own model
    and pass it in as `q_model_config = {"type": your_model_class, **your_kwargs}`.

    This estimator computes the expected return for \pi_e for an episode as:
    V^{\pi_e}(s_0) = \sum_{a \in A} \pi_e(a | s_0) Q(s_0, a)
    and returns the mean and standard deviation over episodes.

    For more information refer to https://arxiv.org/pdf/1911.06854.pdf        Npolicygammaepsilon_greedyq_model_configc                    sv   t  ||| t|dr|jdddksJ d|pi }|dt}|d
||d|| _t| jds9J dd	S )a  Initializes a Direct Method OPE Estimator.

        Args:
            policy: Policy to evaluate.
            gamma: Discount factor of the environment.
            epsilon_greedy: The probability by which we act acording to a fully random
                policy during deployment. With 1-epsilon_greedy we act according the
                target policy.
            q_model_config: Arguments to specify the Q-model. Must specify
                a `type` key pointing to the Q-model class.
                This Q-model is trained in the train() method and is used
                to compute the state-value estimates for the DirectMethod estimator.
                It must implement `train` and `estimate_v`.
                TODO (Rohan138): Unify this with RLModule API.
        config	frameworktorchz,Framework must be torch to use DirectMethod.type)r   r   
estimate_vz'self.model must implement `estimate_v`!N )super__init__hasattrr   getpopr
   model)selfr   r   r   r   	model_cls	__class__r   ^/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/offline/estimators/direct_method.pyr   '   s$   
zDirectMethod.__init__episodereturnc                 C   s\   i }|d }d}t |jD ]}||| | j|  7 }q| |d d }||d< ||d< |S )Nrewardsr      
v_behaviorv_target)rangecountr   _compute_v_target)r$   r)   estimates_per_epsioder+   r-   tr.   r   r   r(   estimate_on_single_episodeS   s   z'DirectMethod.estimate_on_single_episodebatchc                 C   s.   i }|d }|}|  |}||d< ||d< |S )Nr+   r-   r.   )r1   )r$   r5   r2   r+   r-   r.   r   r   r(   estimate_on_single_step_samplesc   s   
z,DirectMethod.estimate_on_single_step_samplesc                 C   s   | j |}t|}|S )N)r#   r   r   )r$   	init_stepr.   r   r   r(   r1   r   s   zDirectMethod._compute_v_targetc                 C   s"   t |}| j|}dt|iS )zTrains self.model on the given batch.

        Args:
            batch: A SampleBatchType to train on

        Returns:
            A dict with key "loss" and value as the mean training loss.
        loss)r   r#   trainnpmean)r$   r5   lossesr   r   r(   r9   w   s   
zDirectMethod.train.)n_parallelismdatasetr=   c          	      C   s|   t | | d}|jt|d| jj| j ddd}|d}|d}|| }|d| t	
|  }||||dS )	ao  Calculates the Direct Method estimate on the given dataset.

        Note: This estimate works for only discrete action spaces for now.

        Args:
            dataset: Dataset to compute the estimate on. Each record in dataset should
                include the following columns: `obs`, `actions`, `action_prob` and
                `rewards`. The `obs` on each row shoud be a vector of D dimensions.
            n_parallelism: The number of parallel workers to use.

        Returns:
            Dictionary with the following keys:
                v_target: The estimated value of the target policy.
                v_behavior: The estimated value of the behavior policy.
                v_gain: The estimated gain of the target policy over the behavior
                    policy.
                v_std: The standard deviation of the estimated value of the target.
        r,   pandasF)model_classmodel_statecompute_q_values)
batch_sizebatch_format	fn_kwargsr+   v_values)r-   r.   v_gain_mean
v_gain_ste)maxr0   map_batchesr   r#   r'   	get_stater;   stdmathsqrt)	r$   r>   r=   rC   
updated_dsr-   r.   rG   rH   r   r   r(   estimate_on_dataset   s(   

z DirectMethod.estimate_on_dataset)r   N)__name__
__module____qualname____doc__r   r   r   floatr   r   r   r   strr   r4   r   r6   r1   r   r9   r	   r   intrP   __classcell__r   r   r&   r(   r      sD    +
r   )"loggingtypingr   r   r   r   rM   numpyr:   ray.datar   1ray.rllib.offline.estimators.off_policy_estimatorr   *ray.rllib.offline.offline_evaluation_utilsr   #ray.rllib.offline.offline_evaluatorr	   ,ray.rllib.offline.estimators.fqe_torch_modelr
   ray.rllib.policyr   ray.rllib.policy.sample_batchr   r   ray.rllib.utils.annotationsr   r   ray.rllib.utils.typingr   ray.rllib.utils.numpyr   	getLoggerloggerr   r   r   r   r(   <module>   s$    