o
    ci                     @   s   d dl Zd dlZd dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ er:d dlmZ d dlmZ e		dd
ejded deeef dedejf
ddZed
ejdeeef ded dejfddZed
ejdejfddZdS )    N)AnyDictTypeTYPE_CHECKING)SampleBatch)Policy)convert_to_numpy)DeveloperAPI)FQETorchModel)OffPolicyEstimatorTbatchmodel_classr
   model_statecompute_q_valuesreturnc                 C   sx   | |}ttjt| tj tjt| tj di}||}t|}|| d< |r:|	|}t|}|| d< | S )a  Computes the Q and V values for the given batch of samples.

    This function is to be used with map_batches() to perform a batch prediction on a
    dataset of records with `obs` and `actions` columns.

    Args:
        batch: A sub-batch from the dataset.
        model_class: The model class to use for the prediction. This class should be a
            sub-class of FQEModel that implements the estimate_q() and estimate_v()
            methods.
        model_state: The state of the model to use for the prediction.
        compute_q_values: Whether to compute the Q values or not. If False, only the V
            is computed and returned.

    Returns:
        The modified batch with the Q and V values added as columns.
    v_valuesq_values)

from_stater   OBSnpvstackACTIONSsqueeze
estimate_vr   
estimate_q)r   r   r   r   modelsample_batchr   r    r   ^/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/offline/offline_evaluation_utils.pycompute_q_and_v_values   s   


r    policy_stateestimator_classr   c                 C   s   t |}||ddd}ttjt| d jtjt| d jdtj	t| d jdtj
t| d jdi}||}|tj	 }|tj
 }|| }	|	| }
|	| d< |
| d	< || d
< || d< | S )a%  Computes the importance sampling weights for the given batch of samples.

    For a lot of off-policy estimators, the importance sampling weights are computed as
    the propensity score ratio between the new and old policies
    (i.e. new_pi(act|obs) / old_pi(act|obs)). This function is to be used with
    map_batches() to perform a batch prediction on a dataset of records with `obs`,
    `actions`, `action_prob` and `rewards` columns.

    Args:
        batch: A sub-batch from the dataset.
        policy_state: The state of the policy to use for the prediction.
        estimator_class: The estimator class to use for the prediction. This class

    Returns:
        The modified batch with the importance sampling weights, weighted rewards, new
        and old propensities added as columns.
    r   )policygammaepsilon_greedyobsactionsr   action_probrewardsweightsweighted_rewardsnew_probold_prob)r   r   r   r   r   r   valuesr   r   ACTION_PROBREWARDScompute_action_probs)r   r!   r"   r#   	estimatorr   r,   r-   r)   r*   r+   r   r   r   compute_is_weights<   s&   



r3   c                 C   sJ   t jt jt jt jt jt jh}| jD ]}||v r"| | dd | |< q| S )a  Removes the time dimension from the given sub-batch of the dataset.

    If each row in a dataset has a time dimension ([T, D]), and T=1, this function will
    remove the T dimension to convert each row to of shape [D]. If T > 1, the row is
    left unchanged. This function is to be used with map_batches().

    Args:
        batch: The batch to remove the time dimension from.
    Returns:
        The modified batch with the time dimension removed (when applicable)
    c                 S   s   t | dkr
| d S | S )N   r   )len)xr   r   r   <lambda>   s    z!remove_time_dim.<locals>.<lambda>)	r   r   r   r/   r0   NEXT_OBSDONEScolumnsapply)r   BATCHED_KEYSkr   r   r   remove_time_dimk   s   
r>   )T)numpyr   pandaspdtypingr   r   r   r   ray.rllib.policy.sample_batchr   ray.rllib.policyr   ray.rllib.utils.numpyr   ray.rllib.utils.annotationsr	   ,ray.rllib.offline.estimators.fqe_torch_modelr
   1ray.rllib.offline.estimators.off_policy_estimatorr   	DataFramestrboolr    r3   r>   r   r   r   r   <module>   sF    
,
.