o
    ci$                     @   s   d dl Zd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ e \Z Z!Z"e \Z#Z$eG dd deZ%dS )    N)UnionOptional)TorchMultiActionDistribution)ActionDistribution)overrideOldAPIStack)Exploration
TensorType)try_import_tftry_import_torchget_variable)from_config)convert_to_numpy)SchedulePiecewiseSchedule)	FLOAT_MINc                       s  e Zd ZdZdddeddddejjd	ed
e	de	dedede
e f fddZeedddedeeef de
eeef  fddZdedeeef deeef ddfddZdededeeef ddfddZeed&de
d  fd!d"Zeed&d#ede
d  ddfd$d%Z  ZS )'EpsilonGreedya
  Epsilon-greedy Exploration class that produces exploration actions.

    When given a Model's output and a current epsilon value (based on some
    Schedule), it produces a random action (if rand(1) < eps) or
    uses the model-computed one (if rand(1) >= eps).
    g      ?g?r   g     j@N)initial_epsilonfinal_epsilonwarmup_timestepsepsilon_timestepsepsilon_scheduleaction_space	frameworkr   r   r   r   r   c          	         s   |dusJ t  jd	||d| tt||dp+td|f||f|| |fg|| jd| _tt	dtj
|dtj
d| _| jdkrH|  | _dS dS )
aT  Create an EpsilonGreedy exploration class.

        Args:
            action_space: The action space the exploration should occur in.
            framework: The framework specifier.
            initial_epsilon: The initial epsilon value to use.
            final_epsilon: The final epsilon value to use.
            warmup_timesteps: The timesteps over which to not change epsilon in the
                beginning.
            epsilon_timesteps: The timesteps (additional to `warmup_timesteps`)
                after which epsilon should always be `final_epsilon`.
                E.g.: warmup_timesteps=20k epsilon_timesteps=50k -> After 70k timesteps,
                epsilon will reach its final value.
            epsilon_schedule: An optional Schedule object
                to use (instead of constructing one from the given parameters).
        N)r   r   )r   r   )	endpointsoutside_valuer   timestep)r   tf_namedtypetf )super__init__r   r   r   r   r   r   nparrayint64last_timestep	get_state_tf_state_op)	selfr   r   r   r   r   r   r   kwargs	__class__r    ^/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/utils/exploration/epsilon_greedy.pyr"      s.   

zEpsilonGreedy.__init__T)exploreaction_distributionr   r.   c                C   s&   | j dv r| |||S | |||S )N)tf2r   )r   _get_tf_exploration_action_op_get_torch_exploration_action)r)   r/   r   r.   r    r    r-   get_exploration_actionU   s   
	z$EpsilonGreedy.get_exploration_actionreturnz	tf.Tensorc           
         sj  |j }| |dur|n| j}tj|ddt|d }tt|tjj	t
|tjj	 t
|}tjtj|dddtjjt|gddtjd|k  tjt|tr`tj|tjdn| fddfd	dd
}| jdkr| jd s|| _|tj|tjdfS t| jt|tj}	t|	g |tj|tjdfW  d   S 1 sw   Y  dS )a!  TF method to produce the tf op for an epsilon exploration action.

        Args:
            action_distribution: The instantiated ActionDistribution object
                to work with when creating exploration actions.

        Returns:
            The tf exploration-action op.
        N   axisr   )minvalmaxvalr   r   c                      s   t  S N)r   wherer    chose_randomexploit_actionrandom_actionsr    r-   <lambda>   s    z=EpsilonGreedy._get_tf_exploration_action_op.<locals>.<lambda>c                      s    S r;   r    r    )r?   r    r-   rA      s    )predtrue_fnfalse_fnr0   eager_tracing)inputsr   r&   r   argmaxshaper<   equalfloat32min	ones_likesqueezerandomcategoricaluniformstackcond
isinstanceboolconstantr   policy_config
zeros_liketf1assigncastr%   control_dependencies)
r)   r/   r.   r   q_valuesepsilon
batch_sizerandom_valid_action_logitsaction	assign_opr    r=   r-   r1   g   sD   
$z+EpsilonGreedy._get_tf_exploration_action_opztorch.Tensorc                 C   s*  |j }|| _| }| d }tj|tjd}|r| | j}t|t	r^t
|}t|D ]$}	t |k rRt
| j }
tt|D ]}t|
| || |	< qDq.t
|j|}||fS t|tkt|d t|}tjt|ddd}tt|f | j|k ||}||fS ||fS )a   Torch method to produce an epsilon exploration action.

        Args:
            action_distribution: The instantiated
                ActionDistribution object to work with when creating
                exploration actions.

        Returns:
            The exploration-action.
        r   r:   g        r5   r6   )rF   r&   deterministic_samplesizetorchzerosfloatr   rS   r   treeflattenrangerN   r   samplelentensorunflatten_asaction_space_structr<   r   rL   rM   multinomialemptyuniform_todevice)r)   r/   r.   r   r\   r?   r^   action_logpr]   irandom_actionjr_   r@   r`   r    r    r-   r2      sD   

z+EpsilonGreedy._get_torch_exploration_actionsessz
tf.Sessionc                 C   sR   |r| | jS | | j}| jdkrt|n|| jdkr$t| jdS | jdS )Nr   )cur_epsilonr&   )runr(   r   r&   r   r   )r)   rx   epsr    r    r-   r'      s   
zEpsilonGreedy.get_statestatec                 C   sP   | j dkr| jj|d |d d S t| jtr|d | _d S | j|d  d S )Nr   r&   )session)r   r&   loadrS   intrY   )r)   r|   rx   r    r    r-   	set_state   s
   
zEpsilonGreedy.set_stater;   )__name__
__module____qualname____doc__r   gymspacesSpacestrrf   r   r   r"   r   r   r   r   r	   rT   r3   r1   r2   r'   dictr   __classcell__r    r    r+   r-   r      sj    	7



;

A&r   )&	gymnasiumr   numpyr#   rg   rN   typingr   r   (ray.rllib.models.torch.torch_action_distr   ray.rllib.models.action_distr   ray.rllib.utils.annotationsr   r   'ray.rllib.utils.exploration.explorationr   r	   ray.rllib.utils.frameworkr
   r   r   ray.rllib.utils.from_configr   ray.rllib.utils.numpyr   ray.rllib.utils.schedulesr   r   ray.rllib.utils.torch_utilsr   rX   r   tfvrd   _r   r    r    r    r-   <module>   s$    
