o
    ci$                     @   sx   d Z ddlmZ ddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ dZdZG d	d
 d
Zede
fddZdS )z
[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
Luo et al. 2020
https://arxiv.org/pdf/1912.00167
    )dequeN)ModelCatalog)ModelV2)OldAPIStackfunctarget_funcc                   @   sL   e Zd ZdZdedefddZdd Zdd	 Zed
d Z	defddZ
dS )CircularBufferaJ  A circular batch-wise buffer as described in [1] for APPO.

    The buffer holds at most N batches, which are sampled at random (uniformly).
    If full and a new batch is added, the oldest batch is discarded. Also, each batch
    currently in the buffer can be sampled at most K times (after which it is also
    discarded).
    num_batchesiterations_per_batchc                 C   sj   || _ || _| j | j | _d| _tdd t| jD | jd| _t | _| j| _	t
 | _tj | _d S )Nr   c                 S   s   g | ]}d qS N ).0_r   r   S/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/algorithms/appo/utils.py
<listcomp>'   s    z+CircularBuffer.__init__.<locals>.<listcomp>)maxlen)r	   r
   _NxK
_num_addedr   range_bufferset_indices_offset	threadingLock_locknprandomdefault_rng_rng)selfr	   r
   r   r   r   __init__   s    
zCircularBuffer.__init__c                 C   s   | j : | jd }t| jD ] }| j| | j| j | j| j| j	  |  jd7  _q|  j
d7  _
W d    n1 s@w   Y  d}|d urO| }|S )Nr      )r   r   r   r
   appendr   addr   discardr   r   	env_steps)r    batchdropped_entryr   
dropped_tsr   r   r   r$   .   s   

zCircularBuffer.addc                 C   s   t | dkrtd t | dks| j? | jt| j}|| j | j	 }| j
| }|d us@J ||| j| jdd | j
D fd | j
|< | j| W d    |S 1 sVw   Y  |S )Nr   g-C6?c                 S   s   g | ]}|d u qS r   r   )r   br   r   r   r   O   s    z)CircularBuffer.sample.<locals>.<listcomp>)lentimesleepr   r   choicelistr   r   r   r   r%   )r    idxactual_buffer_idxr'   r   r   r   sample@   s(   




zCircularBuffer.samplec                 C   s6   | j  | j| jkW  d   S 1 sw   Y  dS )zIWhether the buffer has been filled once with at least `self.num_batches`.N)r   r   r	   r    r   r   r   filledW   s   
$zCircularBuffer.filledreturnc                 C   s4   | j  t| jW  d   S 1 sw   Y  dS )zIReturns the number of actually valid (non-expired) batches in the buffer.N)r   r+   r   r3   r   r   r   __len__]   s   $zCircularBuffer.__len__N)__name__
__module____qualname____doc__intr!   r$   r2   propertyr4   r6   r   r   r   r   r      s    
r   r5   c                 C   s~   t | j| jd \}}t j| j| j|| jd t| jd| _| j	 | _
t j| j| j|| jd t| jd| _| j	 | _| jS )zBuilds model and target model for APPO.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target model will not be returned, just assigned to
            `policy.target_model`.
    model)name	framework)r   get_action_distaction_spaceconfigget_model_v2observation_spacePOLICY_SCOPEr?   r=   	variablesmodel_variablesTARGET_POLICY_SCOPEtarget_modeltarget_model_variables)policyr   	logit_dimr   r   r   make_appo_modelsc   s,   
rM   )r:   collectionsr   r   r,   numpyr   ray.rllib.models.catalogr   ray.rllib.models.modelv2r   ray.rllib.utils.annotationsr   rE   rH   r   rM   r   r   r   r   <module>   s    N