import collections
import platform
import random
from typing import Optional

from ray.util.timer import _Timer
from ray.rllib.execution.replay_ops import SimpleReplayBuffer
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, concat_samples
from ray.rllib.utils.annotations import OldAPIStack
from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ReplayMode
from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES
from ray.rllib.utils.typing import PolicyID, SampleBatchType


@OldAPIStack
class MixInMultiAgentReplayBuffer:
    """This buffer adds replayed samples to a stream of new experiences.

    - Any newly added batch (`add()`) is immediately returned upon
    the next `replay` call (close to on-policy) as well as being moved
    into the buffer.
    - Additionally, a certain number of old samples is mixed into the
    returned sample according to a given "replay ratio".
    - If >1 calls to `add()` are made without any `replay()` calls
    in between, all newly added batches are returned (plus some older samples
    according to the "replay ratio").

    .. testcode::

        from ray.rllib.execution.buffers.mixin_replay_buffer import (
            MixInMultiAgentReplayBuffer)
        from ray.rllib.policy.sample_batch import SampleBatch
        # replay ratio 0.66 (2/3 replayed, 1/3 new samples):
        buffer = MixInMultiAgentReplayBuffer(capacity=100,
                                             replay_ratio=0.66)
        A, B, C = (SampleBatch({"obs": [1]}), SampleBatch({"obs": [2]}),
            SampleBatch({"obs": [3]}))
        buffer.add(A)
        buffer.add(B)
        buffer.add(C)
        print(buffer.replay()["obs"])

    .. testoutput::
        :hide:

        ...
    """

    def __init__(
        self,
        capacity: int,
        replay_ratio: float,
        replay_mode: ReplayMode = ReplayMode.INDEPENDENT,
    ):
        """Initializes MixInReplay instance.

        Args:
            capacity: Number of batches to store in total.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
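
                Internally, this ratio is converted into a replay
                proportion of ``replay_ratio / (1.0 - replay_ratio)``,
                e.g. 0.66 -> ~2.0, meaning roughly two old (replayed)
                samples are mixed in per newly added one.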
        """
        self.capacity = capacity
        self.replay_ratio = replay_ratio
        self.replay_proportion = None
        if self.replay_ratio != 1.0:
            self.replay_proportion = self.replay_ratio / (1.0 - self.replay_ratio)

        if replay_mode in ["lockstep", ReplayMode.LOCKSTEP]:
            self.replay_mode = ReplayMode.LOCKSTEP
        elif replay_mode in ["independent", ReplayMode.INDEPENDENT]:
            self.replay_mode = ReplayMode.INDEPENDENT
        else:
            raise ValueError("Unsupported replay mode: {}".format(replay_mode))

        def new_buffer():
            return SimpleReplayBuffer(num_slots=capacity)

        # One replay buffer per policy (or a single shared one under the
        # `_ALL_POLICIES` key in lockstep mode).
        self.replay_buffers = collections.defaultdict(new_buffer)

        # Metrics.
        self.add_batch_timer = _Timer()
        self.replay_timer = _Timer()
        self.update_priorities_timer = _Timer()

        # Number of timesteps added over this buffer's lifetime.
        self.num_added = 0

        # Batches added since the last `replay()` call; these are returned
        # with the next `replay()` on top of older, replayed samples.
        self.last_added_batches = collections.defaultdict(list)

    def add(self, batch: SampleBatchType) -> None:
        """Adds a batch to the appropriate policy's replay buffer.

        Turns the batch into a MultiAgentBatch of the DEFAULT_POLICY_ID if
        it is not a MultiAgentBatch. Subsequently adds the individual policy
        batches to the storage.

        Args:
            batch: The batch to be added.
        """
        # Make a copy so that the replay buffer doesn't pin plasma memory.
        batch = batch.copy()
        # Handle everything as if multi-agent.
        batch = batch.as_multi_agent()

        with self.add_batch_timer:
            if self.replay_mode == ReplayMode.LOCKSTEP:
                # Lockstep mode: Store the entire batch under the
                # `_ALL_POLICIES` key so all policies' experiences are
                # replayed together.
                self.replay_buffers[_ALL_POLICIES].add_batch(batch)
                self.last_added_batches[_ALL_POLICIES].append(batch)
            else:
                # Independent mode: Store each policy's sub-batch in its
                # own buffer.
                for policy_id, sample_batch in batch.policy_batches.items():
                    self.replay_buffers[policy_id].add_batch(sample_batch)
                    self.last_added_batches[policy_id].append(sample_batch)
        self.num_added += batch.count

    def replay(
        self, policy_id: PolicyID = DEFAULT_POLICY_ID
    ) -> Optional[SampleBatchType]:
        """Returns a batch mixing new samples with older, replayed ones.

        Args:
            policy_id: The ID of the policy to replay samples for.

        Returns:
            The most recently added samples, mixed with replayed ones
            according to `replay_ratio`, or None if no samples are
            available yet.
        """
        if self.replay_mode == ReplayMode.LOCKSTEP and policy_id != _ALL_POLICIES:
            raise ValueError(
                "Trying to sample from single policy's buffer in lockstep "
                "mode. In lockstep mode, all policies' experiences are "
                "sampled from a single replay buffer which is accessed "
                "with the policy id `{}`".format(_ALL_POLICIES)
            )

        buffer = self.replay_buffers[policy_id]
        # Return None, if:
        # - The buffer is empty, or
        # - `replay_ratio` < 1.0 (meaning new samples are required in the
        #   returned batch) and there are no new samples to mix in.
        if len(buffer) == 0 or (
            len(self.last_added_batches[policy_id]) == 0 and self.replay_ratio < 1.0
        ):
            return None

        # Mix the buffer's most recently added batches with older,
        # replayed ones.
        with self.replay_timer:
            output_batches = self.last_added_batches[policy_id]
            self.last_added_batches[policy_id] = []

            # No replay desired -> Return only the new batches.
            if self.replay_ratio == 0.0:
                return concat_samples(output_batches)
            # Only replay desired -> Return a replayed sample from the
            # buffer.
            elif self.replay_ratio == 1.0:
                return buffer.replay()

            # Replay ratio = old / [old + new]
            # Replay proportion: old / new
            num_new = len(output_batches)
            replay_proportion = self.replay_proportion
            while random.random() < num_new * replay_proportion:
                replay_proportion -= 1
                output_batches.append(buffer.replay())
            return concat_samples(output_batches)

    def get_host(self) -> str:
        """Returns the computer's network name.

        Returns:
            The computer's network name or an empty string, if the network
            name could not be determined.
        """
        return platform.node()
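

# Minimal usage sketch (not part of the original module): demonstrates the
# lockstep replay mode, in which all policies' experiences are stored
# together and must be sampled via the `_ALL_POLICIES` key. Guarded by
# `__main__` so importing this module stays side-effect free.
if __name__ == "__main__":
    from ray.rllib.policy.sample_batch import SampleBatch

    buf = MixInMultiAgentReplayBuffer(
        capacity=10, replay_ratio=0.5, replay_mode=ReplayMode.LOCKSTEP
    )
    buf.add(SampleBatch({"obs": [1]}))
    buf.add(SampleBatch({"obs": [2]}))
    # In lockstep mode, `replay()` raises unless called with `_ALL_POLICIES`.
    print(buf.replay(policy_id=_ALL_POLICIES))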