o
    Ti*                     @   sN   d dl mZmZmZ d dlZG dd dZG dd deZG dd deZdS )	    )ListTupleUnionNc                   @   sN   e Zd ZedefddZeddedefddZeddedefdd	Zd
S )BaseSequenceDescriptorreturnc                 C      t  )z\
        The number of tokens for this sequence that have completed a forward pass.
        NotImplementedErrorself r   e/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/ragged/sequence_descriptor.pyseen_tokens      z"BaseSequenceDescriptor.seen_tokensr   cache_groupc                 C   r   )zP
        The number of KV blocks currently allocated for this sequence.
        r   r   r   r   r   r   cur_allocated_blocks   r   z+BaseSequenceDescriptor.cur_allocated_blocksc                 C   r   )zA
        The pointer to the KV blocks for this sequence.
        r   r   r   r   r   kv_blocks_ptr   r   z$BaseSequenceDescriptor.kv_blocks_ptrNr   )__name__
__module____qualname__propertyintr   r   r   r   r   r   r   r      s    r   c                   @   s^   e Zd ZdZddddZedefddZedd	edefd
dZedd	edefddZ	dS )PlaceholderSequenceDescriptorz
    The DummySequenceDescriptor is an empty object that allows us to perform schedulability
    checks before formally tracking a sequence.
    r   r   Nc                 C   s   || _ || _|| _d S N)_seen_tokens_cur_allocated_blocks_kv_blocks_ptr)r   r   r   r   r   r   r   __init__)   s   
z&PlaceholderSequenceDescriptor.__init__c                 C      | j S r   r   r
   r   r   r   r   .      z)PlaceholderSequenceDescriptor.seen_tokensr   c                 C   r    r   )r   r   r   r   r   r   2   r"   z2PlaceholderSequenceDescriptor.cur_allocated_blocksc                 C   r    r   )r   r   r   r   r   r   6   r"   z+PlaceholderSequenceDescriptor.kv_blocks_ptr)r   r   r   r   Nr   )
r   r   r   __doc__r   r   r   r   r   r   r   r   r   r   r   #   s    r   c                   @   s  e Zd ZU eed< 	 eed< 	 eed< 	 eedf ed< 	 eejdf ed< 	 eejdf ed< eejdf ed< 	 eed	< 	
d2dedeejdf deejdf deddf
ddZ	e
defddZe
defddZe
defddZe
defddZe
d3dedefddZd4ded edejfd!d"Ze
d3dedefd#d$Zd3dedejfd%d&Zd'eddfd(d)Zd5d*d+Zd3d,eeej ejf deddfd-d.Zd3d/eeej ejf deddfd0d1ZdS )6DSSequenceDescriptorr   _in_flight_tokens_max_context._num_allocation_groups_blocks_per_allocation_group_kv_cache_ids_kv_cache_ids_shadow_tracking_idtracking_idkv_cache_idskv_cache_ids_shadowmax_contextr   Nc                 C   s   || _ || _|| _|| _t|| _d| _d| _tdd |D | _	tdd | j	D | _
t|D ]\}}| j	| |jd ks@J t|jdksIJ q0dS )a  
        Create the metadata to track a single sequence in the system.

        Arguments:
            tracking_id (int): The slot in the tracking buffers used to track this sequence.
            kv_cache_ids (Tuple[torch.Tensor, ...]): The KV-cache IDs for the sequence. The shape
                of the tensor should be [num_allocation_groups, max_blocks_per_allocation_group].
                There should be one tensor per cache group.
            kv_cache_ids_shadow (Tuple[torch.Tensor, ...]): The shadow tensor for the KV-cache IDs.
                This tensor should be allocated on the host and should have the same shape as the
                tensor provided in ``kv_cache_ids``. There should be one tensor per cache group.
            max_context (int): The maximum number of tokens this sequence may eventually include.
                Currently unused but may be used in future implementations for speculative caching.
        r   c                 s   s    | ]}|j d  V  qdS )r   N)shape).0r0   r   r   r   	<genexpr>~   s    z0DSSequenceDescriptor.__init__.<locals>.<genexpr>c                 s   s"    | ]}t j|t jd dV  qdS )cpu)dtypedeviceN)torchzerosint32)r3   
num_groupsr   r   r   r4      s    
   N)r,   r*   r+   r'   len_n_cache_groupsr   r&   tupler(   r)   	enumerater2   )r   r.   r/   r0   r1   r   r   r   r   r   b   s"   


zDSSequenceDescriptor.__init__c                 C   r    )zV
        Number of tokens in the sequence that have completed a forward pass.
        r!   r
   r   r   r   r      r   z DSSequenceDescriptor.seen_tokensc                 C   r    )z[
        Number of tokens that have begun a forward pass but not yet completed it.
        r&   r
   r   r   r   in_flight_tokens   r   z%DSSequenceDescriptor.in_flight_tokensc                 C   r    )zO
        Maximum number of tokens for this sequence. Currently unused.
        )r'   r
   r   r   r   r1      r   z DSSequenceDescriptor.max_contextc                 C   r    )zV
        Return the slot in the tracking buffers used to track this sequence.
        )r,   r
   r   r   r   r.      r   z DSSequenceDescriptor.tracking_idr   r   c                 C   s.   t | jdkr| jd  S | j|   S )z
        Returns the number of blocks currently allocated for this sequence in the specified cache group.

        Arguments:
            cache_group (int): The cache group to query.
           r   )r=   r)   itemsumr   r   r   r   r      s   
z)DSSequenceDescriptor.cur_allocated_blocksF	on_devicec                 C   s   |r| j | S | j| S )a=  
        Returns the Tensor containing the block IDs for this sequence on the appropriate device
        for the specified cache group.

        Arguments:
            cache_group (int): The cache group to query.
            on_device (bool): Whether or not to return the Tensor on the device or on the host.
        )r*   r+   )r   r   rF   r   r   r   r/      s   	

z!DSSequenceDescriptor.kv_cache_idsc                 C   s   | j |  S )z
        Get the device pointer to the base of the KV-cache ids for the specified cache group and
        sequence.

        Arguments:
            cache_group (int): The cache group to query.
        )r*   data_ptrr   r   r   r   r      s   	z"DSSequenceDescriptor.kv_blocks_ptrc                 C   s@   g }t | j| | j| D ]\}}||d|  qt|S )z
        Return the Tensor containing all block IDs for this sequence in the specified cache group.

        Arguments:
            cache_group (int): The cache group to query.
        N)zipr*   r)   appendr8   cat)r   r   	block_idsallocation_group
num_blocksr   r   r   all_block_ids   s   

z"DSSequenceDescriptor.all_block_ids
num_tokensc                 C   s
   || _ dS )z
        Update the state of the sequence before a forward pass.

        Arguments:
            num_tokens (int): The number of tokens in the sequence that will be executed during the
                next forward pass of the model.
        NrA   )r   rO   r   r   r   pre_forward   s   
z DSSequenceDescriptor.pre_forwardc                 C   s   |  j | j7  _ d| _dS )a  
        Update the state of the sequence after a forward pass. This should be called after the forward
        pass completes. NOTE: due to the asynchronous nature of the accelerator, this may be called
        before the forward pass completes on the device itself.
        r   N)r   r&   r
   r   r   r   post_forward   s   
z!DSSequenceDescriptor.post_forwardnew_idsc           	      C   s   t |tjr	|g}t|| j| kr!tdt| d| j|  t|D ]K\}}| }|dkr2q%| j| | }| j	| | }| j
| | }||||  | ||||  j||||  dd | j
| |  |7  < q%dS )a  
        Extend the KV-cache for the sequence.

        Arguments:
            new_ids (Union[List[torch.IntTensor], torch.IntTensor]): For each allocation group, the IDs
                to add to the KV-cache. If there is only one allocation group, a single tensor can be
                provided. Otherwise, a list of tensors should be provided. The tensors do not need
                to have the same shape.
        zOnly z& allocation groups provided, expected r   T)non_blockingN)
isinstancer8   Tensorr=   r(   
ValueErrorr@   numelr+   r*   r)   copy_)	r   rR   r   group_idnew_group_ids
new_blocksshadow_alloc_groupalloc_group
cur_blocksr   r   r   extend_kv_cache   s*   
z$DSSequenceDescriptor.extend_kv_cachefree_idsc                 C   s   t d)a  
        Free blocks from the KV-cache for the sequence.

        Arguments:
            free_ids (Union[List[torch.IntTensor], torch.IntTensor]): The ids of blocks to free
                from the KV-cache. If there is only one allocation group, a single tensor can be
                provided. Otherwise, a list of tensors should be provided. The tensors do not need
                to have the same shape.
        z.Partial KV-cache freeing is not yet supported.r   )r   r`   r   r   r   r   free_kv_cache  s   
z"DSSequenceDescriptor.free_kv_cache)r-   r   )r   Fr#   )r   r   r   r   __annotations__r   r8   	IntTensorrU   r   r   r   rB   r1   r.   r   boolr/   r   rN   rP   rQ   r   r   r_   ra   r   r   r   r   r%   ;   sZ   
 
%

(	,#r%   )typingr   r   r   r8   r   r   r%   r   r   r   r   <module>   s
   