o
    Ti1                     @   sh   d dl mZ d dlZd dlmZ d dlmZ ddlmZ ddl	m
Z
 ded	efd
dZG dd dZdS )    )OptionalN)get_accelerator)RaggedUtilsBuilder   )DSSequenceDescriptor)DSStateManagerConfigoriginal_sizereturnc                 C   s0   dt dt dt fdd}| dkrdnd}|| |S )	z0
    Pad to a backend friendly granularity.
    val	pow_2_valr	   c                 S   s   | |d  |d  @ S )Nr    )r
   r   r   r   `/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/ragged/ragged_wrapper.py_pad_to_mul_of_pow2   s   z&to_padded.<locals>._pad_to_mul_of_pow2i   @      )int)r   r   granularityr   r   r   	to_padded   s   
r   c                   @   s  e Zd ZU dZejed< ejed< 	 ejed< ejed< 	 ejed< ejed< 	 ejed< ejed	< 	 ejed
< ejed< 	 deddfddZd/ddZ	d0de
dejddfddZedejfddZd1dee ddfddZd0dedejfdd Zd0dedejfd!d"Zd0dedejfd#d$Zd0dedejfd%d&Zd0dedejfd'd(Zd0dedeej fd)d*Zedefd+d,Zedefd-d.ZdS )2RaggedBatchWrapperaY  
    Container for all the auxiliary Tensors used in the management of a ragged batch.

    For each Tensor, we maintain a shadow Tensor on the host. This Tensor is what is
    directly populated when constructing the ragged batch. The shadow Tensors, when possible,
    should be allocated so as to support fast host-to-accelerator copies.
    _input_ids_shadow
_input_ids_batch_metadata_storage_batch_metadata_storage_shadow_token_to_seq_storage_token_to_seq_storage_shadow_inflight_seq_descriptors _inflight_seq_descriptors_shadow_kv_ptrs_kv_ptrs_shadowconfigr	   Nc                 C   s  || _ tj| j jtjt  d| _tjdtjt  d| _	tj| j jtjt  d| _
tj| j jdftjt  d| _tj| j jtjt  d| _t  | _| jj}|| j| _|| j	| _|| j
| _|| j| _|| j| _d| _d| _d| _g | _g | _g | _g | _dS )a'  
        Convenience wrapper around the data structures used to represent a ragged
        batch for inference. Only a single `RaggedBatchWrapper` should be used per
        ragged inference engine.

        The underlying data structures are implemented in `ragged_batch_descriptor.h`.
        )dtypedevice      Fr   N)_configtorchzerosmax_ragged_batch_sizeint64r   current_devicer   int32r   r   max_ragged_sequence_countr   r   r   load_utils_moduleallocate_fast_host_bufferr   r   r   r   r   
_is_padded_current_tokens_current_sequences_batch_tokens$_inflight_seq_descriptors_shadow_buf_kv_blocks_ptr_buf _token_to_seq_storage_shadow_buf)selfr   
host_allocr   r   r   __init__O   s@   



zRaggedBatchWrapper.__init__c                 C   s(   d| _ d| _g | _g | _g | _g | _dS )zb
        Clear the ragged batch. This will reset the number of tokens and sequences to 0.
        r   N)r0   r1   r2   r3   r4   r5   r6   r   r   r   clear{   s   
zRaggedBatchWrapper.clearTseq_descriptortokensc                 C   s   |j t dkrtd|j  d|r#| j| jjkr#td| jj | }|r<| j| | jjkr<td| jj d| j	
| | j
| j | j
| | j
|j | j
d | j| jg|  | j
|j |  j|7  _|  jd7  _d	S )
z
        Incrementally insert a sequence into the ragged batch. This will update the
        metadata for the ragged batch and the sequence.

        Arguments:
            seq_descriptor ()
        cpuz0Expected tokens to be on host but found device ''z,Ragged batch is full due to sequence limit: z,Ragged batch is full due to capacity limit: )r   r   N)r!   r%   RuntimeErrorcurrent_sequencesr$   r+   numelcurrent_tokensr'   r2   appendr3   seen_tokensr5   extendr4   kv_blocks_ptrr0   r1   )r6   r;   r<   	do_checks
seq_tokensr   r   r   insert_sequence   s    z"RaggedBatchWrapper.insert_sequencec                 C   s   | j }| jr
t|S |S z
        The number of tokens in the in-flight ragged batch. This will not trigger
        synchronization with the device.
        )rC   r/   r   )r6   cur_toksr   r   r   tensor_toks   s   zRaggedBatchWrapper.tensor_toksFpaddingc                 C   s  | j }| j dt| j t| j | jd| j  tj	| j
dd | jdt| j t| j | jdt| j t| j | jt|| jg |rst|}| j|| d | j|| d d| _n|}d| _| j}dtjdtjd	dfd
d}|| jd| | jd|  || j| j || jd| | jd|  || jd| | jd|  || jd| | jd|  dS )zh
        Completes construction of the ragged batch by flushing the host buffers to the device.
        Nr   )dimTFdstsrcr	   c                 S   s   | j |dd d S )NT)non_blocking)copy_)rQ   rR   r   r   r   _noblock_copy   s   z2RaggedBatchWrapper.finalize.<locals>._noblock_copy)rC   r   flattenlenr3   rT   r%   tensorr   catr2   r   r5   r   r4   r   rA   r   fill_r/   Tensorr   r   r   r   r   )r6   rN   rL   padded_toksrA   rU   r   r   r   finalize   s4   
"
""zRaggedBatchWrapper.finalize	on_devicec                 C      |r
| j d| j S | jS )z
        The input ids tensor for the ragged batch. If the device Tensor is requested, the Tensor
        is truncated to the number of tokens in the batch.
        N)r   rM   r   r6   r^   r   r   r   	input_ids      zRaggedBatchWrapper.input_idsc                 C   s   |r| j S | jS )z
        Buffer associated with the batch metadata tensor that can
        be populated in preparation for passing a new input to the device.
        )r   r   r`   r   r   r   batch_metadata_buffer   s   z(RaggedBatchWrapper.batch_metadata_bufferc                 C   r_   )z
        Mapping of token to which sequence it belongs to in the ragged batch. If the device Tensor
        is requested, the Tensor is truncated to the number of tokens in the batch.
        N)r   rM   r   r`   r   r   r   tokens_to_seq   rb   z RaggedBatchWrapper.tokens_to_seqc                 C   r_   )z
        Buffer associated with the metadata of each sequence in the ragged batch. If the device Tensor
        is requested, the Tensor is truncated to the number of sequences in the batch.
        N)r   rA   r   r`   r   r   r   inflight_seq_descriptors   rb   z+RaggedBatchWrapper.inflight_seq_descriptorsc                 C   r_   )z
        Pointer to where the list of KV ids associated with a sequence are. If the device Tensor
        is requested, the Tensor is truncated to the number of sequences in the batch.
        N)r   rA   r   r`   r   r   r   kv_ptrs  rb   zRaggedBatchWrapper.kv_ptrsc                 C   s   dS )z
        Placeholder for supporting complex masks. Currently not supported.

        Models that will need this will be BERT-like, not generative.
        Nr   r`   r   r   r   masks  s   zRaggedBatchWrapper.masksc                 C      | j S rK   )r0   r9   r   r   r   rC        z!RaggedBatchWrapper.current_tokensc                 C   rh   )z
        The number of sequences in the in-flight ragged batch. This will not trigger
        synchronization with the device.
        )r1   r9   r   r   r   rA     ri   z$RaggedBatchWrapper.current_sequences)r	   N)T)F)__name__
__module____qualname____doc__r%   r[   __annotations__r   r8   r:   r   rJ   propertyrM   r   boolr]   ra   rc   rd   re   rf   rg   r   rC   rA   r   r   r   r   r      sB   
 
	









,&$




r   )typingr   r%   deepspeed.acceleratorr   deepspeed.ops.op_builderr   sequence_descriptorr   manager_configsr   r   r   r   r   r   r   r   <module>   s   