o
    Ti(                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlZd dlmZ	 d dl
mZ d dlmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ dZ G dd dZ!dS )    N)IterableTuple)get_accelerator)init_distributed   )InferenceV2Policy)inference_logger)DSStateManagerRaggedBatchWrapperPlaceholderSequenceDescriptor)SchedulingErrorSchedulingResult)make_param_filenamemake_metadata_filename)DSInferenceModelBase)RaggedInferenceEngineConfigzmodel-forward-inferencec                	   @   s  e Zd ZU eed< 	 eed< 	 eed< 	 edej	fddZ
edefddZdefd	d
ZdededdfddZdd Z	d'dee deej	 dedej	fddZdededeeej	f fddZdee dee defddZdedefd d!Zdeddfd"d#Zd$eddfd%d&ZdS )(InferenceEngineV2_config_model_state_managerreturnc                 C      | j jS )z
        Number of free KV blocks. This is a tensor of shape [n_kv_cache_groups] where each
        element is the number of free blocks in the corresponding KV cache group.
        )r   free_blocksself r   T/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/engine_v2.pyr   /   s   zInferenceEngineV2.free_blocksc                 C   r   )z,
        Number of KV cache groups.
        )r   n_kv_cache_groupsr   r   r   r   r   7   s   z#InferenceEngineV2.n_kv_cache_groupsc                 C   s   | j S )z+
        The model implementation.
        )r   r   r   r   r   model>   s   zInferenceEngineV2.modelpolicyengine_configNc                 C   s~   || _ || _|  | _t d | j| j | j| _t d t| j j	| _
t| j j	| j | jd| _| j| j dS )ab  
        Create the Inference V2 engine.

        Arguments:
            policy (InferenceV2Policy): Policy for the model implementation. This policy object
                will be used to build the model and load the checkpoint associated with it.
            engine_config (RaggedInferenceEngineConfig): Configuration for the inference engine.
        zBuilding model...zModel built.)base_mp_groupN)r   _policy_initialize_tp_group_base_mp_groupr   infobuild_modelr   r
   state_manager_batchr	   kv_cache_configr   set_state_manager)r   r   r    r   r   r   __init__D   s   	
zInferenceEngineV2.__init__c                 C   sV   t   ttdd}t | || jjjkrt	dt
t| jjj}tj|dS )z@
        Implementation of our TP group initialization.
        
LOCAL_RANKr   zILocal rank is greater than TP size, ensure that the TP config is correct.)ranks)r   intosgetenvr   
set_devicer   tensor_paralleltp_sizeRuntimeErrorlistrangedist	new_group)r   
local_rankr-   r   r   r   r#   ]   s   z&InferenceEngineV2._initialize_tp_groupT
batch_uidsbatch_tokens	do_checksc           
      C   s   |rdd |D }|  ||}|tjkrt|| j  t||D ]#\}}| j|}| j	
||  ||  | jj|||d q"| j  | j	| j | j	| j}	|	jd | jjksdJ |D ]}| j|}|  | j	| qf|	S )a   
        Put a ragged batch onto the inference engine. This will perform one forward and return
        a Tensor of the shape [len(batch_uids), *output_shape]. Logits for the non-final tokens
        are not calculated.

        Arguments:
            batch_uids: Iterable of uids for the batch on the host
            batch_tokens: Iterable of token tensors for the batch on the host
            do_checks: Check schedulability when it is set to True. You can skip this check for better performance when it has already been completed.
        c                 S   s   g | ]}t |qS r   )len).0tokensr   r   r   
<listcomp>{   s    z)InferenceEngineV2.put.<locals>.<listcomp>)r<   r   )can_scheduler   Successr   r(   clearzipr   get_or_create_sequencer   maybe_allocate_kvnumelpre_forwardinsert_sequencefinalizeprepare_batchforwardshapecurrent_sequencesget_sequencepost_forwardmaybe_free_kv)
r   r:   r;   r<   
token_lensschedule_checkuidr?   host_seq_desclogitsr   r   r   putk   s(   


zInferenceEngineV2.putrT   max_request_tokensc                 C   sL   | j |}|du r| j j| jjjkrdS t }| j|||\}}||fS )a  
        Determine the number of tokens and KV blocks to reserve for a given request. Given a UID
        (this UID may not be recognized by the model yet), this will return the number of tokens
        and blocks to reserve for the request.

        Arguments:
            uid (int): The UID of the sequence (as tracked by the scheduling entity). If
                this is a new sequence (with a UID unknown to the inference engine), then
                an empty placeholder is created to pass to the occupancy logic.
            n_tokens (int): The number of tokens to hypothetically send.

        Returns:
            Tuple[int, Optional[int]]: Tuple of free kv blocks and the number of blocks
                required to schedule the sequence.
        N)r   r   )	r   rO   n_tracked_sequencesr   r'   max_tracked_sequencesr   r   get_kv_requirements)r   rT   rX   max_request_blocksseq_desc
req_tokens
req_blocksr   r   r   query   s   zInferenceEngineV2.queryuidslengthsc                 C   s   | j j}| j j}d}d}t|| jjjkrtjS t	||D ]0\}}| j 
|}	|	du r2|d7 }t }	| j|	||\}
}|
|krEtj  S ||7 }||8 }q|| jjjkrXtjS || jjjkrbtjS tjS )a|  
        Dry run a batch to determine if it can be scheduled. Placeholder sequences will be
        created for any UIDs that are unknown to the inference engine.

        Arguments:
            uids (Iterable[int]): Iterable of UIDs for the batch
            lengths (Iterable[int]): Iterable of lengths for each sequence of the batch. This lengths
                corresponds to the number of tokens to send in the hypothetical forward; history
                tokens will be determined via UID lookup and future tokens are disregarded.

        Returns:
            bool: True if the batch can be scheduled, False otherwise.
        r   Nr   )r   rY   r   r=   r   r'   max_ragged_sequence_countr   BatchSequenceLimitExceededrD   rO   r   r   r[   KVCacheLimitExceededrZ   EngineSequenceLimitExceededmax_ragged_batch_sizeBatchTokenLimitExceededrB   )r   ra   rb   cur_seqsr   r_   	batch_lenrT   lengthr]   	sched_lensched_blocksr   r   r   rA      s*   

zInferenceEngineV2.can_schedulec                 C   s$   | j |}|du rdS | j|S )zQ
        Get the remaining capacity of the last block already allocated.
        Nr   )r   rO   r   get_remaining_block_capacity)r   rT   r]   r   r   r   rn      s   z.InferenceEngineV2.get_remaining_block_capacityc                 C   s   | j | dS )z
        Remove all state associated with a sequence from the inference engine.

        Arguments:
            uid (int): The UID of the sequence to flush.
        N)r   flush_sequence)r   rT   r   r   r   flush   s   zInferenceEngineV2.flush	save_pathc                 C   s   t || jj| jj}t|| jj| jj}t| jj| t	| jj
 t|d | jjdkrAt	| jjttj|dd dS dS )z~
        Serialize the model to a file.

        Arguments:
            path (str): Path to the file to serialize to.
        wr   zds_model_config.pklwbN)r   r   tp_rankr3   r   torchsaveflattened_paramsjsondumpflattened_param_metadataopenpickler   r/   pathjoin)r   rq   param_file_namemetadata_file_namer   r   r   	serialize   s   $zInferenceEngineV2.serialize)T)__name__
__module____qualname__r   __annotations__r   r	   propertyru   Tensorr   r.   r   r   r   r+   r#   r   boolrW   r   r`   r   rA   rn   rp   strr   r   r   r   r   r      s:   
 
 31		r   )"r/   rx   r|   typingr   r   ru   deepspeed.commcommr7   deepspeed.acceleratorr   deepspeed.comm.commr   model_implementationsr   loggingr   raggedr	   r
   r   scheduling_utilsr   r   (model_implementations.flat_model_helpersr   r   *model_implementations.inference_model_baser   	config_v2r   INFERENCE_MODEL_TIMERr   r   r   r   r   <module>   s"   