o
    Ti                     @   s   d dl mZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZmZ ddlT ddlT ddlT ddlmZ dd	lmZ d
dlmZmZ G dd deZdS )    )IterableOptionalTupleN   )
empty_from)ActivationType	DtypeEnum   )*)
heuristics)RaggedBatchWrapper   )QwenNonTransformerContainerQwenTransformerContainerc                   @   s  e Zd ZU dZee ed< 	 eee  ed< 	 	 e	de
fddZ	 e	de
fddZe	de
fd	d
Ze	de
fddZe	de
fddZe	de
fddZe	de
fddZe	de
fddZe	defddZe	defddZe	defddZe	defddZe	dee fddZd0d d!Z	 d"edej fd#d$Z!d%e
d&ej d'ej d(ede"ej ej f f
d)d*Z#d'ej d(edej fd+d,Z$d-edej fd.d/Z%dS )1QwenInferenceModelzP
    Inference model implementation for ragged batching for Llama-2 models.
    _non_transformer_transformerreturnc                 C      | j jS N)_configmax_seq_lengthself r   k/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/qwen/model.pymax_sequence_length)      z&QwenInferenceModel.max_sequence_lengthc                 C   r   r   )r   num_hidden_layersr   r   r   r   
num_layers1   r   zQwenInferenceModel.num_layersc                 C   r   r   )r   hidden_sizer   r   r   r   	model_dim5   r   zQwenInferenceModel.model_dimc                 C   r   r   )r   
vocab_sizer   r   r   r   r"   9   r   zQwenInferenceModel.vocab_sizec                 C   s   | j | j S r   )r!   n_headsr   r   r   r   	head_size=      zQwenInferenceModel.head_sizec                 C   r   r   )r   num_attention_headsr   r   r   r   r#   A   r   zQwenInferenceModel.n_headsc                 C   s   | j jd S )Nr	   )r   intermediate_sizer   r   r   r   intermediate_dimE   r%   z#QwenInferenceModel.intermediate_dimc                 C   s   | j j| j j S r   )r   r    kv_channelsr   r   r   r   
n_heads_kvI   s   zQwenInferenceModel.n_heads_kvc                 C   sB   | j j| j j dk}|rtjS | j jrtjS | j jrtjS td)Nr   z Only fp16 and bf16 are supported)r   bf16fp16r   NotImplementedError)r   autoset_precisionr   r   r   activation_dtypeM   s   z#QwenInferenceModel.activation_dtypec                 C      t jS r   )r   SiGLUr   r   r   r   mlp_activation_fn[      z$QwenInferenceModel.mlp_activation_fnc                 C   r0   r   )NormTypeEnumRMSNormr   r   r   r   	norm_type_   r3   zQwenInferenceModel.norm_typec                 C   r0   r   )PositionalEmbeddingTyperotate_halfr   r   r   r   positional_embedding_typec   r3   z,QwenInferenceModel.positional_embedding_typec                 C   s   t | jjdS )N)
theta_base)RotateHalfConfigr   rotary_emb_baser   r   r   r   positional_embedding_configg   s   z.QwenInferenceModel.positional_embedding_configNc              	   C   s>   t | jjj| j| j| j| j| j| jjd}t	
|| j| _dS )a  
        Instantiates the normalization layer for the model. This sets the `self.norm` attribute.

        TODO(cmikeh2): In the future we'll distinguish between the different norm objects,
        but for now we'll just use the same one for all of them.
        )
max_tokenstypechannelsresidual_dtypeinput_dtypeoutput_dtypeepsN)DSNormConfig_engine_configstate_managermax_ragged_batch_sizer6   r!   r/   r   layer_norm_epsilonr   instantiate_pre_normnorm)r   norm_configr   r   r   make_norm_layerk   s   
z"QwenInferenceModel.make_norm_layerragged_batchc                 C   s<   |  || jj}|jd | jkrtd|j d| j |S )z
        Performs the embedding lookup prior to running the transformer of the model.

        Arguments:
            ragged_batch (RaggedBatchWrapper): The batch to embed.

        Returns:
            torch.Tensor: The embedded batch.
        zEmbedding output shape z does not match model_dim )embedr   word_embshaper!   
ValueError)r   rN   rP   r   r   r   _forward_embed   s   
z!QwenInferenceModel._forward_embed	layer_idxresidualhidden_statesragged_batch_infoc                 C   s  | j | }| j|}| j||j|jd}| |||}| j||jdd}| j	dkr2t
j|| jd | j|||jdd\}}| j||jdd}| j||jdd}| j	dkr]t
j|| jd || jd kr{| j |d  }| j|||jdd\}}||fS || ||fS )aL  
        Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead
        optimization to fuse the layer norm of the next layer into the current layer.

        Arguments:
            layer_idx (int): The index of the layer to execute.
            residual (torch.Tensor): The residual tensor from the previous layer.
            hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
                hidden states after pre normalization.
            ragged_batch_info (RaggedBatchWrapper): The batch metadata.
        )bNr   groupbeta)r   rG   	get_cacheqkvqkv_wqkv_battnattn_out
attn_out_wtp_sizedist
all_reduce_base_mp_grouprK   mlp_norm_gammamlp_1mlp_1_wmlp_2mlp_2_wr   attn_norm_gammaadd_)r   rU   rV   rW   rX   
cur_paramskv_cachenext_paramsr   r   r   _forward_transformer_layer   s$   



z-QwenInferenceModel._forward_transformer_layerc                 C   s   | j || jj|| jjd}| jdkrKt| j| j|jd |jd f}t| j|jd | j	f}t
j||| jd ||ddd|jd | j	 |S |S )z
        Performs unembedding of the hidden states to logits. This will only sample the final
        token of each sequence.
        )gammar   r   rZ   r	   )unembedr   word_unembed
final_normre   r   _comm_logitsrR   _return_logitsr"   rf   all_gather_into_tensorrh   copy_permutereshape)r   rW   rX   logitscomm_bufferfull_logitsr   r   r   _forward_unembed   s   
 $z#QwenInferenceModel._forward_unembedwrapped_batchc                 C   sX   |  |}| j|d | jd jd d\}}t| jD ]}| ||||\}}q| ||S )Nr   r\   )rT   rK   r   rn   ranger   rs   r   )r   r   rV   rW   rU   r   r   r   forward   s   


zQwenInferenceModel.forward)r   N)&__name__
__module____qualname____doc__r   r   __annotations__r   r   propertyintr   r   r!   r"   r$   r#   r(   r*   r   r/   r   r2   r4   r6   r7   r9   r;   r=   rM   r   torchTensorrT   r   rs   r   r   r   r   r   r   r      sX   
 

,r   )typingr   r   r   r   deepspeed.commcommrf   	allocatorr   inference_utilsr   r    modules.configsmodules.interfacesmodulesr   raggedr   	containerr   r   DSTransformerModelBaser   r   r   r   r   <module>   s   