o
    Ti#                     @   s   d dl mZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlT ddlT ddlT ddlmZ d	d
lmZmZ ddlmZmZ G dd deZdS )    )IterableOptionalTupleN   )
empty_from)RaggedInferenceEngineConfig)ActivationType	DtypeEnum)*)RaggedBatchWrapper   )DSModelImplementationConfigMPType   )MixtralNonTransformerContainerMixtralTransformerContainerc                       s  e Zd ZU dZee ed< 	 eee  ed< 	 	 e	de
fddZ	 e	de
fddZe	de
fd	d
Ze	de
fddZe	de
fddZe	de
fddZe	de
fddZe	de
fddZe	defddZe	defddZe	defddZe	defddZe	dee fddZ	 e	de
fdd Ze	de
fd!d"Ze	defd#d$Z 	 d%e!d&e"d'e#dd(f fd)d*Z$d+e%de&j'fd,d-Z(d.e
d/e&j'd0e&j'd1e%de)e&j'e&j'f f
d2d3Z*d0e&j'd1e%de&j'fd4d5Z+d6e%de&j'fd7d8Z,  Z-S )9MixtralInferenceModelz<
    Inference model implementation for Mixtral models.
    _non_transformer_transformerreturnc                 C      | j jS N)_configmax_position_embeddingsself r   n/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/mixtral/model.pymax_sequence_length-      z)MixtralInferenceModel.max_sequence_lengthc                 C   r   r   )r   num_hidden_layersr   r   r   r   
num_layers5   r   z MixtralInferenceModel.num_layersc                 C   r   r   )r   hidden_sizer   r   r   r   	model_dim9   r   zMixtralInferenceModel.model_dimc                 C   r   r   )r   
vocab_sizer   r   r   r   r$   =   r   z MixtralInferenceModel.vocab_sizec                 C   s   | j | j S r   )r#   n_headsr   r   r   r   	head_sizeA   s   zMixtralInferenceModel.head_sizec                 C   r   r   )r   num_attention_headsr   r   r   r   r%   E   r   zMixtralInferenceModel.n_headsc                 C   r   r   )r   intermediate_sizer   r   r   r   intermediate_dimI   r   z&MixtralInferenceModel.intermediate_dimc                 C   r   r   )r   num_key_value_headsr   r   r   r   
n_heads_kvM   r   z MixtralInferenceModel.n_heads_kvc                 C   s0   | j jtjkr
tjS | j jtjkrtjS td)Nz Only fp16 and bf16 are supported)	r   torch_dtypetorchfloat16r	   fp16bfloat16bf16NotImplementedErrorr   r   r   r   activation_dtypeQ   s
   z&MixtralInferenceModel.activation_dtypec                 C   sT   | j j }|dkrtjS |dkrtjS |dkrtjS |dkr"tjS td| d)NgelurelugegelusiluzActivation z not supported)r   
hidden_actlowerr   GEGLUReGLUSiGLUr2   )r   
activationr   r   r   mlp_activation_fnZ   s   z'MixtralInferenceModel.mlp_activation_fnc                 C      t jS r   )NormTypeEnumRMSNormr   r   r   r   	norm_typeh      zMixtralInferenceModel.norm_typec                 C   r?   r   )PositionalEmbeddingTyperotate_halfr   r   r   r   positional_embedding_typel   rC   z/MixtralInferenceModel.positional_embedding_typec                 C   s   t | jjdS )zG
        The positional embedding configuration for the model.
        )
theta_base)RotateHalfConfigr   
rope_thetar   r   r   r   positional_embedding_configp   s   z1MixtralInferenceModel.positional_embedding_configc                 C   r   r   )r   num_local_expertsr   r   r   r   	n_experts{   r   zMixtralInferenceModel.n_expertsc                 C   r   r   )r   num_experts_per_tokr   r   r   r   n_top_k   r   zMixtralInferenceModel.n_top_kc                 C   s   dS )NTr   r   r   r   r   normalize_expert_scores   s   z-MixtralInferenceModel.normalize_expert_scoresconfigengine_configbase_mp_groupNc                    sR   t  ||| |   |   |   |   |   |   |   d| _	dS )a	  
        Base implementation for initialization. By default, this will initialize
        the traditional components of a transformer model:
            - Embedding
            - QKV projection
            - Self attention
            - Attention output projection
            - Feed forward network
            - Normalization
            - Unembedding

        Arguments:
            config (DSModelImplementationConfig): Model-specific configuration. No assumptions
                should be made about this config that are not closely tied to the specific
                model implementation.
            engine_config (RaggedInferenceEngineConfig): Engine configuration.
            base_mp_group (MPType): Base communication group for Tensor-parallel inference.
        N)
super__init__make_norm_layermake_qkv_layermake_attn_layermake_attn_out_layermake_moe_layermake_embedding_layermake_unembedding_layer_kv_cache_config)r   rP   rQ   rR   	__class__r   r   rT      s   
zMixtralInferenceModel.__init__ragged_batchc                 C   s<   |  || jj}|jd | jkrtd|j d| j |S )z
        Performs the embedding lookup prior to running the transformer of the model.

        Arguments:
            ragged_batch (RaggedBatchWrapper): The batch to embed.

        Returns:
            torch.Tensor: The embedded batch.
        zEmbedding output shape z does not match model_dim )embedr   word_embshaper#   
ValueError)r   r_   ra   r   r   r   _forward_embed   s   
z$MixtralInferenceModel._forward_embed	layer_idxresidualhidden_statesragged_batch_infoc                 C   s   | j | }| j|}| ||j}| |||}| ||j}| jdkr-t	j
|| jd | |||j\}}| |||j|j|j}| jdkrPt	j
|| jd || jd krl| j |d  }| |||j\}}||fS || ||fS )aL  
        Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead
        optimization to fuse the layer norm of the next layer into the current layer.

        Arguments:
            layer_idx (int): The index of the layer to execute.
            residual (torch.Tensor): The residual tensor from the previous layer.
            hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
                hidden states after pre normalization.
            ragged_batch_info (RaggedBatchWrapper): The batch metadata.
        r   group)r   state_manager	get_cacheqkvqkv_wattnattn_out
attn_out_wtp_sizedist
all_reduce_base_mp_groupnormmlp_norm_gammamoemoe_gate	moe_mlp_1	moe_mlp_2r!   attn_norm_gammaadd_)r   rf   rg   rh   ri   
cur_paramskv_cachenext_paramsr   r   r   _forward_transformer   s&   



z*MixtralInferenceModel._forward_transformerc                 C   s   | j || jj|| jjd}| jdkrKt| j| j|jd |jd f}t| j|jd | j	f}t
j||| jd ||ddd|jd | j	 |S |S )z
        Performs unembedding of the hidden states to logits. This will only sample the final
        token of each sequence.
        )gammar   r   rj   r   )unembedr   word_unembed
final_normrs   r   _comm_logitsrc   _return_logitsr$   rt   all_gather_into_tensorrv   copy_permutereshape)r   rh   ri   logitscomm_bufferfull_logitsr   r   r   _forward_unembed   s   
 $z&MixtralInferenceModel._forward_unembedwrapped_batchc                 C   sX   |  |}| j|d | jd jd d\}}t| jD ]}| ||||\}}q| ||S )Nr   )beta)re   rw   r   r}   ranger!   r   r   )r   r   rg   rh   rf   r   r   r   forward   s
   
zMixtralInferenceModel.forward).__name__
__module____qualname____doc__r   r   __annotations__r   r   propertyintr   r!   r#   r$   r&   r%   r)   r+   r	   r3   r   r>   r@   rB   rD   rF   rH   rJ   rL   rN   boolrO   r   r   r   rT   r   r-   Tensorre   r   r   r   r   __classcell__r   r   r]   r   r      sn   
 

+r   )typingr   r   r   r-   deepspeed.commcommrt   	allocatorr   	config_v2r   inference_utilsr   r	   model_implementationsmodules.configsmodules.interfacesraggedr   inference_model_baser   r   	containerr   r   DSMoETransformerModelBaser   r   r   r   r   <module>   s   