o
    TiC                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZ d
dlmZ d dlmZ ddlmZ G dd de
ZG dd deZG dd dejZG dd dejZ dS )    N)Function   )DeepSpeedSelfAttention)DeepSpeedInferenceConfig)	SoftmaxOpVectorMatMulOp
GELUGemmOp)BiasResidualOp)EinsumSecSmEcmOp)LayerNormOp   )TopKGate)comm)MoEResMatmulOpc                       sr   e Zd ZdZ																												
	d fdd	Zedd Zedd Z  ZS )DeepSpeedMoEInferenceConfiga  Initialize the DeepSpeed Transformer Config.
        Arguments:
            hidden_size: The hidden size of the transformer layer
            intermediate_size: The intermediate size of the feed-forward part of transformer layer
            heads: The number of heads in the self-attention of the transformer layer
            num_hidden_layers: The number of transformer layers
            layer_norm_eps: The epsilon value for the layer norm
            local_rank: Optional: The rank of GPU running the transformer kernel, it is not required
                to use if the model already set the current device, otherwise need to set it
                so that the transformer kernel can work on the right device
            mp_size (optional): This argument is mainly used to create the parameters on the kernel side
                using model-parallel architecture. If the client model already takes care of this, there is no
                need to pass this argument.
            fp16: Enable half-precision computation
            bf16: Enable bf16 floating point computation
            pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
            stochastic_mode:  Enable for high performance, please note that this flag has some level of
                non-determinism and can produce different results on different runs.  However, we have seen
                that by enabling it, the pretraining tasks such as BERT are not affected and can obtain
                a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
                to turn it off in order to be able to reproduce the same result through the regular kernel execution.

            scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation.
            return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor
    -q=r   FT         ?Nstandardc                    s   t t| ||dkr|nd| |||||||	|
||||||| || _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _d S )Nr   r   )superr   __init__moe_expertskcapacity_factoreval_capacity_factormin_capacitynoisy_gate_policydrop_tokensuse_rtsglobal_expertsmlp_typescale_attn_by_inverse_layer_idx)selfhidden_sizeintermediate_sizeheadsnum_hidden_layerslayer_norm_eps
local_rankmp_sizefp16bf16q_int8pre_layer_normstochastic_modescale_attentiontriangular_maskinglocal_attentionwindow_sizereturn_tupler   r    r   r   r   r   r   r   r   r!   r"   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/transformer/inference/moe_inference.pyr   1   s&   

z$DeepSpeedMoEInferenceConfig.__init__c                 C   s&   t  }| D ]	\}}||j|< q|S N)r   items__dict__)clsjson_objectconfigkeyvaluer7   r7   r8   	from_dict_   s   z%DeepSpeedMoEInferenceConfig.from_dictc                 C   sF   t |ddd}| }W d    n1 sw   Y  | t|S )Nrzutf-8)encoding)openreadrA   jsonloads)r<   	json_filereadertextr7   r7   r8   from_json_filef   s   
z*DeepSpeedMoEInferenceConfig.from_json_file)r   r   r   r   r   r   r   FFFTFTTFr   Tr   r   r   r   r   r   NTFr   F)	__name__
__module____qualname____doc__r   classmethodrA   rK   __classcell__r7   r7   r5   r8   r      sF    .
r   c                   @   s$   e Zd Zedd Zedd ZdS )DeepSpeedMLPFunctionc              	   C   s   |j r |||||j|d |d|	  |j}||||d ||	}n||||||j|j|}|
d ur@tj|
ddkr@tj||
|d || S )N      groupr   )rV   async_op)r-   epsilonr.   distget_world_size
all_reduce)ctxinputinter_winter_br>   output_boutput_wq_scalesq_groupsmerge_countmp_grouprW   gelu_gemm_funcvector_matmul_funcintermediateoutputr7   r7   r8   forwardo   s   zDeepSpeedMLPFunction.forwardc                 C   s   t d)NzYou are running with DeepSpeed Inference mode.                             Please switch to Training mode for running backward!)RuntimeError)r\   grad_outputr7   r7   r8   backward}   s   zDeepSpeedMLPFunction.backwardN)rL   rM   rN   staticmethodrj   rm   r7   r7   r7   r8   rR   m   s
    
rR   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
DeepSpeedMoEMLPNr   Fc                    s  t t|   || _tt| jj| _	tt| jj| _
| jj|d u r)dntj|d }tt| jj|| _tt|| _tt|| jj| _tt| jj| _|| _|re|d n|| _tt|| _|| _t| j| _t| j| _d S )Nr   rU   rS   )r   ro   r   r>   nn	ParametertorchTensorr$   attn_nwattn_nbr%   rY   rZ   r^   r_   ra   r`   rb   rc   intmathlog2rd   re   r   rf   r   rg   )r#   r>   rb   rc   rd   mlp_extra_groupingre   interm_sizer5   r7   r8   r      s    zDeepSpeedMoEMLP.__init__c                 C   s8   t || j| j| j| j| j| j| j| j	| j
|| j| jS r9   )rR   applyr^   r_   r>   r`   ra   rb   rc   rd   re   rf   rg   )r#   r]   rW   r7   r7   r8   rj      s   zDeepSpeedMoEMLP.forward)Nr   r   FN)F)rL   rM   rN   r   rj   rQ   r7   r7   r5   r8   ro      s    ro   c                       s~   e Zd ZdZdZ							d fdd	Zdd	 Zd
d Zdd Zdd Z	dd Z
												dddZ  ZS )DeepSpeedMoEInferencea@  Initialize the DeepSpeed MoE Transformer Layer.
        Arguments:
            layer_id: The layer index starting from 0, e.g. if model has 24 transformer layers,
                layer_id will be 0,1,2...23 when each layer object is instantiated
            config: An object of DeepSpeedInferenceConfig
            mp_group: Model parallelism group initialized on the modeling side.
            quantize_scales: This argument groups all the layers' scales used for quantization
            quantize_groups: Number of groups used for quantizing the model
            merge_count: Shows the number of model-parallel checkpoints merged before running inference.
                We use this argument to control the quantization scale for the model parameters if a bigger
                quantize-grouping than 1 is used.
            mlp_extra_grouping: This flag is used to show a 2x higher number of groups used for the MLP part
                of a Transformer layer. We use this feature for quantization to reduce the convergence impact
                for specific downstream tasks.
    r   Nr   Fc	           	         s  t t|    | _tj| j_| jjtjksJ dt jd7  _t| j|| _	t
t| jj| _t
t| jj| _t
t| jj| _t
t| jj| _ jdkr~t || _t
t| jjd| _t| j| _t| j| _d _t
 fddt| jjD | _t| jj| jj | jj!| jj"| jj#| jj$| jj%| jj&| jj'| j(
| _)|| _(|| _*| _+t,d| jj- t.| j| _/t0| j| _1t2| j| _3t4| j| _5d S )NzEDeepSpeed MoE Transformer Inference not yet tested for bfloat supportr   residualrS   c                 3   s"    | ]}t  V  qd S r9   )ro   ).0ir>   expert_mp_grouprd   ry   quantize_groupsquantize_scalesr7   r8   	<genexpr>   s
    
z1DeepSpeedMoEInference.__init__.<locals>.<genexpr>z.DeepSpeed MoE Transformer Inference config is )6r   r|   r   r>   layer_iddtyperr   bfloat16r   	attentionrp   rq   rs   r$   rt   ru   norm_wnorm_br!   ro   res_mlpres_coefr   	coef_funcr   rg   r*   
ModuleListranger   mlpr   r    r   r   r   r   r   r   r   ep_groupmoe_gatere   r   printr;   r	   bias_residual_funcr   ds_layernormr
   einsum_sec_sm_ecmr   moe_res_matmul)	r#   r>   re   r   r   r   r   rd   ry   r5   r   r8   r      sD   	




zDeepSpeedMoEInference.__init__c              	   C   s,   |  || j|}| |tddddd|S )Nr   Fr   )rg   r   r   rr   empty)r#   inprW   r7   r7   r8   res_coef_func   s   z#DeepSpeedMoEInference.res_coef_funcc                 C   sF   |  |d| jjd \}}}}| |||d| jj}||fS )Nr   )r   viewr>   r$   r   type_as)r#   attention_output_combined_weightsdispatch_maskdispatched_attentionr7   r7   r8   moe_gate_einsum   s   z%DeepSpeedMoEInference.moe_gate_einsumc                 C   s   | | jj| jj | jjd| jj}|j| jjdd}tj| jj|d jd f|d jdd   |j	|j
d}t|tt| jD ]\}}| j| |d|jd |jd ||< qA|S )Nr   r   dimr   rS   r   device)reshaper>   r    r   r$   chunkrr   r   shaper   r   zipr   lenr   r   )r#   dispatched_inputchunksexpert_outputsr   expertr7   r7   r8   expert_exec   s$   z!DeepSpeedMoEInference.expert_execc                 C   s6   t j| jddkrt|}t j||| jd |S |S )NrU   r   )rY   rZ   r   rr   
empty_likeall_to_all_single)r#   r   r   r7   r7   r8   	_alltoall  s
   
zDeepSpeedMoEInference._alltoallc                 C   s:   t |||jd d|d|jd }||jS )Nr   r   )rr   matmulr   r   r   )r#   r   expert_outputr   combined_outputr7   r7   r8   scale_expert_output
  s
   z)DeepSpeedMoEInference.scale_expert_outputc                  C   s  |p|p|}|d u r|n|}|j }| jj tjtjfv r$|tjkr$| }t  | ||||||
||| j	| j

}|rI|dd \}}}||f}n|rV|dd \}}}}n|d }|| jj }| || j| j| jj}| jjdkr| j|dd}| j|dd}| jd urtj| jd}tj||  |j |jd}tj||| jd |jd	g| d
d  R  }| |\}}| |}| |}| |}| |||}| jd ur|j |j!d tj| jd ddtj"| jd }| jjdkr| #||| | $||td
}| jj%s| || j	| j
| jj}||j kr|&|}W d    n	1 s*w   Y  |r6||f}| jj'rGt(|t)u rD|S |fS |S )Nr   rT   r   r}   T)rW   rU   r   r   r   r   )*r   r>   rr   float16int8floathalfno_gradr   r   r   attn_obr   rt   ru   rX   r!   r   r   r   rY   rZ   r   numelr   all_gather_into_tensorr   sizer   r   r   r   splitr   get_rankr   r   r.   tor4   typetuple) r#   r]   
input_maskattention_mask	head_mask
layer_pastget_key_valueget_presentencoder_outputenc_dec_attn_maskencoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentions
input_typer   p_keyp_valuepresentsr   context_outputresidual_addres_mlp_outres_coef_out
world_sizegather_bufferr   r   r   r   r   ri   r7   r7   r8   rj     sl   








3
zDeepSpeedMoEInference.forward)NNNNr   r   F)NNNNFFNNNNFF)rL   rM   rN   rO   r   r   r   r   r   r   r   rj   rQ   r7   r7   r5   r8   r|      s:    4	r|   )!rF   rw   rr   torch.autogradr   torch.nnrp   ds_attentionr   r>   r   
op_bindingr   r   r   op_binding.bias_residualr	   op_binding.einsum_sec_sm_ecmr
   op_binding.layer_normr   moe.sharded_moer   	deepspeedr   rY   op_binding.moe_res_matmulr   r   rR   Modulero   r|   r7   r7   r7   r8   <module>   s$   W