o
    Ti|                     @   sF   d dl T ddlmZ ddlm  mZ ddlm	Z	 G dd de
ZdS )   )*    )commN)get_acceleratorc                       sd   e Zd Z fddZdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Z  ZS )BaseTransformerMoEContainerc                    s   t  jdi | | j | _t | _| j| jk rdn| j| j | _t	| j
dr-| j
jndf| _g | _g | _g | _g | _d | _d | _d | _d | _d | _d S )Nr   layer_norm_epsg-q= )super__init__policyget_num_expertsnum_expertsdistget_world_sizeep_world_sizelocal_ep_sizehasattrconfigr   _h4h_w_h4h_b_4hh_w_4hh_b
_res_h4h_w
_res_h4h_b
_res_4hh_w
_res_4hh_b	_res_coef)selfkwargs	__class__r   _/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/base_moe.pyr
      s   

z$BaseTransformerMoEContainer.__init__c                 C   sj   | j | j   | j| j dksJ dtj| j| j| j| j	| j
| j| j| j| j| jjj| jd| _| jS )Nr   zTo run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!This is because the attention computation is partitioned evenly among the parallel GPUs.)hidden_sizeheadsr   fp16pre_layer_normmp_sizeq_int8moe_expertsglobal_expertsmlp_typescale_attn_by_inverse_layer_idx)set_hidden_headsr   get_hidden_headsnum_attention_headsr&   transformer_inferenceDeepSpeedMoEInferenceConfigr"   r   r$   r%   quantizer   r   r   moetyper+   ds_model_configr   r   r   r!   create_ds_model_config&   s$   z2BaseTransformerMoEContainer.create_ds_model_configc                 C   s4   | j | j   | | jjj | j| j   d S N)	set_attentionr   	attentionset_mlpr   r2   r3   set_layernorm	layernormr5   r   r   r!   initialize_tensors<   s   z.BaseTransformerMoEContainer.initialize_tensorsc              
   C   sZ   |dkr| j  \| _| _| _| _d S | j |\	| _| _| _| _| _| _| _| _	| _
d S )Nstandard)r   mlpr   r   r   r   r   r   r   r   r   )r   config_moe_typer   r   r!   r:   B   s   

z#BaseTransformerMoEContainer.set_mlpc                 C   s.   |    |   | jjjdkr|   d S d S Nresidual)transpose_attentiontranspose_mlpr   r2   r3   transpose_residualr5   r   r   r!   	transposeL   s
   z%BaseTransformerMoEContainer.transposec                    s0    fdd j D  _  fdd jD  _d S )Nc                       g | ]}  |jqS r   transpose_impldata.0moe_w1r5   r   r!   
<listcomp>T       z=BaseTransformerMoEContainer.transpose_mlp.<locals>.<listcomp>c                    rG   r   rH   rK   r5   r   r!   rN   U   rO   )r   r   r5   r   r5   r!   rD   S   s   z)BaseTransformerMoEContainer.transpose_mlpc                 C   s:   |  | jj| j_|  | jj| j_|  | jj| j_d S r7   )rI   r   rJ   r   r   r5   r   r   r!   rE   W   s   z.BaseTransformerMoEContainer.transpose_residualc                 C   s(   |  | | | |   |   d S r7   )attention_qkv_mpattention_o_mpattention_quantizationmlp_mp)r   
mp_replacer   r   r!   apply_tensor_parallelism\   s   

z4BaseTransformerMoEContainer.apply_tensor_parallelismc                 C   s   t  }t| jD ]Z}| j|| j |  t  | jj	| j
_| j|| j |  t  | jj	| j_| j|| j |  t  | jj	| j_| j|| j |  t  | jj	| j_q	d S r7   )r   get_rankranger   r   tor   current_device_namemoduler?   inter_wrJ   r   inter_br   output_wr   output_b)r   	gpu_indexep_indexr   r   r!   rS   g   s   z"BaseTransformerMoEContainer.mlp_mpc                 C   s  | j t  | jj _| jt  | jj_| jjj| j	t   | jj
j| jt   | jjjdkr| jt  | jjj_| jt  | jjj_| jt  | jjj_| jt  | jjj_| jt  | jj_d S d S rA   )attn_nwrX   r   rY   rZ   rJ   attn_nbnorm_wcopy_input_nwnorm_binput_nbr   r2   r3   r   res_mlpr[   r   r\   r   r]   r   r^   r   res_coefr5   r   r   r!   copy_data_to_new_modulev   s   z3BaseTransformerMoEContainer.copy_data_to_new_module)__name__
__module____qualname__r
   r6   r=   r:   rF   rD   rE   rU   rS   rj   __classcell__r   r   r   r!   r      s    
r   )base	deepspeedr   r   deepspeed.ops.transformeropstransformerr/   deepspeed.acceleratorr   BaseTransformerContainerr   r   r   r   r!   <module>   s
   