o
    Ti`                     @   sn   d dl T d dlT d dlmZ ddlmZ ddlZd dlmZ ddl	m
Z G dd	 d	eeZG d
d deZdS )   )*)MegatronContainer    )DeepSpeedMegatronGPTInferenceN)MegatronLayerPolicy)versionc                       s&   e Zd Z fddZdddZ  ZS )DS_MegatronGPTMoEContainerc                    s   t  |||| d S N)super__init__)selfpolicyconfigmodel_configlayer_id	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/megatron_gpt_moe.pyr      s   z#DS_MegatronGPTMoEContainer.__init__Nc                 C   sN   |d ur|n| j }t|| jd| _| j| jj_| jr$d| jj_d| jj_| jS )N)mp_groupTF)	ds_model_configr   r   modulescale_attentionr   megatron_v2rotate_halfrotate_every_two)r   r   _configr   r   r   create_module   s   

z(DS_MegatronGPTMoEContainer.create_moduler	   )__name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r   c                       s@   e Zd ZdZdZdZdZd fdd	Zdd	 ZdddZ	  Z
S )MegatronMoELayerPolicyNr   standardr   Tc                    st   t  | || _tjd u r8ttjtdkrd t_d S zddl	m
} |t_W d S  ty7   d t_Y d S w d S )Nz1.2r   )ParallelTransformerLayer)r
   r   client_moduler"   _orig_layer_classpkg_versionparsetorch__version__megatron.model.transformerr$   ImportError)r   r%   	inferencer$   r   r   r   r   *   s   

zMegatronMoELayerPolicy.__init__c                 C   s   | j S r	   )num_experts)r   r   r   r   get_num_experts9   s   z&MegatronMoELayerPolicy.get_num_expertsFc              	      s  |dkr| j jjjjn| j jjjjj t }|| _|dkrH fddt|D  fddt|D  fddt|D  fddt|D fS  fddt|D  fddt|D  fd	dt|D  fd
dt|D | j jjj	j
| j jjj	j| j jjjj
| j jjjj| j jjj
f	S )Nr#   c                       g | ]} | j jqS r   dense_h_to_4hweight.0imoe_expertsr   r   
<listcomp>I       z.MegatronMoELayerPolicy.mlp.<locals>.<listcomp>c                    r0   r   r2   biasr4   r7   r   r   r9   J   r:   c                    r0   r   dense_4h_to_hr3   r4   r7   r   r   r9   K   r:   c                    r0   r   r>   r<   r4   r7   r   r   r9   L   r:   c                    r0   r   r1   r4   r7   r   r   r9   N   r:   c                    r0   r   r;   r4   r7   r   r   r9   O   r:   c                    r0   r   r=   r4   r7   r   r   r9   P   r:   c                    r0   r   r?   r4   r7   r   r   r9   Q   r:   )r%   mlpdeepspeed_moeexpertsdeepspeed_expertsmoelenr.   ranger2   r3   r<   r>   coefficient)r   moe_typeenable_trainingr.   r   r7   r   r@   <   s*   
zMegatronMoELayerPolicy.mlp)T)r#   F)r   r   r    r&   r   rH   r.   r   r/   r@   r!   r   r   r   r   r"   $   s    r"   )basebase_moefeatures.megatronr   <deepspeed.model_implementations.transformers.ds_megatron_gptr   r)   megatron_gptr   	packagingr   r'   BaseTransformerMoEContainerr   r"   r   r   r   r   <module>   s   