o
    Ti)                     @   sf   d dl T d dlmZ ddlmZ ddlZddlmZ ddlm	Z
 G d	d
 d
eeZG dd deZdS )   )*)MegatronContainer    )DeepSpeedMegatronGPTInferenceN   )TransformerPolicy)versionc                       s&   e Zd Z fddZdddZ  ZS )DS_MegatronGPTContainerc                    s   t  jdi | d S )N )super__init__)selfkwargs	__class__r
   c/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/megatron_gpt.pyr      s   z DS_MegatronGPTContainer.__init__Nc                 C   sN   |d ur|n| j }t|| jd| _| j| jj_| jr$d| jj_d| jj_| jS )N)mp_groupTF)	ds_model_configr   r   modulescale_attentionconfigmegatron_v2rotate_halfrotate_every_two)r   r   _configr
   r
   r   create_module   s   

z%DS_MegatronGPTContainer.create_moduleN)__name__
__module____qualname__r   r   __classcell__r
   r
   r   r   r	      s    r	   c                       sV   e Zd ZdZdZdZdZdZd fdd	Zdd	 Z	dd
dZ
dddZdd Z  ZS )MegatronLayerPolicyNr   standardTFc                    s   t  j|tjtjd || _tjd u r@tt	j
tdkr#d t_d S zddlm} |t_dt_W d S  ty?   d t_Y d S w d S )N)r   use_mupz1.2r   )ParallelTransformerLayerr   )r   r   r!   r   r#   client_module_orig_layer_classpkg_versionparsetorch__version__megatron.model.transformerr$   r   ImportError)r   r%   	inferencer$   r   r
   r   r   *   s   

zMegatronLayerPolicy.__init__c                 C   sV   t jdkr| jjjjjd | jjj| jjj	t
fS | jjjjjd | jjj| jjj	t
fS )Nr   r   )r!   r   r%   	attentionquery_key_valueweightshapenum_attention_headsinput_layernormepsDEFAULT_INTERMEDIATE_SIZEself_attentionr   r
   r
   r   get_hidden_heads:   s   
z$MegatronLayerPolicy.get_hidden_headsc                 C   s>   | j rtjdkr| jj}n| jj}|jj|jj|j	j|j	jfS )Nr   )
r-   r!   r   r%   r.   r6   r/   r0   biasdense)r   enable_trainingr.   r
   r
   r   r.   F   s   

zMegatronLayerPolicy.attentionc              	      sb  ddl m} || j\}}|r|dkr| jjjjjn| jjjjjj t }|dkrT fddt	|D  fddt	|D  fddt	|D  fddt	|D fS  fd	dt	|D  fd
dt	|D  fddt	|D  fddt	|D | jjjj
j| jjjj
j| jjjjj| jjjjj| jjjjf	S | jjj
j| jjj
j| jjjj| jjjjfS )Nr   )has_moe_layersr"   c                       g | ]} | j jqS r
   dense_h_to_4hr0   .0imoe_expertsr
   r   
<listcomp>[       z+MegatronLayerPolicy.mlp.<locals>.<listcomp>c                    r=   r
   r?   r9   r@   rC   r
   r   rE   \   rF   c                    r=   r
   dense_4h_to_hr0   r@   rC   r
   r   rE   ]   rF   c                    r=   r
   rI   r9   r@   rC   r
   r   rE   ^   rF   c                    r=   r
   r>   r@   rC   r
   r   rE   a   rF   c                    r=   r
   rG   r@   rC   r
   r   rE   b   rF   c                    r=   r
   rH   r@   rC   r
   r   rE   c   rF   c                    r=   r
   rJ   r@   rC   r
   r   rE   d   rF   )deepspeed.moe.utilsr<   r%   mlpdeepspeed_moeexpertsdeepspeed_expertsmoelenranger?   r0   r9   rI   coefficient)r   moe_typer;   r<   rP   _num_expertsr
   rC   r   rL   R   s8   




zMegatronLayerPolicy.mlpc                 C   s$   | j jj| j jj| j jj| j jjfS r   )r%   post_attention_layernormr0   r9   r3   r7   r
   r
   r   	layernormq   s
   zMegatronLayerPolicy.layernorm)T)F)r"   F)r   r   r   r&   r   rT   r   r#   r   r8   r.   rL   rX   r    r
   r
   r   r   r!   #   s    

r!   )basefeatures.megatronr   <deepspeed.model_implementations.transformers.ds_megatron_gptr   r)   policyr   	packagingr   r'   BaseTransformerContainerr	   r!   r
   r
   r
   r   <module>   s   