o
    Ti	                     @   s   d dl T d dlmZ d dlmZ ddlmZ ddlZddlm	Z	 dd	lm
Z
 dd
lmZ ddlmZ ddlmZ G dd deeeZG dd de	ZdS )   )*)MetaTensorContainer)HybridMegatronContainer    )DeepSpeedGPTInferenceN   )TransformerPolicy)transformer_param_names)
maybe_copy)versionmaybe_get_lorac                       s>   e Zd Z fddZdddZdd Zdd	 Zd
d Z  ZS )DS_GPTNEOXContainerc                    s   t  jdi | d S )N )super__init__)selfkwargs	__class__r   ^/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/gptneox.pyr      s   zDS_GPTNEOXContainer.__init__Nc                 C   sN   |d ur|n| j }t|| jd| _| j| jj_| jr$d| jj_d| jj_| jS )N)mp_groupTF)	ds_model_configr   r   modulescale_attentionconfigmegatron_v2rotate_halfrotate_every_two)r   r   _configr   r   r   create_module   s   

z!DS_GPTNEOXContainer.create_modulec                 C   s8   |   \}}}}|| jf|| jf|| jf|| jfg}|S )D
        Necessary to implement for `HybridEngineContainer`
        )get_lora_params_h4h_w_4hh_wqkvwdense_w)r   fc1_lorafc2_loraqkv_loraout_loraretr   r   r   get_lora_matched_pair%   s   $z)DS_GPTNEOXContainer.get_lora_matched_pairc                 C   sP   t jdkr| jjj}n| jjj}dd | jjjj| jjjj|j	|j
fD | _dS )r!   r   c                 S   s   g | ]}t |qS r   r   ).0pr   r   r   
<listcomp>6   s    z7DS_GPTNEOXContainer.set_lora_params.<locals>.<listcomp>N)GPTNEOXLayerPolicyr   policyclient_module	attentionself_attentionmlpdense_h_to_4hdense_4h_to_hquery_key_valuedenselora_paramsr   r3   r   r   r   set_lora_params-   s   

z#DS_GPTNEOXContainer.set_lora_paramsc                 C   s   d}t ddD ]}t|j|||t| |||  d| jj| jj| jjjjd
 qt ddD ]}t|j|||t| |||   q,t ddD ]}t|j	|||t| |||   qDt ddD ]}t||||t| |||   q\d S )	N)z attention.query_key_value.weightzattention.query_key_value.biaszattention.dense.weightzattention.dense.biaszmlp.dense_h_to_4h.weightzmlp.dense_h_to_4h.biaszmlp.dense_4h_to_h.weightzmlp.dense_4h_to_h.biaszpost_attention_layernorm.weightzpost_attention_layernorm.biaszinput_layernorm.weightzinput_layernorm.biasr   r   T)qkvr   	split_qkvheads   
      )
ranger
   r3   r	   r1   is_megatron_v2r>   r2   num_attention_headsr5   )r   r   sdweight_quantizer
mp_replaceprefixparam_namesir   r   r   load_params=   s0   




 zDS_GPTNEOXContainer.load_paramsN)	__name__
__module____qualname__r   r    r,   r<   rL   __classcell__r   r   r   r   r      s    
r   c                       sJ   e Zd ZdZdZd fdd	Zdd Zdd	d
ZdddZdd Z	  Z
S )r0   Nr   TFc                    sz   t  j|||d || _tjd u r;ttjtdkr!d t_d S zddl	m
} |t_W d S  ty:   d t_Y d S w d S )N)r   r>   z1.2r   )GPTNeoXLayer)r   r   r2   r0   _orig_layer_classpkg_versionparsetorch__version__transformersrR   ImportError)r   r2   	inferencer   r>   rR   r   r   r   r   e   s   

zGPTNEOXLayerPolicy.__init__c                 C   s:   t jdkr
| jj}n| jj}| jjj| jjj| jjjt	fS Nr   )
r0   r   r2   r3   r4   hidden_sizerE   input_layernormepsDEFAULT_INTERMEDIATE_SIZEr;   r   r   r   get_hidden_headsr   s   

z#GPTNEOXLayerPolicy.get_hidden_headsc                 C   s8   t jdkr
| jj}n| jj}|jj|jj|jj|jjfS r[   )	r0   r   r2   r3   r4   r8   weightbiasr9   )r   enable_trainingr3   r   r   r   r3   }   s   

zGPTNEOXLayerPolicy.attentionc                 C   s,   | j jjj| j jjj| j jjj| j jjjfS rM   )r2   r5   r6   ra   rb   r7   )r   rc   r   r   r   r5      s
   



zGPTNEOXLayerPolicy.mlpc                 C   s$   | j jj| j jj| j jj| j jjfS rM   )r2   post_attention_layernormra   rb   r]   )r   r   r   r   	layernorm   s
   zGPTNEOXLayerPolicy.layernorm)TTF)F)rN   rO   rP   rS   r   r   r`   r3   r5   re   rQ   r   r   r   r   r0   a   s    

r0   )basefeatures.meta_tensorr   features.hybrid_megatronr   3deepspeed.model_implementations.transformers.ds_gptr   rV   r1   r   r	   r
   	packagingr   rT   r   BaseTransformerContainerr   r0   r   r   r   r   <module>   s   N