o
    Ti                     @   s   d dl T d dlmZ d dlmZ ddlmZ ddlZddlm	Z	 dd	l
mZ dd
l
mZ ddl
mZ ddl
mZ ddl
mZ G dd deeeZG dd deZdS )   )*)MetaTensorContainer)HybridSplitQKVContainer    )DeepSpeedGPTInferenceN)	Parameter   )TransformerPolicy)transformer_param_names)
maybe_copy)maybe_copy_qkvmaybe_get_lorac                       sF   e Zd Z fddZdddZdd Zdd	 Zd
d Zdd Z  Z	S )DS_GPTNEOContainerc                    s   t  jdi | d S )N )super__init__)selfkwargs	__class__r   ]/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/gptneo.pyr      s   zDS_GPTNEOContainer.__init__Nc                 C   s4   |d ur|n| j }t|| jd| _| j| jj_| jS )N)mp_group)ds_model_configr   r   modulescale_attentionconfig)r   r   _configr   r   r   create_module   s   z DS_GPTNEOContainer.create_modulec                 C   sX   dd | j jjj| j jjj| j jjjj| j jjjj| j jjjj	| j jjjj
fD | _dS )D
        Necessary to implement for `HybridEngineContainer`
        c                 S   s   g | ]}t |qS r   r   ).0pr   r   r   
<listcomp>%   s    z6DS_GPTNEOContainer.set_lora_params.<locals>.<listcomp>N)policyclient_modulemlpc_fcc_projattn	attentionq_projk_projv_projout_projlora_paramsr   r   r   r   set_lora_params!   s   z"DS_GPTNEOContainer.set_lora_paramsc                 C   sL   | j jjjjj| _d| _| j jjjjj| _	d| _
| j jjjjj| _d| _dS )zF
        Necessary to implement for `HybridSplitQKVContainer`
        N)r#   r$   r(   r)   r*   weightqwqbr+   kwkbr,   vwvbr/   r   r   r   	set_q_k_v-   s   
zDS_GPTNEOContainer.set_q_k_vc                 C   sL   |   \}}}}}}|| jf|| jf|| jf|| jf|| jf|| jfg}|S )r   )get_lora_params_h4h_w_4hh_wdense_wr2   r4   r6   )r   fc1_lorafc2_loraq_lorak_lorav_loraout_loraretr   r   r   get_lora_matched_pair8   s
    z(DS_GPTNEOContainer.get_lora_matched_pairc                 C   s   d}t |j|||d||d  ||d  ||d  g| jjd tddD ]}t|j|||t|d  |||   q$tdd	D ]}t|j|||t|d  |||   q>td	d
D ]}t||||t|d  |||   qXd S )N)zattn.attention.q_proj.weightzattn.attention.k_proj.weightzattn.attention.v_proj.weightzattn.attention.out_proj.weightzattn.attention.out_proj.biaszmlp.c_fc.weightzmlp.c_fc.biaszmlp.c_proj.weightzmlp.c_proj.biaszln_2.weightz	ln_2.biaszln_1.weightz	ln_1.bias	attn_qkvwr   r   r   )	split_qkv            )r   r)   r#   rF   ranger   r
   r%   )r   r   sdweight_quantizer
mp_replaceprefixparam_namesir   r   r   load_paramsA   s*   "


zDS_GPTNEOContainer.load_paramsN)
__name__
__module____qualname__r   r   r0   r8   rD   rR   __classcell__r   r   r   r   r      s    
	r   c                       sJ   e Zd Zd fdd	Zdd Zdd Zdd	d
ZdddZdd Z  Z	S )HFGPTNEOLayerPolicyTc                    sF   t  j|dd || _zdd l}|jjjjt_	W d S    d t_	Y d S )NF)r   r   )
r   r   r$   transformersmodelsgpt_neomodeling_gpt_neoGPTNeoBlockrX   _orig_layer_class)r   r$   	inferencerY   r   r   r   r   d   s   zHFGPTNEOLayerPolicy.__init__c                 C   s"   | j jjj| j jjj| j jjtfS rS   )r$   r(   r)   	embed_dim	num_headsln_1epsDEFAULT_INTERMEDIATE_SIZEr/   r   r   r   get_hidden_headsm   s
   

z$HFGPTNEOLayerPolicy.get_hidden_headsc                 C   s.   | j jjjjd | j jjjjd | j jjjjd fS rS   )r$   r(   r)   r*   r1   r+   r,   r/   r   r   r   	get_q_k_vs   s   zHFGPTNEOLayerPolicy.get_q_k_vFc                 C   sf   | j jjjj}| j jjjj}| j jjjj}ttj	|||fdd|d}|d | j jjj
j| j jjj
jfS )Nr   )dim)requires_grad)r$   r(   r)   r*   r1   r+   r,   r   torchcatr-   bias)r   enable_trainingr2   r4   r6   qkvwr   r   r   r)   {   s   zHFGPTNEOLayerPolicy.attentionc                 C   s,   | j jjj| j jjj| j jjj| j jjjfS rS   )r$   r%   r&   r1   rk   r'   )r   rl   r   r   r   r%      s
   



zHFGPTNEOLayerPolicy.mlpc                 C   s$   | j jj| j jj| j jj| j jjfS rS   )r$   ln_2r1   rk   rb   r/   r   r   r   	layernorm   s
   zHFGPTNEOLayerPolicy.layernorm)T)F)
rT   rU   rV   r   re   rf   r)   r%   ro   rW   r   r   r   r   rX   b   s    	

rX   )basefeatures.meta_tensorr   features.split_qkvr   3deepspeed.model_implementations.transformers.ds_gptr   ri   torch.nn.parameterr   r#   r	   r
   r   r   r   BaseTransformerContainerr   rX   r   r   r   r   <module>   s   N