o
    Ti                     @   s   d dl T d dlmZ d dlmZ ddlmZ ddlZddlm	Z	 dd	l
mZ dd
l
mZ ddl
mZ ddl
mZ ddl
mZ G dd deeeZG dd deZdS )   )*)MetaTensorContainer)HybridSplitQKVContainer    )DeepSpeedGPTInferenceN)	Parameter   )TransformerPolicy)transformer_param_names)
maybe_copy)maybe_copy_qkvmaybe_get_lorac                       sF   e Zd Z fddZdddZdd Zdd	 Zd
d Zdd Z  Z	S )DS_GPTJContainerc                    s   t  jdi | d S )N )super__init__)selfkwargs	__class__r   [/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/gptj.pyr      s   zDS_GPTJContainer.__init__Nc                 C   s4   |d ur|n| j }t|| jd| _| j| jj_| jS )N)mp_group)ds_model_configr   r   modulescale_attentionconfig)r   r   _configr   r   r   create_module   s   zDS_GPTJContainer.create_modulec                 C   sP   dd | j jjj| j jjj| j jjj| j jjj| j jjj| j jjj	fD | _
dS )zD
        Necessary to implement for `HybridEngineContainer`
        c                 S   s   g | ]}t |qS r   r   ).0pr   r   r   
<listcomp>%   s    z4DS_GPTJContainer.set_lora_params.<locals>.<listcomp>N)policyclient_modulemlpfc_infc_outattnq_projk_projv_projout_projlora_paramsr   r   r   r   set_lora_params!   s   z DS_GPTJContainer.set_lora_paramsc                 C   sL   |   \}}}}}}|| jf|| jf|| jf|| jf|| jf|| jfg}|S N)get_lora_params_h4h_w_4hh_wdense_wqwkwvw)r   fc1_lorafc2_loraq_lorak_lorav_loraout_loraretr   r   r   get_lora_matched_pair-   s
    z&DS_GPTJContainer.get_lora_matched_pairc                 C   sF   | j jjjj| _d| _| j jjjj| _d| _	| j jjj
j| _d| _dS )zF
        Necessary to implement for `HybridSplitQKVContainer`
        N)r"   r#   r'   r(   weightr4   qbr)   r5   kbr*   r6   vbr-   r   r   r   	set_q_k_v3   s   
zDS_GPTJContainer.set_q_k_vc                 C   s   d}t |j|||d||d  ||d  ||d  g| jjd tddD ]}t|j|||t|d  |||   q$tdd	D ]}t|j|||t| |||   q>td	d
D ]}t||||t|d  |||   qVd S )N)
zattn.q_proj.weightzattn.k_proj.weightzattn.v_proj.weightzattn.out_proj.weightzmlp.fc_in.weightzmlp.fc_in.biaszmlp.fc_out.weightzmlp.fc_out.biaszln_1.weightz	ln_1.bias	attn_qkvwr   r   r   )	split_qkv         
   )r   	attentionr"   rE   ranger   r
   r$   )r   r   sdweight_quantizer
mp_replaceprefixparam_namesir   r   r   load_params>   s*   "


zDS_GPTJContainer.load_paramsr/   )
__name__
__module____qualname__r   r   r.   r>   rC   rR   __classcell__r   r   r   r   r      s    
r   c                       sF   e Zd ZdZd fdd	Zdd Zddd	Zdd
dZdd Z  Z	S )HFGPTJLayerPolicyNTc                    sF   t  j|dd || _zdd l}|jjjjt_	W d S    d t_	Y d S )NT)r   r   )
r   r   r#   transformersmodelsgptjmodeling_gptj	GPTJBlockrW   _orig_layer_class)r   r#   	inferencerX   r   r   r   r   _   s   zHFGPTJLayerPolicy.__init__c                 C   s   | j jj| j jj| j jjtfS r/   )r#   r'   	embed_dimnum_attention_headsln_1epsDEFAULT_INTERMEDIATE_SIZEr-   r   r   r   get_hidden_headsh   s
   z"HFGPTJLayerPolicy.get_hidden_headsFc                 C   sT   | j jjj}| j jjj}| j jjj}ttj|||fdd|d}|d | j jj	jd fS )Nr   )dim)requires_grad)
r#   r'   r(   r?   r)   r*   r   torchcatr+   )r   enable_trainingr4   r5   r6   qkvwr   r   r   rJ   n   s   
zHFGPTJLayerPolicy.attentionc                 C   s,   | j jjj| j jjj| j jjj| j jjjfS r/   )r#   r$   r%   r?   biasr&   )r   ri   r   r   r   r$   z   s
   



zHFGPTJLayerPolicy.mlpc                 C   s   d d | j jj| j jjfS r/   )r#   ra   r?   rk   r-   r   r   r   	layernorm   s
   zHFGPTJLayerPolicy.layernorm)T)F)
rS   rT   rU   r]   r   rd   rJ   r$   rl   rV   r   r   r   r   rW   \   s    	

rW   )basefeatures.meta_tensorr   features.split_qkvr   3deepspeed.model_implementations.transformers.ds_gptr   rg   torch.nn.parameterr   r"   r	   r
   r   r   r   BaseTransformerContainerr   rW   r   r   r   r   <module>   s   H