o
    Ti                     @   s   d dl T d dlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 dd	l	mZ dd
l	mZ ddl	mZ ddl	mZ ddlmZ G dd deeeZG dd de
ZdS )   )*)MetaTensorContainerHybridSplitQKVContainer    )DeepSpeedOPTInferenceN)	Parameter   )TransformerPolicy)transformer_param_names)
maybe_copy)maybe_copy_qkvmaybe_get_lora)ActivationFuncTypec                       sF   e Zd Z fddZdddZdd Zdd	 Zd
d Zdd Z  Z	S )DS_OPTContainerc                    s   t  jdi | d S )N )super__init__)selfkwargs	__class__r   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/opt.pyr      s   zDS_OPTContainer.__init__Nc                 C   s4   |d ur|n| j }t|| jd| _| j| jj_| jS )N)mp_group)ds_model_configr   r   modulescale_attentionconfig)r   r   _configr   r   r   create_module   s   zDS_OPTContainer.create_modulec                 C   sL   dd | j jj| j jj| j jjj| j jjj| j jjj| j jjjfD | _	dS )zD
        Necessary to implement for `HybridEngineContainer`
        c                 S   s   g | ]}t |qS r   r   ).0pr   r   r   
<listcomp>$   s    z3DS_OPTContainer.set_lora_params.<locals>.<listcomp>N)
policyclient_modulefc1fc2	self_attnq_projk_projv_projout_projlora_paramsr   r   r   r   set_lora_params    s   



zDS_OPTContainer.set_lora_paramsc                 C   sd   | j jjjj| _| j jjjj| _| j jjjj| _	| j jjjj| _
| j jjjj| _| j jjjj| _dS )zF
        Necessary to implement for `HybridSplitQKVContainer`
        N)r#   r$   r'   r(   weightqwbiasqbr)   kwkbr*   vwvbr-   r   r   r   	set_q_k_v/   s   zDS_OPTContainer.set_q_k_vc                 C   sL   |   \}}}}}}|| jf|| jf|| jf|| jf|| jf|| jfg}|S N)get_lora_params_h4h_w_4hh_wdense_wr0   r3   r5   )r   fc1_lorafc2_loraq_lorak_lorav_loraout_loraretr   r   r   get_lora_matched_pair:   s
    z%DS_OPTContainer.get_lora_matched_pairc                 C   s   d}t dddD ]'}t|j|||t|d  |||  |||d   |||d   g| jjd qt ddD ]}t|j|||t|d	  |||   q5t dd
D ]}t|j|||t|d	  |||   qOt d
dD ]}t||||t|d	  |||   qid S )N)zself_attn.q_proj.weightzself_attn.k_proj.weightzself_attn.v_proj.weightzself_attn.q_proj.biaszself_attn.k_proj.biaszself_attn.v_proj.biaszself_attn.out_proj.weightzself_attn.out_proj.biasz
fc1.weightzfc1.biasz
fc2.weightzfc2.biaszfinal_layer_norm.weightzfinal_layer_norm.biaszself_attn_layer_norm.weightzself_attn_layer_norm.biasr         r   r   )	split_qkv            )ranger   	attentionr
   r#   rG   r   mlp)r   r   sdweight_quantizer
mp_replaceprefixparam_namesir   r   r   load_params@   s.   
(


zDS_OPTContainer.load_paramsr8   )
__name__
__module____qualname__r   r   r.   r7   rD   rU   __classcell__r   r   r   r   r      s    
r   c                       sF   e Zd ZdZd fdd	Zdd Zddd	Zdd
dZdd Z  Z	S )HFOPTLayerPolicyNTc                    s   t  j|dd|d || _zdd l}|jjjjt_	W n   d t_	Y t
tdrNt
tjdrNtjjdkr9tj| _d S tjjdv rEtj| _d S tdtjjtj| _d S )	NT)linear_layerpre_attn_normuse_load_prefixr   hf_model_configactivation_functionrelu)gelugelu_newz#Unsupported activation function: {})r   r   r$   transformersmodelsoptmodeling_optOPTDecoderLayerrZ   _orig_layer_classhasattrr	   r^   r_   r   ReLUmlp_act_func_typeGELU
ValueErrorformat)r   r$   	inferencer]   rc   r   r   r   r   j   s$   zHFOPTLayerPolicy.__init__c                 C   s   | j jj| j jj| j jjtfS r8   )r$   r'   	embed_dim	num_headsself_attn_layer_normepsDEFAULT_INTERMEDIATE_SIZEr-   r   r   r   get_hidden_heads   s
   z!HFOPTLayerPolicy.get_hidden_headsFc           
      C   s   | j jjj}| j jjj}| j jjj}| j jjj}| j jjj}| j jjj}ttj	|||fdd|d}ttj	|||fdd|d}	||	| j jj
j| j jj
jfS )Nr   )dim)requires_grad)r$   r'   r(   r/   r1   r)   r*   r   torchcatr+   )
r   enable_trainingr0   r2   r3   r4   r5   r6   qkvwqkvbr   r   r   rM      s   

zHFOPTLayerPolicy.attentionc                 C   $   | j jj| j jj| j jj| j jjfS r8   )r$   r%   r/   r1   r&   )r   rz   r   r   r   rN      
   zHFOPTLayerPolicy.mlpc                 C   r}   r8   )r$   final_layer_normr/   r1   rr   r-   r   r   r   	layernorm   r~   zHFOPTLayerPolicy.layernorm)TT)F)
rV   rW   rX   rh   r   ru   rM   rN   r   rY   r   r   r   r   rZ   g   s    

rZ   )basefeaturesr   r   3deepspeed.model_implementations.transformers.ds_optr   rx   torch.nn.parameterr   r#   r	   r
   r   r   r   deepspeed.utils.typesr   BaseTransformerContainerr   rZ   r   r   r   r   <module>   s   T