o
    TiC                      @   s   d dl mZmZ d dlmZmZ d dlZd dlmZ dZ	G dd deZ
G dd	 d	e
Zd
d ZdddZ				dddZdddZdd Zdd Zdd ZdS )    )ABCabstractmethod)ActivationFuncTypeNormTypeN)get_accelerator)	attn_qkvw	attn_qkvbattn_owattn_obinter_winter_boutput_woutput_battn_nwattn_nbnorm_wnorm_bc                   @   s$   e Zd ZdZdd Zedd ZdS )DSPolicyNc                 C   s
   d| _ d S NF)cuda_graph_supportedself r   R/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/policy.py__init__   s   
zDSPolicy.__init__c                 C      t z
        Returns attention qkv and dense parameters
        weight: (3*hidden, hidden) and (hidden, hidden)
        bias: (3*hidden) and (hidden)
        NotImplementedErrorr   r   r   r   	attention       zDSPolicy.attention)__name__
__module____qualname___orig_layer_classr   r   r   r   r   r   r   r      s
    r   c                
       sj   e Zd ZdZdddddejdddejf
 fdd	Ze	dd Z
e	dd	 Ze	d
d Ze	dd Z  ZS )TransformerPolicyNTFc                    sP   t    d| _|| _|| _|| _|| _|| _|| _|| _	|| _
|	| _|
| _d S r   )superr   r   	inferencelinear_layerscale_attentionis_megatron_v2use_mupmlp_act_func_typepre_attn_normuse_load_prefix	split_qkv	norm_type)r   r'   r(   r)   megatron_v2r+   r,   r-   r.   r/   r0   	__class__r   r   r   /   s   

zTransformerPolicy.__init__c                 C   r   r   r   r   r   r   r   r   M   r    zTransformerPolicy.attentionc                 C   r   )z8
        return hidden_size and number of heads
        r   r   r   r   r   get_hidden_headsV   s   z"TransformerPolicy.get_hidden_headsc                 C   r   )z
        Returns mlp intermediate and output
        weight: (intermediate, hidden) and (hidden, intermediate)
        bias: (intermediate) and (hidden)
        r   r   r   r   r   mlp]   r    zTransformerPolicy.mlpc                 C   r   )z
        Returns LayerNorms used in transformer layer
        Post-Attention and pre/post layer norm
        gamma and beta with shape: (hidden)
        r   r   r   r   r   	layernormf   r    zTransformerPolicy.layernorm)r!   r"   r#   hf_model_configr   GELUr   	LayerNormr   r   r   r4   r5   r6   __classcell__r   r   r2   r   r%   *   s*    


r%   c                 C   sn   t   |  } | ddd}| d| d }W d    n1 s&w   Y  | | jd | jd S )N)torchno_grad
contiguous	transposereshapecopy_shape)datadata1r   r   r   r@   q   s   
r@      c                 C   s   ||j  }d}| j| | }|  d | ||f }| j| }tj||jd d dd\}}}	t|jdkrU|jd fd }
tj||
||
|	|
f|d| jS tj|d|d|	dfdd| jS )Nr;      dim   r   )r;   )	mp_sizerC   sizeviewr=   splitlencatrA   )xheads
mp_replace	outer_dimattention_head_sizenew_x_shapex_1qkv	new_shaper   r   r   
_transpose{   s   

 .r\   Fc
                 C   s  ||v rt | |}
|| }t|
jdkr8|r|j|
|dd}
n||
|}
|r7|r7tjjt	|
|	|d
 }
nB|rQ|j|
||jrD|nt|
 d|jd}
n)|rg|rgt	t||	|d
 }|jrgt|}|j|
||jrq|nt||jd}
t| ||
 d S d S )NrF   rG   
num_splits)rR   rS   r^   int8r`   )getattrrO   rC   strided_copycopyr=   nn	parameter	Parameterr\   r?   quantizeq_int8r@   setattr)modulesdweight_quantizerrS   dst_namesrc_nameqkvr1   r/   rR   dsttmpr   r   r   
maybe_copy   s6   


rs   c                 C   s  |d |v r||d  }||d  }||d  }	t j|||	fdd}
t| |}t|jdkrB|r;|j||
 dd}n?|||
}n8|ra|j|||j	rT|

t  nt|
 d|j	d}n|j|||j	rq|

t  nt|
|j	d}t| || d S d S )	Nr   rF   rJ   rH   rG   r]   r_   ra   )r=   rP   rb   rO   rC   rc   r?   rd   rh   ri   tor   device_namer@   rj   )rk   rl   rm   rS   rn   	src_namesr/   rX   rY   rZ   qkv_datarq   r   r   r   maybe_copy_qkv   s.   
 
 rx   c           
      C   s   |d |v rB||d  }||d  }t j||fdd}t| |}	|j|	||jr0|t  nt	|d|jd}	t
| ||	 d S d S )Nr   rF   rH   rJ   r_   )r=   rP   rb   rc   rh   ri   rt   r   ru   r@   rj   )
rk   rl   rm   rS   rn   rv   reg_proj	gate_proj	mlp1_datarq   r   r   r   maybe_copy_geglu   s   
 r|   c                 C   s   | j | j| jgS )N)lora_right_weightlora_left_weightlora_scaling)pr   r   r   pack_lora_weights   s   r   c                 C   s   t | drt| }|S g }|S )Nr}   )hasattrr   )r   
lora_paramr   r   r   maybe_get_lora   s
   
r   )rF   N)FFFrF   )F)abcr   r   deepspeed.utils.typesr   r   r=   deepspeed.acceleratorr   transformer_param_namesr   r%   r@   r\   rs   rx   r|   r   r   r   r   r   r   <module>   s$   G



#