o
    Ti5                     @   s\   d dl mZ d dlZd dlZd dlmZ d dlmZ dZG dd deZ	G dd	 d	eZ
dS )
    )ABCN)DeepSpeedInferenceConfig)get_acceleratorc                   @   s   e Zd Zdd ZdS )BaseConvolutionContainerc                 C   s   d S N selfr   r   [/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/containers/base.py__init__   s   z!BaseConvolutionContainer.__init__N)__name__
__module____qualname__r   r   r   r   r   r      s    r   c                   @   s   e Zd Zdd Zdd Zdd Zd=dd	Zd
d Zdd Zd=ddZ	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd=d$d%Zd=d&d'Zd=d(d)Zd=d*d+Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Zd<S )>BaseTransformerContainerc                 C   s  || _ || _|| _|| _|| _| j j| _| j j| _d| _d | _	d | _
d | _| jjj| _t| jdr5| jjn| j j| _| jj| _| j j| _| j j| _| jj| _d| _t| jdr`| jj| j dknd| _t| jdd| _| j j| _| j j| _| jj| _d| _ | jj!| _!| jj"| _"t| jdd| _#| j j$| _$d| _%| & | _'| j'd u p| j'd	k | _(d | _)d | _*d | _+d | _,d | _-d | _.d | _/d | _0d | _1d | _2d | _3d | _4d | _5d| _6|j6ot7j8| _6d S )
NFdo_layer_norm_beforeTattention_layerslocalwindow_size   scale_attn_by_inverse_layer_idxr   )9policyconfigmodel_configlayer_idchildis_megatron_v2megatron_v2scale_attentionckpt_load_enabledhidden_sizeintermediate_sizenum_attention_headstensor_paralleltp_sizemp_sizehasattrr   pre_attn_normpre_layer_normdtypelinear_layerattn_linear_layermlp_linear_layerreturn_tupletriangular_maskingr   local_attentiongetattrr   mlp_act_func_type	norm_typetraining_mp_sizebigscience_bloommax_out_tokensmin_out_tokensr   use_mupreturn_single_tupleget_rotary_dim
rotary_dimmlp_after_attnqkvwqkvbdense_wdense_b_h4h_w_h4h_b_4hh_w_4hh_battn_nwattn_nbinput_nwinput_nbmp_group
use_triton	deepspeed
HAS_TRITON)r
   r   r   r   r   r   r   r   r   r      sj   














z!BaseTransformerContainer.__init__c                 C   sV  | j | j   | j| j dksJ dtd!i d| jd| jd| jd| jd| j	d| j
d	| jd
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jjd| jjd| jd| jj| _| jrt j!rddl"m#} t$| |st%d| jjsdd l&m'} |(  | jS )"Nr   zTo run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!This is because the attention computation is partitioned evenly among the parallel GPUs.r    r!   headslayer_norm_epsr)   r(   r2   r%   r-   r.   r/   r   r:   r;   r1   r3   r4   r5   r6   r   r7   r8   set_empty_paramstransposed_moderI   triton_autotuner   )DS_BERTContainerz0Triton kernels are only for BERT-like models yet)fp16_matmulr   ))set_hidden_headsr   get_hidden_headsr"   r%   r   r    r!   layernorm_epsilonr)   r(   r2   r-   r.   r/   r   r:   r;   r1   r3   r4   r5   r6   r   r7   r8   r   rN   rO   rI   rP   ds_model_configrJ   rK   bertrQ   
isinstanceNotImplementedError5deepspeed.ops.transformer.inference.triton.matmul_extrR   skip_autotune)r
   rQ   rR   r   r   r   create_ds_model_configX   s   	

z/BaseTransformerContainer.create_ds_model_configc                 C   s2   t | jdr| jjr| jsJ dd S d S td)Nis_metaz8Meta tensors are not supported for this model currently.zCMeta tensor support is not available, please upgrade to torch 1.10+)r&   r<   r]   r   rY   r	   r   r   r   check_meta_tensor_support   s   z2BaseTransformerContainer.check_meta_tensor_supportFc                 C   s<   | j | jj|d  | j| jj|d  | j| j   d S )N)enable_training)set_attentionr   	attentionset_mlpmlpset_layernorm	layernorm)r
   r_   r   r   r   initialize_tensors   s   z+BaseTransformerContainer.initialize_tensorsc                    s    j tjtjfv rC j D ]6\}}t|tr,tdd |D r, fdd|D  j|< t|tj	s9t|tj
jrB| j  j|< qd S d S )Nc                 s   s*    | ]}t |tjpt |tjjV  qd S r   )rX   torchTensornn	Parameter).0tensorr   r   r   	<genexpr>   s    
zEBaseTransformerContainer.convert_to_required_dtype.<locals>.<genexpr>c                    s   g | ]}|  jqS r   )tor)   )rk   
moe_tensorr	   r   r   
<listcomp>   s    zFBaseTransformerContainer.convert_to_required_dtype.<locals>.<listcomp>)r)   rg   halfbfloat16__dict__itemsrX   listallrh   ri   rj   rn   )r
   kvr   r	   r   convert_to_required_dtype   s   z2BaseTransformerContainer.convert_to_required_dtypec                 C   s<   t | jdr
| jjS t | jdrt | jjdr| jjjS dS )Nr:   ra   rotary_ndimsr   )r&   r   r:   r   ra   rz   r	   r   r   r   r9      s
   
z'BaseTransformerContainer.get_rotary_dimc                 C   
   || _ d S r   )moe)r
   r|   r   r   r   set_moe      
z BaseTransformerContainer.set_moec                 C   s   || _ || _d S r   )r%   rH   )r
   r%   rH   r   r   r   set_tensor_parallel_config   s   
z3BaseTransformerContainer.set_tensor_parallel_configc                 C   r{   r   )	quantizer)r
   r   r   r   r   set_quantization_config   r~   z0BaseTransformerContainer.set_quantization_configc                 C   s0   || _ |tkrd| | _n|| _|| _|| _dS )a  
        Args:
            hidden_size: embedding dimension of the model
            num_attention_heads: number of attention heads in the model
            epsilon: epsilon value for layer norm (same value used for all norms)
            intermediate_size: Size of MLP projection. If `DEFAULT_INTERMEDIATE_SIZE` is passed
                it is assumed to be `4 * hidden_size`
           N)r    DEFAULT_INTERMEDIATE_SIZEr!   r"   rU   )r
   r    r"   epsilonr!   r   r   r   rS      s   	
z)BaseTransformerContainer.set_hidden_headsc                 C      || _ || _|| _|| _d S r   r<   r=   r>   r?   )r
   r<   r=   r>   r?   r   r   r   r`         
z&BaseTransformerContainer.set_attentionc                 C   r   r   r@   rA   rB   rC   )r
   r@   rA   rB   rC   r   r   r   rb      r   z BaseTransformerContainer.set_mlpc                 C   r   r   )rD   rE   rF   rG   )r
   rD   rE   rF   rG   r   r   r   rd      r   z&BaseTransformerContainer.set_layernormc                 C      |    |   d S r   )attention_quantizationmlp_quantizationr	   r   r   r   apply_weight_quantization   s   z2BaseTransformerContainer.apply_weight_quantizationc                 C   4   | j | jjj| jj_| j | jjj| jj_d S r   )r   quantizemodulera   	attn_qkvwattn_owr	   r   r   r   r         z/BaseTransformerContainer.attention_quantizationc                 C   r   r   )r   r   r   rc   inter_woutput_wr	   r   r   r   r      r   z)BaseTransformerContainer.mlp_quantizationc                 C   s,   |  | | | | | | | d S r   )attention_qkv_mpattention_o_mpmlp_inter_mpmlp_output_mp)r
   
mp_replacer   r   r   apply_tensor_parallelism   s   


z1BaseTransformerContainer.apply_tensor_parallelismc                 C   sD   |j | jjj| jd|d| jj_|j | jjj| jd|d| jj_d S )N   )
num_splitsint8)strided_copyr   ra   r   r<   	attn_qkvbr=   r
   r   reversed_dimr   r   r   r      s   z)BaseTransformerContainer.attention_qkv_mpc                 C   B   |j | jjj| j|d| jj_|j | jjj| j||d| jj_d S Nr   )r   allocate_tensor)copyr   ra   r   r>   attn_obr?   r   r   r   r   r         z'BaseTransformerContainer.attention_o_mpc                 C   s@   |j | jjj| j|d| jj_|j | jjj| j|d| jj_d S )Nr   )r   r   rc   r   r@   inter_brA   r   r   r   r   r      s   "z%BaseTransformerContainer.mlp_inter_mpc                 C   r   r   )r   r   rc   r   rB   output_brC   r   r   r   r   r     r   z&BaseTransformerContainer.mlp_output_mpc              
   C   s   | j | jd}|D ]&}|| d u rt| jj|d  q	t| jj|tjj|| 	t
   q	| j| jd}|D ]$}|| d u rIt| j|d  q9t| j|tjj|| 	t
   q9d S )N)rD   rE   )norm_wnorm_b)rD   rE   setattrr   rc   rg   ri   	parameterrj   rn   r   current_device_namerF   rG   )r
   paramskeyr   r   r   copy_data_to_new_module  s   
z0BaseTransformerContainer.copy_data_to_new_modulec                 C   r   r   )transpose_attentiontranspose_mlpr	   r   r   r   	transpose  s   z"BaseTransformerContainer.transposec                 C   .   | j r| | jj| _| | jj| _d S d S r   )r+   transpose_implr<   datar>   r	   r   r   r   r         z,BaseTransformerContainer.transpose_attentionc                 C   r   r   )r,   r   r@   r   rB   r	   r   r   r   r   %  r   z&BaseTransformerContainer.transpose_mlpc                 C   sV   |  }|d|dd  d ||jd |jd }|t   |S )Nr   )
contiguousreshapecopy_r   shapern   r   r   )r
   r   r   r   r   r   *  s
   "z'BaseTransformerContainer.transpose_implc                 C   s4   | j | j| j| jg}||   ||   |S r   )rD   rE   rF   rG   extendget_attn_paramsget_mlp_params)r
   r   r   r   r   get_all_params1  s   z'BaseTransformerContainer.get_all_paramsc                 C      | j | j| j| jgS r   r   r	   r   r   r   r   >     z(BaseTransformerContainer.get_attn_paramsc                 C   r   r   r   r	   r   r   r   r   A  r   z'BaseTransformerContainer.get_mlp_paramsN)F) r   r   r   r   r\   r^   rf   ry   r9   r}   r   r   rS   r`   rb   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      s<    <-






r   )abcr   rg   rJ   *deepspeed.ops.transformer.inference.configr   deepspeed.acceleratorr   r   r   r   r   r   r   r   <module>   s   