o
    ߥi                     @   s:  d Z ddlZddlZddlZddlm  mZ ddlmZ	 ddl
mZ G dd dejjZG dd dejjZG d	d
 d
ejjZejjdd Zdd ZG dd dejjZG dd dejjZG dd dejjZdd Zdd ZG dd dejjZG dd dejjZG dd dejjZG dd  d ejjZdS )!zTransformer.    N)FusedLayerNorm)mpuc                       s&   e Zd Z fddZdddZ  ZS )PositionalEmbeddingc                    s>   t t|   || _ddtd|d|   }| d| d S )N   i'                 @inv_freq)superr   __init__hidden_sizetorcharangeregister_buffer)selfr   r   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/model/transformer.pyr
      s   zPositionalEmbedding.__init__Nc                 C   sf   t || j}t j| | gdd}|d ur(|d d d d d f |ddS |d d d d d f S )Ndim)r   gerr   catsincosexpand)r   pos_seqbszsinusoid_inppos_embr   r   r   forward$   s
    zPositionalEmbedding.forwardN)__name__
__module____qualname__r
   r    __classcell__r   r   r   r   r      s    	r   c                       s4   e Zd ZdZ	d	 fdd	Zdd Zdd Z  ZS )
ParallelCrossAttentionz.Parallel cross-attention layer for TransformerNc                    s   t t|   |d u r|}t }t||| _t||| _t||| _tj	||d|d| _
tj	|d| dd|d| _tj|| _tj||d|d| _tj|| _tj rftjjt_tjjt_d S d S )NFgather_outputinit_method   strider(   r)   Tinput_is_parallelr)   )r	   r&   r
   r   get_model_parallel_world_sizedividehidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionColumnParallelLinearquery	key_valuer   nnDropoutattention_dropoutRowParallelLineardenseoutput_dropout	deepspeedcheckpointingis_configuredget_cuda_rng_tracker
checkpoint)r   r   num_attention_headsattention_dropout_proboutput_dropout_probr)   output_layer_init_method
world_sizer   r   r   r
   1   sH   	

zParallelCrossAttention.__init__c                 C   6   |  dd | j| jf }|j| }|ddddS z_Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        Nr   r   r*   r      sizer3   r2   viewpermuter   tensornew_tensor_shaper   r   r   _transpose_for_scores`      
z,ParallelCrossAttention._transpose_for_scoresc                 C   s(  |  |}| |}t|d\}}| |}| |}	| |}
t||	dd}|t	| j
 }|d urCt||dd|   }tjjdd|}t   | |}W d    n1 sbw   Y  t||
}|dddd	 }| d d | jf }|j| }| |}| |}|S )
Nr*   r   g     @      ?r   r   r   rI   )r5   r6   r   split_tensor_along_last_dimrQ   r   matmul	transposemathsqrtr2   mulr7   Softmaxr@   forkr9   rM   
contiguousrK   r1   rL   r;   r<   )r   hidden_statesencoder_states
cross_maskmixed_query_layermixed_x_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputr   r   r   r    j   s>   










zParallelCrossAttention.forwardr!   )r"   r#   r$   __doc__r
   rQ   r    r%   r   r   r   r   r&   .   s    /
r&   c                       sR   e Zd ZdZ				d fdd	Zdd Zedd	d
Z				dddZ  Z	S )ParallelSelfAttentiona  Parallel self-attention layer for GPT2.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence length, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size to be divisible by n.
        attention_dropout_prob: dropout probability for the attention scores.
        init_method: weight initialization.
        output_layer_init_method: output layer initialization. If None, use
                                  `init_method`.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    NFrT   c
                    s   t t|   || _|d u r|}t }
t||
| _t||| _t||
| _	|| _
|	| _tj|d| dd|d| _|rHtj||d|d| _tj|| _tj||d|d| _tj|| _tj rqtjjt_tjjt_d S d S )NrI   Fr+   r'   Tr-   )r	   rn   r
   	performerr   r/   r0   r1   r2   r3   relative_encodingattention_scaler4   query_key_valuerelativer   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   )r   r   rB   rC   rD   r)   rE   rp   ro   rq   rF   r   r   r   r
      sP   


zParallelSelfAttention.__init__c                 C   rG   rH   rJ   rN   r   r   r   rQ      rR   z+ParallelSelfAttention._transpose_for_scoresc                 C   s   t jg |  d d | ddR | j| jd}t j|| gdd}|jg |  d d | dd | dR  }|d d d d dd f | } |rvt | d| df}| t 	|| d| d d d d d d d f  } | S )NrS   r   devicedtyper   r   r   )
r   zerosrK   ru   rv   r   rL   view_asonestril)x	zero_triuzero_padx_paddedry   r   r   r   
_rel_shift   s   &4 4z ParallelSelfAttention._rel_shiftc                 C   s\  | d}|d u r| |}t|d\}	}
}n!t||fd}| |}t|d\}	}
}|	d d | d f }	| |	}| |
}| |}| jr| |}| |}||	d }t
||dd}||	d }t
||dd}| |}|| }|t| j }n/| jdkrt
|t| j |ddt| j| j  }nt
||ddt| j }t||}| jdkr|jdddd }||8 }|| j9 }|d	d|   }tjjdd
|}t   | |}W d    n1 sw   Y  t
||}|dddd }|  d d | jf }|j| }| |}| |}|S )Nr   rI   r   rS   rT   T)r   keepdimr   g     r   r*   )rK   rr   r   rU   r   r   rQ   rp   rs   	unsqueezerV   rW   r   rX   rY   r2   rq   rZ   maxr7   r[   r@   r\   r9   rM   r]   r1   rL   r;   r<   )r   r^   	ltor_maskposition_embeddingsr_w_biasr_r_biasmemquery_lengthrb   ra   rc   rd   r   re   rf   rg   relative_layer	rw_head_qac_score	rr_head_qbd_scorerh   max_attention_scoresri   rj   rk   rl   r   r   r   r      s   

















zParallelSelfAttention.forwardNFFrT   FNNNN)
r"   r#   r$   rm   r
   rQ   staticmethodr   r    r%   r   r   r   r   rn      s     6
rn   c                 C   s*   d|  dt d|  dd|  |      S )zOpenAI's gelu implementation.g      ?rT   gQ63E?gHm?)r   tanhr{   r   r   r   	gelu_implf  s
   r   c                 C   s   t | S r!   )r   r   r   r   r   gelun  s   r   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )ParallelMLPa  MLP for GPT2.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform gelu transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.

    Arguments:
        hidden_size: The hidden size of the self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
        output_layer_init_method: output layer initialization. If None,
                                  use `init_method`.
    Nc                    s\   t t|   |d u r|}tj|d| d|d| _tjd| |d|d| _tj	
|| _d S )N   Fr'   Tr-   )r	   r   r
   r   r4   dense_h_to_4hr:   dense_4h_to_hr   r7   r8   dropout)r   r   rD   r)   rE   r   r   r   r
     s    zParallelMLP.__init__c                 C   s*   |  |}t|}| |}| |}|S r!   )r   r   r   r   )r   r^   intermediate_parallelrl   r   r   r   r      s
   


zParallelMLP.forwardr!   r"   r#   r$   rm   r
   r    r%   r   r   r   r   r   r  s
    r   c                       s0   e Zd ZdZ	d fdd	Z	dddZ  ZS )ParallelDecoderLayer  A single layer transformer for GPT2.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
        output_layer_init_method: output layers (attention output and
                                  mlp output) initialization. If None,
                                  use `init_method`.
    Nc                    s   t t|   |d u r|}t||d| _t||||||d| _t||d| _t||||||d| _	t||d| _
t||||d| _d S )NepsrE   )r	   r   r
   	LayerNorminput_layernormrn   self_attentionpost_self_layernormr&   cross_attentionpost_attention_layernormr   mlp)r   r   rB   rC   rD   layernorm_epsilonr)   rE   r   r   r   r
     s@   		zParallelDecoderLayer.__init__c                 C   s^   |  |}| ||}|| }| |}| |||}	||	 }
| |
}| |}|
| }|S r!   )r   r   r   r   r   r   )r   r^   r_   r   r`   layernorm_outputself_attention_outputself_layernorm_inputself_layernorm_outputattention_outputlayernorm_input
mlp_outputrl   r   r   r   r      s   
	


zParallelDecoderLayer.forwardr!   r   r   r   r   r   r     s    #4r   c                       s<   e Zd ZdZ				d	 fdd	Z				d
ddZ  ZS )ParallelTransformerLayerr   NFrT   c                    sh   t t|   |d u r|}t||d| _t||||||||	|
d	| _t||d| _t||||d| _	d S )Nr   rE   rp   ro   rq   r   )
r	   r   r
   r   r   rn   	attentionr   r   r   )r   r   rB   rC   rD   r   r)   rE   rp   ro   rq   r   r   r   r
   0  s0   z!ParallelTransformerLayer.__init__c                 C   s\   |  |}|d ur|  |nd }| ||||||}|| }	| |	}| |}
|	|
 }|S r!   )r   r   r   r   )r   r^   r   r   r   r   r   r   r   r   r   rl   r   r   r   r    Z  s   


z ParallelTransformerLayer.forwardr   r   r   r   r   r   r   r     s    #-r   c                    s    fdd}|S )z!Init method based on N(0, sigma).c                       t jjj| d dS Nr   meanstdr   r7   initnormal_rO   sigmar   r   init_z     z#unscaled_init_method.<locals>.init_r   )r   r   r   r   r   unscaled_init_methodw  s   r   c                    s"   | t d|    fdd}|S )z3Init method based on N(0, sigma/sqrt(2*num_layers).r   c                    r   r   r   r   r   r   r   r     r   z!scaled_init_method.<locals>.init_)rX   rY   )r   
num_layersr   r   r   r   scaled_init_method  s   r   c                       sP   e Zd ZdZ									d fdd		Z	
	
		dddZdddZ  ZS )GPT2ParallelTransformera  GPT-2 transformer.

    This module takes input from embedding layer and it's output can
    be used directly by a logit layer. It consists of L (num-layers)
    blocks of:
        layer norm
        self attention
        residual connection
        layer norm
        mlp
        residual connection
    followed by a final layer norm.

    Arguments:
        num_layers: Number of transformer layers.
        hidden_size: The hidden size of the self attention.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        checkpoint_activations: if True, checkpoint activations.
        checkpoint_num_layers: number of layers to checkpoint. This
                               is basically the chunk size in checkpoitning.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method_std: standard deviation of the init method which has
                         the form N(0, std).
        use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers)
                                            scaling for the output weights (
                                            output of self attention and mlp).
    r   h㈵>{Gz?TFrT   c                    s   t t|   | _|	| _|
| _|| _	| _| _	r
rJ d |r(t	|t
j|| _
| _|| _
rt| _t }t| _t|| _t
jt
| j| j| _d| j_t
jt
| j| j| _d| j_t
  | j  | j  W d    n1 sw   Y  n7|rt
j|d | _t
j|d | _t
jj j!| jj"dd nt
j|| _t
jj j!| jj"dd  	
fddt
j#fddt$|D | _%t&d	| _'t(j)* rt(j)j+t_+t(j)j,t_,d S d S )
NTr   r   r   c                      s>   
rt  tdS t t	d
S )Nr   r   )r   r   r   r   )rC   rq   r   init_method_stdr   rB   rD   rE   ro   rp   use_decoder_layerr   r   	get_layer  s,   	z3GPT2ParallelTransformer.__init__.<locals>.get_layerc                    s   g | ]}  qS r   r   ).0_)r   r   r   
<listcomp>  s    z4GPT2ParallelTransformer.__init__.<locals>.<listcomp>r   )-r	   r   r
   r   checkpoint_activationscheckpoint_num_layersmax_memory_lengthro   r   r   r   r7   r8   embedding_dropoutrp   block_position_encodingr   r   r   r/   r0   r2   r3   	ParameterTensorr   model_parallelr   no_gradzero_	Embeddingblock_position_embeddingsr   r   weight
ModuleListrangelayersr   final_layernormr=   r>   r?   r@   rA   )r   r   r   rB   max_sequence_lengthr   embedding_dropout_probrC   rD   r   r   r   r   "use_scaled_init_for_output_weightsrp   r   ro   r   rq   rF   r   )rC   rq   r   r   r   r   rB   rD   rE   ro   rp   r   r   r
     s   



 

z GPT2ParallelTransformer.__init__Nc                    s    d d \ }|r|d  dnd}	||	 }
t|dkp(t| k}jr:s2J d|	dks:J d|rXrB| n|}d fdd	}jsW||||	d}n|d d d d d d | |	 d f }jrtj|
d d	d
jjd}	|}
|}n)jr|d d df |d d df }}	|}| jr|}| 
fddjdksrȈgng fdd}jr"d}tj}j}||k r!js|gn||g}jr||jjg7 }|r|||||  7 }tj|||| g|R  ||7 }||k snEtjD ]?\}}js3|gn||g}jrE||jjg7 }|rL|| nd }||d|ijdks^re q'}jdksur}j|d|fS )Nr*   r   r   zFattention_mask should be a scalar to indicate the seperation position.zDo not support transformer-xl.c                    s    d| | f}t|}rd|dd d d |f< n)| dd}tj| |j|jddd}||ddk }||	d
|d}|dkr^| dd}tj  | |f|fdd}|	d}|S )Nr   r   r   rt   r*   r   )new_onesr   rz   r   r   ru   rv   rL   masked_fillr   	expand_asr   )
seq_lengthsepmemory_lengthmidsmask)
batch_sizer^   	is_scalarr   r   build_mask_matrix2  s2   

z:GPT2ParallelTransformer.forward.<locals>.build_mask_matrix)r   r   g      rt   c                    s    r|   S | S r!   )detach)_hidden_states)detach_memoryr   r   check_detache  s   z5GPT2ParallelTransformer.forward.<locals>.check_detachc                    s    fdd}|S )Nc                     s   j  }| d | dd  }} jr#| d d | dd  } }n| d d | dd  } }t|D ]%\}}|r>|| nd }||g| R d|i}jdksRrY | q4|S )Nr   r   r   r   )r   rp   	enumerater   append)inputslayers_x_mems_ilayermem_i_)r   end
mem_layersreturn_memoryr   startr   r   custom_forwardq  s   zGGPT2ParallelTransformer.forward.<locals>.custom.<locals>.custom_forwardr   )r   r   r   )r   r   r   r   )r   r   r   customo  s   z/GPT2ParallelTransformer.forward.<locals>.customr   )r   )r   )rK   r   numelro   itemrp   r   ru   rv   r   r   r   r   r   r   lenr   r   r   r   r   r   rA   r   r   r   update_mems)r   r^   position_idsattention_maskmemory_statesr_   r   r   r   r   
key_lengthis_sepr   r   position_sequencer   block_position_idsr   r  lr   chunk_lengthargsr   r   mem_irl   r   )r   r   r   r^   r   r   r   r   r   r      s   






zGPT2ParallelTransformer.forwardc           	      C   s   |r	|d  dnd}|d  d}|| }|st| j|}g }tt|D ]2}||kr=||| d d | d f  q&|tj|| d d | | d f || fdd q&|S )Nr   r   r   )rK   minr   r   r  r   r   r   )	r   hiddensmemsr   r   r   new_memory_lengthnew_memsr   r   r   r   r    s"   " z#GPT2ParallelTransformer.update_mems)	r   r   r   TFFFFrT   )NNFTr   )r"   r#   r$   rm   r
   r    r  r%   r   r   r   r   r     s&    -t
 r   c                       s8   e Zd ZdZdejf fdd	Zdd Zdd Z  Z	S )	BertParallelSelfAttentiona  Parallel self-attention layer for BERT.

    Self-attention layer takes input with size [b, s, h] where b is
    the batch size, s is the sequence lenght, and h is the hidden size
    and creates output of the same size.
    Arguments:
        hidden_size: total hidden size of the layer (h).
        num_attention_heads: number of attention heads (n). Note that we
                             require n to be divisible by number of GPUs
                             used to parallelize the model. Also, we
                             require hidden size be divisible by n.
        dropout_prob: dropout probability for the attention scores.
        output_parallel: If true, no all-gather is done on the output and
                         the output values will be per partition.
    We use the following notation:
        h: hidden_size
        n: num_attention_heads
        p: number of partitions
        np: n/p
        hp: h/p
        hn: h/n
        b: batch size
        s: sequence length
    Fc                    s   t t|   || _|| _|| _|| _t }t	||| _
t	||| _t	||| _tj|d| dd|d| _tj|| _tj rQtjjt_tjjt_d S d S )NrI   Fr+   )r	   r  r
   r   rB   dropout_proboutput_parallelr   r/   r0   r1   r2   r3   r4   rr   r   r7   r8   r   r=   r>   r?   r@   rA   )r   r   rB   r  r  r)   rF   r   r   r   r
     s2   	

z"BertParallelSelfAttention.__init__c                 C   rG   rH   rJ   rN   r   r   r   rQ     rR   z/BertParallelSelfAttention._transpose_for_scoresc                 C   s  |  |}t|d\}}}| |}| |}| |}	tt| j}
t||
 |	dd|
 }||7 }tj
jdd|}t   | |}W d    n1 sWw   Y  t||	}|dddd }| d d | jf }|j| }| jr|}|S t|}|S )NrI   r   rS   r   r   r*   r   )rr   r   rU   rQ   rX   rY   r2   r   rV   rW   r7   r[   r@   r\   r   rM   r]   rK   r1   rL   r  !gather_from_model_parallel_region)r   r^   r  rb   ra   rc   rd   re   rf   rg   norm_factorrh   ri   rj   rk   rl   r   r   r   r      s<   





z!BertParallelSelfAttention.forward)
r"   r#   r$   rm   r   xavier_normal_r
   rQ   r    r%   r   r   r   r   r    s    #
r  c                       s2   e Zd ZdZddejf fdd	Zdd Z  ZS )BertParallelTransformerOutputz[The output layer used after self attention and intermediate
    parts of transformer layer.g-q=Fc                    sB   t t|   tj||||d| _tj|| _	t
||d| _d S )Nr-   r   )r	   r  r
   r   r:   r;   r   r7   r8   r   r   	layernorm)r   
input_sizeoutput_sizer  r   r.   r)   r   r   r   r
   7  s   z&BertParallelTransformerOutput.__init__c                 C   s*   |  |}| |}|| }| |}|S r!   )r;   r   r  )r   r^   input_tensorr   r   r   r   r    H  s
   


z%BertParallelTransformerOutput.forward	r"   r#   r$   rm   r   r  r
   r    r%   r   r   r   r   r  3  s    r  c                       s.   e Zd ZdZejf fdd	Zdd Z  ZS )BertParallelTransformerLayera_  A single layer transformer for Bert.

    We use the following notation:
        h: hidden size
        n: number of attention heads
        b: batch size
        s: sequence length
    Transformore layer takes input with size [b, s, h] and returns an
    output of the same size.

    Arguments:
        hidden_size: The hidden size of the self attention.
        intermediate_size: size of the intermediate state after
                           self attention. In both BERT and GPT
                           this is set to be 4 times the hidden
                           size.
        num_attention_heads: number of attention head in the self
                             attention.
        attention_dropout_prob: dropout probability of the attention
                                score in self attention.
        output_dropout_prob: dropout probability for the outputs
                             after self attention and final output.
        intermediate_activation_fn: activation function for output
                                    of intermediate.
        layernorm_epsilon: epsilon used in layernorm to avoid
                           division by zero.
        init_method: initialization method used for the weights. Note
                     that all biases are initialized to zero and
                     layernorm weight are initialized to one.
    c	           	         sl   t t|   t|||d|d| _t||||d|d| _tj||d|d| _	|| _
t||||d|d| _d S )NT)r  r)   )r   r.   r)   Fr'   )r	   r"  r
   r  r   r  self_outputr   r4   intermediateintermediate_activation_fnrl   )	r   r   intermediate_sizerB   rC   rD   r%  r   r)   r   r   r   r
   p  s>   	z%BertParallelTransformerLayer.__init__c                 C   s<   |  ||}| ||}| |}| |}| ||}|S r!   )r   r#  r$  r%  rl   )r   r^   r  attention_output_parallelattention_self_outputintermediate_output_parallellayer_outputr   r   r   r      s   
z$BertParallelTransformerLayer.forwardr!  r   r   r   r   r"  P  s
    '*r"  )rm   rX   r=   r   torch.nn.initr7   r   #apex.normalization.fused_layer_normr   r   megatron_utilr   Moduler   r&   rn   jitscriptr   r   r   r   r   r   r   r   r  r  r"  r   r   r   r   <module>   s2   l M
5ld	
  5v