o
    ߥiI3                     @   s   d dl Z d dlZd dlmZmZ d dlZd dlZd dlmZ d dlm	Z
 d dlmZ d dlmZ ddlmZ G d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZdS )    N)OptionalUnion)nn)
functional)PreTrainedModel)	ModelFile   )GPTMoEConfigc                       s>   e Zd ZdZ fddZdd Z	dddZdd	d
Z  ZS )GPTMoESelfAttentionzParallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    c                    s~   t    |j| _|j| _| j| j | _t| jd| j | _tjdd| _	t
|j| _t| j| j| _t
|j| _d S )N   dim)super__init__hidden_sizenum_attention_headshidden_size_per_attention_headr   Linearquery_key_valueSoftmaxsoftmaxDropoutattention_probs_dropout_probattention_dropoutdensehidden_dropout_proboutput_dropoutselfconfig	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt_moe/backbone.pyr   %   s   

zGPTMoESelfAttention.__init__c                 C   s6   |  dd | j| jf }|j| }|ddddS )z_Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        Nr   r      r   r   )sizer   r   viewpermute)r   tensornew_tensor_shaper#   r#   r$   _transpose_for_scores8   s
   
z)GPTMoESelfAttention._transpose_for_scoresFc                 C   sF   |  d }| | | }tj|||d}|r!tdd |D S |S )Nr   r   c                 s   s    | ]}|  V  qd S N)
contiguous).0chunkr#   r#   r$   	<genexpr>L   s    zCGPTMoESelfAttention._split_tensor_along_last_dim.<locals>.<genexpr>)r   r&   torchsplittuple)r   r)   num_partitionscontiguous_split_chunkslast_dimlast_dim_sizetensor_listr#   r#   r$   _split_tensor_along_last_dimA   s   z0GPTMoESelfAttention._split_tensor_along_last_dimc                 C   sP  | d}t|dd||g}| |}| |d\}}}| |}	| |}
| |}| }t|	|
dd}|t	
| j }|r`|
 d}ttjd||f|jddd|||}dd|  }t||| |}| |}| |}t||}|d	ddd }|  d d | jf }|j| }| |}| |}|S )
Nr   r   r   r%   )deviceg     @      ?r   )r&   r1   reshaper   r9   r+   typematmul	transposemathsqrtr   trilonesr;   r'   mulr   r   r(   r-   r   r   r   )r   hidden_states	ltor_maskis_infertgt_lenmixed_x_layermixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerprevious_typeattention_scoressrc_lenconverted_maskattention_probscontext_layernew_context_layer_shapeoutputr#   r#   r$   forwardP   sX   













zGPTMoESelfAttention.forward)F)	__name__
__module____qualname____doc__r   r+   r9   rY   __classcell__r#   r#   r!   r$   r
      s    
r
   c                       (   e Zd ZdZ fddZdd Z  ZS )	GPTMoEMLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    c                    sN   t    |j}t|d| | _tj| _td| || _	t
|j| _d S )N   )r   r   r   r   r   dense_h_to_4hFgeluactivation_funcdense_4h_to_hr   r   dropout)r   r    r   r!   r#   r$   r      s   
zGPTMoEMLP.__init__c                 C   s,   |  |}| |}| |}| |}|S r,   )rb   re   rf   rg   )r   rF   intermediate_parallelrX   r#   r#   r$   rY      s
   



zGPTMoEMLP.forwardrZ   r[   r\   r]   r   rY   r^   r#   r#   r!   r$   r`      s    r`   c                       r_   )GPTMoETransformerLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    c                    sJ   t    tj|j|jd| _t|| _tj|j|jd| _	t
|| _d S )Neps)r   r   r   	LayerNormr   layernorm_epsiloninput_layernormr
   	attentionpost_attention_layernormr`   mlpr   r!   r#   r$   r      s   

zGPTMoETransformerLayer.__init__c                 C   s>   |  |}| ||}|| }| |}| |}|| }|S r,   )ro   rp   rq   rr   )r   rF   rG   layernorm_outputattention_outputlayernorm_input
mlp_outputrX   r#   r#   r$   rY      s   


zGPTMoETransformerLayer.forwardri   r#   r#   r!   r$   rj      s    rj   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )GPTMoETransformerzTransformer class.c                    sR   t    d | _ j| _tj fddt| jD | _	tj
 j jd| _d S )Nc                    s   g | ]}t  qS r#   )rj   )r.   _r    r#   r$   
<listcomp>   s    z.GPTMoETransformer.__init__.<locals>.<listcomp>rk   )r   r   input_tensornum_hidden_layers
num_layersr1   r   
ModuleListrangelayersrm   r   rn   final_layernormr   r!   ry   r$   r      s   
zGPTMoETransformer.__init__c                 C   s
   | j | S r,   )r   )r   layer_numberr#   r#   r$   
_get_layer   s   
zGPTMoETransformer._get_layerc                 C   s2   t | jD ]}| |}|||}q| |}|S r,   )r   r}   r   r   )r   rF   attention_maskindexlayerr#   r#   r$   rY      s
   

zGPTMoETransformer.forward)rZ   r[   r\   r]   r   r   rY   r^   r#   r#   r!   r$   rw      s
    rw   c                       r_   )GPTMoETransformerLanguageModela  Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    c                    sJ   t    t|j|j| _t|j|j| _t	|j
| _t|| _d S r,   )r   r   r   	Embedding
vocab_sizer   word_embeddingsmax_position_embeddingsposition_embeddingsr   r   embedding_dropoutrw   transformerr   r!   r#   r$   r     s   
z'GPTMoETransformerLanguageModel.__init__c           
      C   sF   |  |}| |}|| }| |}| ||}t|| j j}	|	S r,   )r   r   r   r   rc   linearweight)
r   	input_idsr   position_idswords_embeddingsr   
embeddingstransformer_inputtransformer_outputlogitsr#   r#   r$   rY     s   


z&GPTMoETransformerLanguageModel.forwardri   r#   r#   r!   r$   r      s    r   c                       s`   e Zd ZeZdd Z fddZ			dddZede	e
eejf  fd	d
Zdd Z  ZS )GPTMoEModelc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsg        )meanstdNr<   )
isinstancer   r   r   datanormal_r    initializer_rangebiaszero_r   padding_idxrm   fill_)r   moduler#   r#   r$   _init_weights!  s$   

zGPTMoEModel._init_weightsc                    s   t  | t|| _d S r,   )r   r   r   language_modelr   r!   r#   r$   r   3  s   zGPTMoEModel.__init__Nc           
      K   s   | d}ttjdd||ftj|jd}|d u r,tj|tj|jd}|d|}| 	|||}d }|d urKt
 }	|	|d| jj|d}tj||dS )Nr   )dtyper;   r   r   )lossr   )r&   r1   rC   rD   longr;   arange	unsqueeze	expand_asr   r   CrossEntropyLossr'   r    r   addictDict)
r   r   r   r   labelskwargs
seq_lengthr   r   loss_fctr#   r#   r$   rY   7  s(   

zGPTMoEModel.forwardpretrained_model_name_or_pathc                 C   s^   | j |}| |}tj|tj}t|}d|v r|d }dd |	 D }|
| |S )N
state_dictc                 S   s   i | ]\}}| d d|qS )zmodel.language_modelr   )replace)r.   kvr#   r#   r$   
<dictcomp>[  s    z/GPTMoEModel.from_pretrained.<locals>.<dictcomp>)config_classfrom_pretrainedospathjoinr   TORCH_MODEL_BIN_FILEr1   loaditemsload_state_dict)clsr   r    modelstate_dict_filer   r#   r#   r$   r   O  s   

zGPTMoEModel.from_pretrainedc                 O   s   d|iS )Nr   r#   )r   r   argsr   r#   r#   r$   prepare_inputs_for_generationb  s   z)GPTMoEModel.prepare_inputs_for_generation)NNN)rZ   r[   r\   r	   r   r   r   rY   classmethodr   r   strr   PathLiker   r   r^   r#   r#   r!   r$   r     s    
r   )rA   r   typingr   r   r   r1   r   torch.nnr   rc   transformers.modeling_utilsr   modelscope.utils.constantr   configurationr	   Moduler
   r`   rj   rw   r   r   r#   r#   r#   r$   <module>   s    k,"'