import math
import os
from typing import Optional, Union

import addict
import torch
from torch import nn
from torch.nn import functional as F
from transformers.modeling_utils import PreTrainedModel

from modelscope.outputs import TokenGeneratorOutput
from modelscope.utils.constant import ModelFile

from .configuration import GPT3Config
from .distributed_gpt3 import sample

Z  ZS )GPT3SelfAttentionzParallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    c                    s~   t    |j| _|j| _| j| j | _t| jd| j | _tjdd| _	t
|j| _t| j| j| _t
|j| _d S )N   dim)super__init__hidden_sizenum_attention_headshidden_size_per_attention_headr   Linearquery_key_valueSoftmaxsoftmaxDropoutattention_probs_dropout_probattention_dropoutdensehidden_dropout_proboutput_dropoutselfconfig	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt3/backbone.pyr   '   s   

zGPT3SelfAttention.__init__c                 C   s6   |  dd | j| jf }|j| }|ddddS )z_Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
        size [b, np, s, hn].
        Nr   r      r	   r   )sizer   r   viewpermute)r!   tensornew_tensor_shaper%   r%   r&   _transpose_for_scores:   s
   
z'GPT3SelfAttention._transpose_for_scoresFc                 C   sF   |  d }| | | }tj|||d}|r!tdd |D S |S )Nr	   r   c                 s   s    | ]}|  V  qd S N)
contiguous).0chunkr%   r%   r&   	<genexpr>N   s    zAGPT3SelfAttention._split_tensor_along_last_dim.<locals>.<genexpr>)r   r(   torchsplittuple)r!   r+   num_partitionscontiguous_split_chunkslast_dimlast_dim_sizetensor_listr%   r%   r&   _split_tensor_along_last_dimC   s   z.GPT3SelfAttention._split_tensor_along_last_dimc                 C   sP  | d}t|dd||g}| |}| |d\}}}| |}	| |}
| |}| }t|	|
dd}|t	
| j }|r`|
 d}ttjd||f|jddd|||}dd|  }t||| |}| |}| |}t||}|d	ddd }|  d d | jf }|j| }| |}| |}|S )
Nr	   r   r   r'   deviceg     @      ?r   )r(   r3   reshaper   r;   r-   typematmul	transposemathsqrtr   trilonesr>   r)   mulr   r   r*   r/   r   r   r   )r!   hidden_states	ltor_maskis_infertgt_lenmixed_x_layermixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerprevious_typeattention_scoressrc_lenconverted_maskattention_probscontext_layernew_context_layer_shapeoutputr%   r%   r&   forwardR   sX   













zGPT3SelfAttention.forward)F)	__name__
__module____qualname____doc__r   r-   r;   r\   __classcell__r%   r%   r#   r&   r       s    
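
# Shape walk-through of GPT3SelfAttention.forward, for reference
# (b = batch, s = sequence length, np = num_attention_heads,
# hn = hidden_size_per_attention_head, h = np * hn):
#   hidden_states [b, s, h] -> query_key_value output [b, s, 3h]
#   split + transpose: q, k, v each [b, np, s, hn]
#   scores = q @ k^T / sqrt(hn): [b, np, s, s], causally masked
#   context = softmax(scores) @ v: [b, np, s, hn], merged back to [b, s, h]
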
r   c                       (   e Zd ZdZ fddZdd Z  ZS )GPT3MLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    c                    sN   t    |j}t|d| | _tj| _td| || _	t
|j| _d S )N   )r   r   r   r   r   dense_h_to_4hFgeluactivation_funcdense_4h_to_hr   r   dropout)r!   r"   r   r#   r%   r&   r      s   
zGPT3MLP.__init__c                 C   s,   |  |}| |}| |}| |}|S r.   )re   rh   ri   rj   )r!   rI   intermediate_parallelr[   r%   r%   r&   r\      s
   



zGPT3MLP.forwardr]   r^   r_   r`   r   r\   ra   r%   r%   r#   r&   rc      s    rc   c                       rb   )GPT3TransformerLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    c                    sJ   t    tj|j|jd| _t|| _tj|j|jd| _	t
|| _d S )Neps)r   r   r   	LayerNormr   layernorm_epsiloninput_layernormr   	attentionpost_attention_layernormrc   mlpr    r#   r%   r&   r      s   

zGPT3TransformerLayer.__init__c                 C   s>   |  |}| ||}|| }| |}| |}|| }|S r.   )rr   rs   rt   ru   )r!   rI   rJ   layernorm_outputattention_outputlayernorm_input
mlp_outputr[   r%   r%   r&   r\      s   
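
# GPT3TransformerLayer wires its two sublayers in the pre-LayerNorm style:
#   x = x + Attention(LayerNorm(x))
#   x = x + MLP(LayerNorm(x))
# which is why GPT3Transformer below applies final_layernorm after the stack.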


zGPT3TransformerLayer.forwardrl   r%   r%   r#   r&   rm      s    rm   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )GPT3TransformerzTransformer class.c                    sR   t    d | _ j| _tj fddt| jD | _	tj
 j jd| _d S )Nc                    s   g | ]}t  qS r%   )rm   )r0   _r"   r%   r&   
<listcomp>   s    z,GPT3Transformer.__init__.<locals>.<listcomp>rn   )r   r   input_tensornum_hidden_layers
num_layersr3   r   
ModuleListrangelayersrp   r   rq   final_layernormr    r#   r|   r&   r      s   
zGPT3Transformer.__init__c                 C   s
   | j | S r.   )r   )r!   layer_numberr%   r%   r&   
_get_layer   s   
zGPT3Transformer._get_layerc                 C   s2   t | jD ]}| |}|||}q| |}|S r.   )r   r   r   r   )r!   rI   attention_maskindexlayerr%   r%   r&   r\      s
   

zGPT3Transformer.forward)r]   r^   r_   r`   r   r   r\   ra   r%   r%   r#   r&   rz      s
class GPT3TransformerLanguageModel(nn.Module):
    """Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size)
        self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.transformer = GPT3Transformer(config)

    def forward(self, input_ids, attention_mask, position_ids):
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = words_embeddings + position_embeddings
        transformer_input = self.embedding_dropout(embeddings)
        transformer_output = self.transformer(transformer_input,
                                              attention_mask)
        logits = F.linear(transformer_output, self.word_embeddings.weight)
        return logits
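
# GPT3TransformerLanguageModel ties input and output embeddings: logits are
# computed with F.linear against word_embeddings.weight, so no separate
# output projection is stored and logits have shape [b, s, vocab_size].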


class GPT3Model(PreTrainedModel):

    config_class = GPT3Config

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def __init__(self, config):
        super().__init__(config)
        self.language_model = GPT3TransformerLanguageModel(config)

    def forward(self,
                input_ids,
                attention_mask=None,
                position_ids=None,
                labels=None,
                **kwargs):
        seq_length = input_ids.size(1)
        # The passed-in attention_mask is ignored; a fresh lower-triangular
        # (causal) mask is built over the full sequence.
        attention_mask = torch.tril(
            torch.ones((1, 1, seq_length, seq_length),
                       dtype=torch.long,
                       device=input_ids.device))
        if position_ids is None:
            position_ids = torch.arange(
                seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        logits = self.language_model(input_ids, attention_mask, position_ids)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.config.vocab_size), labels.view(-1))
        return addict.Dict(loss=loss, logits=logits)

    @classmethod
    def from_pretrained(
            cls, pretrained_model_name_or_path: Optional[Union[str,
                                                               os.PathLike]]):
        config = cls.config_class.from_pretrained(
            pretrained_model_name_or_path)
        model = cls(config)
        state_dict_file = os.path.join(pretrained_model_name_or_path,
                                       ModelFile.TORCH_MODEL_BIN_FILE)
        state_dict = torch.load(state_dict_file)
        if 'state_dict' in state_dict:
            state_dict = state_dict['state_dict']
        state_dict = {
            k.replace('model.language_model', 'language_model'): v
            for k, v in state_dict.items()
        }
        model.load_state_dict(state_dict)
        return model

    def streaming_generate(self, tokens, temperature=1.0, **kwargs):
        top_k = kwargs.pop('top_k', self.config.top_k)
        top_p = kwargs.pop('top_p', self.config.top_p)
        max_length = kwargs.pop('max_length', tokens.size(1) + 100)
        batch_size = tokens.size(0)
        lengths = kwargs.pop(
            'prompt_length',
            torch.tensor([tokens.size(1)], device=tokens.device))

        min_prompt_length = lengths.min().item()
        max_sequence_length = min(max_length,
                                  self.config.max_position_embeddings)
        if min_prompt_length >= max_sequence_length:
            raise ValueError('context length too large')

        # Pad the token buffer up to the maximum generation length.
        pad_length = max_sequence_length - tokens.size(1)
        if pad_length > 0:
            pads = torch.zeros(
                batch_size, pad_length, device=tokens.device).long()
            tokens = torch.cat((tokens, pads), dim=-1)

        termination_id = self.config.eod_id
        # Per-sample flag marking sequences that have emitted an EOD token.
        is_generation_done = torch.zeros(
            batch_size, dtype=torch.uint8, device=tokens.device)

        with torch.no_grad():
            for context_length in range(min_prompt_length,
                                        max_sequence_length):
                tokens2use = tokens[:, :context_length]
                logits = self(tokens2use).logits
                last_token_logits = logits[:, -1, :]
                new_sample = sample(
                    last_token_logits,
                    top_k=top_k,
                    top_p=top_p,
                    temperature=temperature)
                # Only overwrite positions whose prompt has been consumed.
                started = lengths <= context_length
                tokens[started, context_length] = new_sample[started]
                yield TokenGeneratorOutput(
                    sequences=tokens[:, :context_length + 1])

                done_token = (new_sample
                              == termination_id).byte() & started.byte()
                is_generation_done = is_generation_done | done_token
                done = torch.all(is_generation_done)
                if done:
                    break

    def generate(self, tokens, temperature=1.0, **kwargs):
        # Drain the streaming generator and return the final output.
        last_output = None
        for output in self.streaming_generate(tokens, temperature, **kwargs):
            last_output = output
        return last_output
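
# A minimal smoke-test sketch, not part of the original module. It assumes
# GPT3Config (see .configuration) accepts the hyperparameters referenced
# above as keyword arguments and supplies defaults for the rest, as
# transformers-style configs typically do.
if __name__ == '__main__':
    config = GPT3Config(
        vocab_size=128,
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        max_position_embeddings=32)
    model = GPT3Model(config)
    input_ids = torch.randint(0, 128, (2, 16))
    output = model(input_ids, labels=input_ids)
    print(output.logits.shape)  # expected: torch.Size([2, 16, 128])
    print(output.loss)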