o
    eiW~                     @   s  d Z ddlZddlZddlm  mZ ddlmZ ddlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ eeZG dd dej Z!G dd dej Z"G dd dej Z#G dd dej Z$G dd dej Z%G dd dej Z&G dd dej Z'G dd dej Z(G dd  d ej Z)G d!d" d"ej Z*G d#d$ d$ej Z+eG d%d& d&eZ,eG d'd( d(e,Z-ed)d*G d+d, d,e,eZ.g d-Z/dS ).zPyTorch CPMAnt    N)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringlogging   )CpmAntConfigc                       s6   e Zd ZdZdef fddZdejfddZ  Z	S )CpmAntLayerNormz~
    We use Root Mean Square (RMS) Layer Normalization, please see https://huggingface.co/papers/1910.07467 for details."
    configc                    s2   t    |j| _|j| _tt|j| _	d S N)
super__init__epshidden_sizedim_normr   	Parametertorchemptyweightselfr   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/cpmant/modeling_cpmant.pyr   )   s   
zCpmAntLayerNorm.__init__hidden_statesc                 C   s^   | d| jkrtd|j}|tjdjddd}|t	|| j
  || j }|S )f
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        z'hidden_states.size(-1) != self.dim_norm   T)dimkeepdim)sizer   AssertionErrordtypetor   float32powmeanrsqrtr   r   )r   r#   	old_dtypevariancer!   r!   r"   forward0   s    zCpmAntLayerNorm.forward)
__name__
__module____qualname____doc__r   r   r   Tensorr3   __classcell__r!   r!   r   r"   r   $   s    r   c                       sr   e Zd Zddef fddZ				ddejdejdejd	ejd
edB de	dB dedB dejdB fddZ
  ZS )CpmAntAttentionNr   c                    s   t    |j| _|j| _|j| _|| _tj	| j| j| j dd| _
tj	| j| j| j dd| _tj	| j| j| j dd| _tj	| j| j | jdd| _tjjdd| _|jd urdtjj|jd| _d S d | _d S )NFbiasr%   r'   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_head	layer_idxr   Linear	project_q	project_k	project_vattention_outr   Softmaxsoftmax	dropout_pDropoutdropoutr   r   rC   r   r!   r"   r   >   s   


zCpmAntAttention.__init__Fhidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachecache_positionc	              	   C   s  | d}	| d}
| d}| |}| |}| |}||	|
| j| jdddd}||	|| j| jdddd}||	|| j| jdddd}|durd|||| j	d|i\}}| d}t
||ddt| j }|| }t
|||	d|
|t
d	kt
jtd
|j|jd}| |}t
|||	d|
|t
d	kt
jd|j|jd}|r|}nd}| jdur| |}t
||}||	| j|
| jdddd}| |	|
| j| j }| |}||fS )ad  
        Args:
            hidden_q (`torch.Tensor`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
                Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   r   r&   r   NrV   r%   Fz-inf)devicer+   )r)   rE   rF   rG   viewrA   rB   permuteupdaterC   r   matmul	transposemathsqrtmasked_filltensorscalar_tensorfloatrX   r+   rJ   rM   
contiguousrH   )r   rO   rP   rQ   rR   rS   rT   rU   rV   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightsr!   r!   r"   r3   R   sF   





   
 


 
zCpmAntAttention.forwardr   )FNNN)r4   r5   r6   r   r   r   r8   
BoolTensorboolr   r3   r9   r!   r!   r   r"   r:   =   s.    	r:   c                       r   e Zd Zddef fddZ					ddejdejdejdB d	edB d
edB dedB dejdB fddZ	  Z
S )CpmAntSelfAttentionBlockNr   c                    sF   t    t|| _t||d| _|jrtj	|j| _
d S d | _
d S NrC   )r   r   r   layernorm_before_attentionr:   self_attentionrK   r   r   rL   rM   rN   r   r!   r"   r      s   


z!CpmAntSelfAttentionBlock.__init__Fr#   rQ   rR   rS   rT   rU   rV   c           
   
   C   sJ   |  |}| ||||||||\}}	| jdur| |}|| }||	fS )a  
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        N)rs   rt   rM   )
r   r#   rQ   rR   rS   rT   rU   rV   outputsrl   r!   r!   r"   r3      s   


z CpmAntSelfAttentionBlock.forwardr   NFNNNr4   r5   r6   r   r   r   r8   rn   r   r3   r9   r!   r!   r   r"   rp      s,    rp   c                       2   e Zd Zdef fddZdejfddZ  ZS )CpmAntDenseGatedACTr   c                    sF   t    tj|j|jdd| _tj|j|jdd| _tj	 | _
d S NFr;   )r   r   r   rD   r   dim_ffw_0w_1r   GELUactr   r   r!   r"   r      s   
zCpmAntDenseGatedACT.__init__r#   c                 C   s&   |  | |}| |}|| }|S )zTransform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        )r   r|   r}   )r   r#   
gate_scorer!   r!   r"   r3      s   
zCpmAntDenseGatedACT.forward	r4   r5   r6   r   r   r   r8   r3   r9   r!   r!   r   r"   ry      s    ry   c                       rx   )CpmAntFeedForwardr   c                    sP   t    t|| _|jd urtj|j| _nd | _tj	|j
|jdd| _d S rz   )r   r   ry   w_inrK   r   r   rL   rM   rD   r{   r   w_outr   r   r!   r"   r      s   


zCpmAntFeedForward.__init__r#   c                 C   s,   |  |}| jdur| |}| |}|S )r$   N)r   rM   r   r   r#   r!   r!   r"   r3      s
   



zCpmAntFeedForward.forwardr   r!   r!   r   r"   r      s    
r   c                       rx   )CpmAntFFNBlockr   c                    sB   t    t|| _t|| _|jrtj	|j| _
d S d | _
d S r   )r   r   r   layernorm_before_ffnr   ffnrK   r   r   rL   rM   r   r   r!   r"   r     s   



zCpmAntFFNBlock.__init__r#   c                 C   s4   |  |}| |}| jdur| |}|| }|S )z
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before feed forward layer.
        N)r   r   rM   )r   r#   
ln_outputsru   r!   r!   r"   r3     s   
	


zCpmAntFFNBlock.forwardr   r!   r!   r   r"   r     s
    	r   c                       ro   )CpmAntTransformerBlockNr   c                    s&   t    t||d| _t|| _d S rq   )r   r   rp   self_attr   r   rN   r   r!   r"   r   #  s   
zCpmAntTransformerBlock.__init__Fr#   rQ   rR   rS   rT   rU   rV   c           	   	   C   s.   | j |||||||d\}}| |}||fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rQ   rR   rS   rT   rU   rV   )r   r   )	r   r#   rQ   rR   rS   rT   rU   rV   rl   r!   r!   r"   r3   (  s   


zCpmAntTransformerBlock.forwardr   rv   rw   r!   r!   r   r"   r   "  s,    	r   c                       st   e Zd Zdef fddZ					ddejdejdejdedB d	edB d
edB dedB dejdB fddZ	  Z
S )CpmAntEncoderr   c                    s@   t     j| _t fddt| jD | _t | _	d S )Nc                    s   g | ]}t  |d qS )rr   )r   ).0ir   r!   r"   
<listcomp>T  s    z*CpmAntEncoder.__init__.<locals>.<listcomp>)
r   r   num_hidden_layers
num_layersr   
ModuleListrangelayersr   output_layernormr   r   r   r"   r   Q  s   
 zCpmAntEncoder.__init__Nr#   rQ   rR   rS   output_hidden_statesrT   rU   rV   c	              	   C   s   |rdnd}	|r
dnd}
t | jD ] \}}|r|	|f7 }	|||||||d}|\}}|r1|
|f7 }
q| |}|r>|	|f7 }	||	|
fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            past_key_values (`Cache`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r!   N)rS   rT   rU   )	enumerater   r   )r   r#   rQ   rR   rS   r   rT   rU   rV   all_hidden_statesall_self_attnsr   layerlayer_outputsrl   r!   r!   r"   r3   X  s*   




zCpmAntEncoder.forward)NNNNNrw   r!   r!   r   r"   r   P  s0    	r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )CpmAntIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   rD   r   intermediate_sizedense
isinstance
hidden_actstrr   intermediate_act_fnr   r   r!   r"   r     s
   
zCpmAntIntermediate.__init__r#   returnc                 C   s   |  |}| |}|S r   )r   r   r   r!   r!   r"   r3     s   

zCpmAntIntermediate.forwardr4   r5   r6   r   r   r8   r3   r9   r!   r!   r   r"   r     s    r   c                       sV   e Zd Zdef fddZdejdejdejdejfdd	Zd
d ZdddZ	  Z
S )CpmAntSegmentPositionEmbeddingr   c                    sR   t    |j| _|j| _|j| _|j| _	t
t|j|j |j |j| _d S r   )r   r   r@   rA   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r   r   relative_attention_biasr   r   r!   r"   r     s   

z'CpmAntSegmentPositionEmbedding.__init__key_pos	query_poskey_segmentquery_segmentc              	   C   s  t   |d}|d}|d}|d|dkr/td|d d|d d||dks=||dkrKtd| d|d d||dkr`td| d|d d||d|}|||d}||d|}|||d}| ||}|| j }| jt j|t j	|j
d	d d d f t j|t j	|j
d	d d d f  | j| jd
}	t ||k|	d d d d d f |}W d    n1 sw   Y  t|| j}
|
dddd }
|
S )Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r%   r+   rX   )r   r   r   r&   )r   no_gradr)   r*   rY   !_segment_relative_position_bucketr   _position_bucketarangeint32rX   r   whereF	embeddingr   rZ   rd   )r   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedsr!   r!   r"   r3     sL   




(z&CpmAntSegmentPositionEmbedding.forwardc                 C   s   || j  | S r   )r   )r   r   r   r!   r!   r"   r     s   z@CpmAntSegmentPositionEmbedding._segment_relative_position_bucket       c                 C   s   d}|d }|dk tj| }t|}|d }||k }|t| | t||  ||   tj }t|t||d }|t	|| tj|7 }|S )Nr   r&   r   )
r,   r   r   abslogrc   r^   min	full_liker   )r   relative_positionr   r   relative_buckets	max_exactis_smallrelative_position_if_larger!   r!   r"   r     s(   
z/CpmAntSegmentPositionEmbedding._position_bucket)r   r   )r4   r5   r6   r   r   r   r8   r3   r   r   r9   r!   r!   r   r"   r     s    
4r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )CpmAntOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S )N)r   )r   r   r   rD   r   r   r   	LayerNormlayer_norm_epsrL   hidden_dropout_probrM   r   r   r!   r"   r     s   
zCpmAntOutput.__init__r#   input_tensorr   c                 C   s&   |  |}| |}| || }|S r   )r   rM   r   )r   r#   r   r!   r!   r"   r3     s   

zCpmAntOutput.forwardr   r!   r!   r   r"   r     s    $r   c                       s2   e Zd ZU eed< dZe  fddZ  Z	S )CpmAntPreTrainedModelr   cpmantc                    sN   t  | t|trt|j dS t|tr%tj|j	d| j
jd dS dS )zInitialize the weightsg        )r/   stdN)r   _init_weightsr   r   initones_r   r   normal_r   r   init_std)r   moduler   r!   r"   r     s   

z#CpmAntPreTrainedModel._init_weights)
r4   r5   r6   r   __annotations__base_model_prefixr   r   r   r9   r!   r!   r   r"   r   
  s
   
 r   c                       s   e Zd Zdef fddZdd Zdd Zdd	 Ze	
	
	
	
	
	
	
dde	j
d
B ded
B ded
B ded
B ded
B ded
B de	j
d
B dee	j
 eB fddZ  ZS )CpmAntModelr   c                    sl   t  | t|| _t|j|j| _t|j	|j
|j  |j| _t|| _|j| _|j	| _	|   d S r   )r   r   r   encoderr   	Embeddingr   r   segment_embedding
vocab_sizeprompt_typesprompt_lengthinput_embeddingr   rR   	post_initr   r   r!   r"   r     s   

zCpmAntModel.__init__c                 C   s   | j S r   r   r   r!   r!   r"   get_input_embeddings(  s   z CpmAntModel.get_input_embeddingsc                 K   s
   || _ d S r   r   )r   
embeddingskwargsr!   r!   r"   set_input_embeddings+  s   
z CpmAntModel.set_input_embeddingsc                 C   s>  | d}| d}|j}tj||dtj||dddk}|d d d d d f |d d d d d f  |d||@ B }	|	|d d d d d f |d d d d d f k@ }	tjtt|| j	 d d d |dd d d f 
|d|d d d f k }
tjtj|| j	|d |
fdd}
|
||d|
|d|@ |	@ }	|	S )Nr   r   )rX   r%   r=   )r)   rX   r   r   rY   logical_notra   listr   r   repeatcatonesrn   )r   	input_idsspancontextlengthr   seqlenrX   directional_mask_2drQ   mask_1dr!   r!   r"   _prepare_attention_mask.  s   

$&08$ z#CpmAntModel._prepare_attention_maskNr   rS   r   rT   rU   return_dictrV   r   c              
   K   sD  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|jtjkr4|tj}|j|j	}	}
t
|dkddj|	|
d}|dkdj|	|
d}tjtj| jd | j | jd | j |	|
d|dd|fdd}| \}}tjtj|| j|	|
d|fdd}tj||fd|	|
d}tj||	|
d|d}tj||fd|	|
d}|r|du rt| j d	}|dur| nd}| }| |}| |}|dkr|ddddddf }|| }| ||||}| ||||}|dd|dddf }|dddd|dddf }|dd|dddf }| ||||||||\}}}|dkr|dd| jdddf }|durjd
}|D ]}||dddd| jd| jdf f7 }qO|}|durd
}|D ]}||dd| jdddf f7 }qs|}|stdd ||||fD S t||||dS )ai  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nr   r&   r   r%   r   r   r=   r   r!   c                 s   s    | ]	}|d ur|V  qd S r   r!   )r   vr!   r!   r"   	<genexpr>  s    z&CpmAntModel.forward.<locals>.<genexpr>)last_hidden_staterT   r#   
attentions)r   rS   r   use_return_dictrU   r+   r   r   r,   rX   r   sumr   r   r   r   r   r)   zerosfullr   get_seq_lengthrd   r   r   r   rR   r   tupler
   )r   r   rS   r   rT   rU   r   rV   r   r+   rX   segmentr   r   
seq_lengthr   positionr   past_lengthr#   segment_statesrQ   rR   r   all_attentionsnew_attentions	attentionnew_hidden_stateshidden_stater!   r!   r"   r3   @  s   	"

 


.
&
zCpmAntModel.forward)NNNNNNN)r4   r5   r6   r   r   r   r   r   r   r   r8   rn   r   r  r
   r3   r9   r!   r!   r   r"   r     s<    
r   zy
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                       s   e Zd ZddiZdef fddZe										ddejdB d	e	dB d
e
dB de
dB de
dB dejdB de
dB dejdB dejdB deejB deeB fddZdd Zdd Z  ZS )CpmAntForCausalLMzlm_head.weightzcpmant.input_embedding.weightr   c                    sD   t  | t|| _tj|j|j|j|j	  dd| _
|   d S rz   )r   r   r   r   r   rD   r   r   r   r   lm_headr   r   r   r!   r"   r     s   
zCpmAntForCausalLM.__init__Nr   r   rT   rU   rS   r   labelsr   rQ   rV   logits_to_keepr   c              	   K   s   |dur|n| j j}| |||||||	}|r|jn|d }t|
tr)t|
 dn|
}| |dd|ddf }d}|durQt }||	d|
d|	d}|sg|f|dd  }|dure|f| S |S t|||j|j|jdS )u<  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss.

        Example:

        Text Generation with CpmAntForCausalLM.
        ```python
        >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

        >>> texts = "今天天气不错，"
        >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
        >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
        >>> input_ids = tokenizer(texts, return_tensors="pt")
        >>> outputs = model.generate(**input_ids)
        >>> output_texts = tokenizer.batch_decode(outputs)
        >>> print(output_texts)
        ['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
        ```
        Nr   r%   r   )losslogitsrT   r#   r   )r   r   r   r   r   intslicer  r   rY   r)   r   rT   r#   r   )r   r   rT   rU   rS   r   r  r   rQ   rV   r  r   model_outputr#   slice_indicesr  r  	loss_funcoutputr!   r!   r"   r3     s6   *	zCpmAntForCausalLM.forwardc                 C   s   | j jS r   r   r   r   r!   r!   r"   r   
  s   z&CpmAntForCausalLM.get_input_embeddingsc                 C   s   || j _d S r   r  )r   r   r!   r!   r"   r     s   z&CpmAntForCausalLM.set_input_embeddings)
NNNNNNNNNr   )r4   r5   r6   _tied_weights_keysr   r   r   r   r8   r   rn   r  r  r   r3   r   r   r9   r!   r!   r   r"   r    sN    
	
Jr  )r  r   r   )0r7   r^   r   torch.nn.functionalr   
functionalr   torch.nnr    r   r   activationsr   cache_utilsr   r   
generationr	   modeling_outputsr
   r   modeling_utilsr   utilsr   r   configuration_cpmantr   
get_loggerr4   loggerModuler   r:   rp   ry   r   r   r   r   r   r   r   r   r   r  __all__r!   r!   r!   r"   <module>   sH   
e7.@] _