o
    wix                     @   s  d Z ddlZddlmZmZ ddlZddlm  mZ	 ddl
ZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ eeZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd dejZ&G dd dejZ'G d d! d!ejZ(G d"d# d#ejZ)eG d$d% d%eZ*eG d&d' d'e*Z+ed(d)G d*d+ d+e*eZ,g d,Z-dS )-zPyTorch CPMAnt    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)auto_docstringlogging   )CpmAntConfigc                       s6   e Zd ZdZdef fddZdejfddZ  Z	S )CpmAntLayerNormz~
    We use Root Mean Square (RMS) Layer Normalization, please see https://huggingface.co/papers/1910.07467 for details."
    configc                    s2   t    |j| _|j| _tt|j| _	d S N)
super__init__epshidden_sizedim_normr   	Parametertorchemptyweightselfr   	__class__ g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/cpmant/modeling_cpmant.pyr   *   s   
zCpmAntLayerNorm.__init__hidden_statesc                 C   s^   | d| jkrtd|j}|tjdjddd}|t	|| j
  || j }|S )f
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        z'hidden_states.size(-1) != self.dim_norm   T)dimkeepdim)sizer   AssertionErrordtypetor   float32powmeanrsqrtr   r   )r   r"   	old_dtypevariancer    r    r!   forward1   s    zCpmAntLayerNorm.forward)
__name__
__module____qualname____doc__r   r   r   Tensorr2   __classcell__r    r    r   r!   r   %   s    r   c                       sp   e Zd Zdef fddZ			ddejdejdejd	ejd
ee	 dee
ejejf  dee	 fddZ  ZS )CpmAntAttentionr   c                    s   t    |j| _|j| _|j| _tj| j| j| j dd| _	tj| j| j| j dd| _
tj| j| j| j dd| _tj| j| j | jdd| _tjjdd| _|jd uratjj|jd| _d S d | _d S )NFbiasr$   r&   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_headr   Linear	project_q	project_k	project_vattention_outr   Softmaxsoftmax	dropout_pDropoutdropoutr   r   r    r!   r   ?   s   


zCpmAntAttention.__init__FNhidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachec              	   C   s  | d}| d}	| d}
| |}| |}| |}|||	| j| jdddd}|||
| j| jdddd}|||
| j| jdddd}|durmtj	|d |gdd}tj	|d |gdd}| d}
t
||ddt| j }|| }t|||d|	|
td	ktjtd
|j|jd}| |}t|||d|	|
td	ktjd|j|jd}|r|}nd}| jdur| |}t
||}||| j|	| jdddd}| ||	| j| j }| |}d}|r||f}|||fS )a  
        Args:
            hidden_q (`torch.Tensor`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
                Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`tuple[torch.Tensor, torch.Tensor]`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   r   r%   r   Nr<   r$   Fz-inf)devicer*   )r(   rC   rD   rE   viewr@   rA   permuter   catmatmul	transposemathsqrtmasked_filltensorscalar_tensorfloatrT   r*   rH   rK   
contiguousrF   )r   rL   rM   rN   rO   rP   rQ   rR   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightsr    r    r!   r2   R   sN   





   
 


 

zCpmAntAttention.forward)FNN)r3   r4   r5   r   r   r   r7   
BoolTensorr   booltupler2   r8   r    r    r   r!   r9   >   s(    r9   c                       p   e Zd Zdef fddZ				ddejdejdeej d	ee d
ee	ejejf  dee fddZ
  ZS )CpmAntSelfAttentionBlockr   c                    B   t    t|| _t|| _|jrtj	|j| _
d S d | _
d S r   )r   r   r   layernorm_before_attentionr9   self_attentionrI   r   r   rJ   rK   r   r   r    r!   r         



z!CpmAntSelfAttentionBlock.__init__NFr"   rN   rO   rP   rQ   rR   c           
   	   C   sP   |  |}| |||||||}|\}}}	| jdur| |}|| }|||	fS )a  
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        N)ro   rp   rK   )
r   r"   rN   rO   rP   rQ   rR   outputsrh   current_key_valuer    r    r!   r2      s   




z CpmAntSelfAttentionBlock.forwardNFNNr3   r4   r5   r   r   r   r7   r   rj   rk   r2   r8   r    r    r   r!   rm      s&    rm   c                       2   e Zd Zdef fddZdejfddZ  ZS )CpmAntDenseGatedACTr   c                    sF   t    tj|j|jdd| _tj|j|jdd| _tj	 | _
d S NFr:   )r   r   r   rB   r   dim_ffw_0w_1r   GELUactr   r   r    r!   r      s   
zCpmAntDenseGatedACT.__init__r"   c                 C   s&   |  | |}| |}|| }|S )zTransform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        )r}   rz   r{   )r   r"   
gate_scorer    r    r!   r2      s   
zCpmAntDenseGatedACT.forward	r3   r4   r5   r   r   r   r7   r2   r8   r    r    r   r!   rw      s    rw   c                       rv   )CpmAntFeedForwardr   c                    sP   t    t|| _|jd urtj|j| _nd | _tj	|j
|jdd| _d S rx   )r   r   rw   w_inrI   r   r   rJ   rK   rB   ry   r   w_outr   r   r    r!   r      s   


zCpmAntFeedForward.__init__r"   c                 C   s,   |  |}| jdur| |}| |}|S )r#   N)r   rK   r   r   r"   r    r    r!   r2      s
   



zCpmAntFeedForward.forwardr   r    r    r   r!   r      s    
r   c                       rv   )CpmAntFFNBlockr   c                    rn   r   )r   r   r   layernorm_before_ffnr   ffnrI   r   r   rJ   rK   r   r   r    r!   r     rq   zCpmAntFFNBlock.__init__r"   c                 C   s4   |  |}| |}| jdur| |}|| }|S )z
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before feed forward layer.
        N)r   r   rK   )r   r"   
ln_outputsrr   r    r    r!   r2     s   
	


zCpmAntFFNBlock.forwardr   r    r    r   r!   r     s
    	r   c                       rl   )CpmAntTransformerBlockr   c                    s"   t    t|| _t|| _d S r   )r   r   rm   self_attr   r   r   r   r    r!   r   !  s   

zCpmAntTransformerBlock.__init__NFr"   rN   rO   rP   rQ   rR   c           	      C   s4   | j ||||||d}|\}}}| |}|||fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`tuple[torch.Tensor, torch.Tensor])`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rN   rO   rP   rQ   rR   )r   r   )	r   r"   rN   rO   rP   rQ   rR   rh   rs   r    r    r!   r2   &  s   
	

zCpmAntTransformerBlock.forwardrt   ru   r    r    r   r!   r      s&    	r   c                       st   e Zd Zdef fddZ				ddejdejdejdee d	ee d
ee	ejejf  dee fddZ
  ZS )CpmAntEncoderr   c                    s@   t     j| _t fddt| jD | _t | _	d S )Nc                    s   g | ]}t  qS r    )r   ).0ithr   r    r!   
<listcomp>S  s    z*CpmAntEncoder.__init__.<locals>.<listcomp>)
r   r   num_hidden_layers
num_layersr   
ModuleListrangelayersr   output_layernormr   r   r   r!   r   P  s   
 zCpmAntEncoder.__init__Nr"   rN   rO   rP   output_hidden_statesrQ   rR   c              	   C   s   |rdnd}|r
dnd}	|rdnd}
t | jD ]0\}}|r"||f7 }||||||r-|| nd|d}|\}}}|r>|	|f7 }	|durG|
|f }
q| |}|rT||f7 }||
||	fS )a%  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            past_key_values (`tuple[torch.Tensor, torch.Tensor])`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r    N)rP   rQ   rR   )	enumerater   r   )r   r"   rN   rO   rP   r   rQ   rR   all_hidden_statesall_self_attnscurrent_key_valuesilayerlayer_outputsrh   rs   r    r    r!   r2   W  s0   





zCpmAntEncoder.forward)NNNNru   r    r    r   r!   r   O  s*    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )CpmAntIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   rB   r   intermediate_sizedense
isinstance
hidden_actstrr   intermediate_act_fnr   r   r    r!   r     s
   
zCpmAntIntermediate.__init__r"   returnc                 C   s   |  |}| |}|S r   )r   r   r   r    r    r!   r2     s   

zCpmAntIntermediate.forwardr3   r4   r5   r   r   r7   r2   r8   r    r    r   r!   r     s    r   c                       sV   e Zd Zdef fddZdejdejdejdejfdd	Zd
d ZdddZ	  Z
S )CpmAntSegmentPositionEmbeddingr   c                    sR   t    |j| _|j| _|j| _|j| _	t
t|j|j |j |j| _d S r   )r   r   r?   r@   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r   r   relative_attention_biasr   r   r    r!   r     s   

z'CpmAntSegmentPositionEmbedding.__init__key_pos	query_poskey_segmentquery_segmentc              	   C   s  t   |d}|d}|d}|d|dkr/td|d d|d d||dks=||dkrKtd| d|d d||dkr`td| d|d d||d|}|||d}||d|}|||d}| ||}|| j }| jt j|t j	|j
d	d d d f t j|t j	|j
d	d d d f  | j| jd
}	t ||k|	d d d d d f |}W d    n1 sw   Y  t|| j}
|
dddd }
|
S )Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r$   r*   rT   )r   r   r   r%   )r   no_gradr(   r)   rU   !_segment_relative_position_bucketr   _position_bucketarangeint32rT   r   whereF	embeddingr   rV   r`   )r   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedsr    r    r!   r2     sL   




(z&CpmAntSegmentPositionEmbedding.forwardc                 C   s   || j  | S r   )r   )r   r   r   r    r    r!   r     s   z@CpmAntSegmentPositionEmbedding._segment_relative_position_bucket       c                 C   s   d}|d }|dk tj| }t|}|d }||k }|t| | t||  ||   tj }t|t||d }|t	|| tj|7 }|S )Nr   r%   r   )
r+   r   r   abslogr_   rZ   min	full_liker   )r   relative_positionr   r   relative_buckets	max_exactis_smallrelative_postion_if_larger    r    r!   r     s(   
z/CpmAntSegmentPositionEmbedding._position_bucket)r   r   )r3   r4   r5   r   r   r   r7   r2   r   r   r8   r    r    r   r!   r     s    
4r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )CpmAntOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S )N)r   )r   r   r   rB   r   r   r   	LayerNormlayer_norm_epsrJ   hidden_dropout_probrK   r   r   r    r!   r     s   
zCpmAntOutput.__init__r"   input_tensorr   c                 C   s&   |  |}| |}| || }|S r   )r   rK   r   )r   r"   r   r    r    r!   r2     s   

zCpmAntOutput.forwardr   r    r    r   r!   r     s    $r   c                   @   s   e Zd ZeZdZdd ZdS )CpmAntPreTrainedModelcpmantc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |trf|jjd dS t |trx|jjjd| jjd dS dS )zInitialize the weightsg        )r.   stdNg      ?)r   r   rB   r   datanormal_r   init_stdr;   zero_	Embeddingpadding_idxr   fill_r   r   r   )r   moduler    r    r!   _init_weights  s$   



z#CpmAntPreTrainedModel._init_weightsN)r3   r4   r5   r   config_classbase_model_prefixr   r    r    r    r!   r     s    r   c                       s   e Zd Zdef fddZdd Zdd Zdd	 Ze	
	
	
	
	
	
dde	e
j de	e de	e de	eee
j   de	e de	e deee
j ef fddZ  ZS )CpmAntModelr   c                    sl   t  | t|| _t|j|j| _t|j	|j
|j  |j| _t|| _|j| _|j	| _	|   d S r   )r   r   r   encoderr   r   r   r   segment_embedding
vocab_sizeprompt_typesprompt_lengthinput_embeddingr   rO   	post_initr   r   r    r!   r   %  s   

zCpmAntModel.__init__c                 C      | j S r   r   r   r    r    r!   get_input_embeddings2     z CpmAntModel.get_input_embeddingsc                 K   
   || _ d S r   r   )r   
embeddingskwargsr    r    r!   set_input_embeddings5     
z CpmAntModel.set_input_embeddingsc                 C   s>  | d}| d}|j}tj||dtj||dddk}|d d d d d f |d d d d d f  |d||@ B }	|	|d d d d d f |d d d d d f k@ }	tjtt|| j	 d d d |dd d d f 
|d|d d d f k }
tjtj|| j	|d |
fdd}
|
||d|
|d|@ |	@ }	|	S )Nr   r   )rT   r$   r<   )r(   rT   r   r   rU   logical_notr]   listr   r   repeatrW   onesrj   )r   	input_idsspancontextlengthr   seqlenrT   directional_mask_2drN   mask_1dr    r    r!   _prepare_attention_mask8  s   

$&08$ z#CpmAntModel._prepare_attention_maskNr   rP   r   rQ   rR   return_dictr   c              	   K   sV  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|jtjkr4|tj}|j|j	}}	t
|dkddj||	d}
|
dkdj||	d}tjtj| jd | j | jd | j ||	d|dd|fdd}| \}}tjtj|| j||	d|
fdd}
tj||fd||	d}tj|||	d|d}tj||fd||	d}|du rd}tdg| jj }| }| |}| |
}|| }n |d d d	}| |
}| ||ddddddf  }| ||||}| |||
|
}|dd|dddf }|dddd|dddf }|dd|dddf }| |||||||\}}}}|dkr|dd| jdddf }|dursd
}|D ]}||dddd| jd| jdf f7 }qX|}|durd
}|D ]}||dd| jdddf f7 }q||}|stdd ||||fD S t||||dS )ai  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nr   r%   r   r$   r   r   r<   rS   r    c                 s   s    | ]	}|d ur|V  qd S r   r    )r   vr    r    r!   	<genexpr>  s    z&CpmAntModel.forward.<locals>.<genexpr>)last_hidden_staterQ   r"   
attentions)r   rP   r   use_return_dictrR   r*   r   r   r+   rT   r   sumrW   r   r   r   r   r(   zerosfullrk   r   r   r`   r   r   r   rO   r	   )r   r   rP   r   rQ   rR   r   r   r*   rT   segmentr   r   
seq_lengthr   positionr   past_lengthr"   segment_statesrN   rO   present_key_valuesr   all_attentionsnew_attentions	attentionnew_hidden_stateshidden_stater    r    r!   r2   J  s   	"



$ 


.
&
zCpmAntModel.forward)NNNNNN)r3   r4   r5   r   r   r   r   r   r   r   r   r7   rj   rk   r   r	   r2   r8   r    r    r   r!   r   #  s6    	r   zy
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                       s   e Zd ZdgZdef fddZe								ddeej	 dee
eej	ej	f   dee d	ee d
ee deej	 dee deej	 deeef fddZdd Zdd Zdd Zdd Zdd Z  ZS )CpmAntForCausalLMzlm_head.weightr   c                    sD   t  | t|| _tj|j|j|j|j	  dd| _
|   d S rx   )r   r   r   r   r   rB   r   r   r   r   lm_headr   r   r   r    r!   r     s   
zCpmAntForCausalLM.__init__Nr   rQ   rR   rP   r   labelsr   rN   r   c	                 K   s   |dur|n| j j}| ||||||}
|r|
jn|
d }| |}d}|dur:t }||d|d|d}|sP|f|
dd  }|durN|f| S |S t|||
j	|
j
|
jdS )u<  
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss.

        Example:

        Text Generation with CpmAntForCausalLM.
        ```python
        >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

        >>> texts = "今天天气不错，"
        >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
        >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
        >>> input_ids = tokenizer(texts, return_tensors="pt")
        >>> outputs = model.generate(**input_ids)
        >>> output_texts = tokenizer.batch_decode(outputs)
        >>> print(output_texts)
        ['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
        ```
        Nr   r$   r   )losslogitsrQ   r"   r  )r   r  r   r  r  r   rU   r(   r
   rQ   r"   r  )r   r   rQ   rR   rP   r   r  r   rN   r   model_outputr"   r  r  	loss_funcoutputr    r    r!   r2     s(   (
zCpmAntForCausalLM.forwardc                 C   s   | j jS r   r   r   r   r    r    r!   r   	  s   z&CpmAntForCausalLM.get_input_embeddingsc                 C   s   || j _d S r   r  )r   r   r    r    r!   r     s   z&CpmAntForCausalLM.set_input_embeddingsc                 C   r   r   r  r   r    r    r!   get_output_embeddings  r   z'CpmAntForCausalLM.get_output_embeddingsc                 C   r   r   r  )r   new_embeddingsr    r    r!   set_output_embeddings  r   z'CpmAntForCausalLM.set_output_embeddingsc                 C   s<   dd |D }|D ]}|d | |d< |d | |d< q	|S )Nc                 S   s    g | ]}|d urt |n|qS r   )r   )r   eachr    r    r!   r     s     z4CpmAntForCausalLM._reorder_cache.<locals>.<listcomp>r   r   r    )r   rQ   beam_idxkey_value_layerr    r    r!   _reorder_cache  s
   z CpmAntForCausalLM._reorder_cache)NNNNNNNN)r3   r4   r5   _tied_weights_keysr   r   r   r   r   r7   r   rk   rj   r   r
   r2   r   r   r  r  r#  r8   r    r    r   r!   r    sH    
	
Ar  )r  r   r   ).r6   rZ   typingr   r   r   torch.nn.functionalr   
functionalr   torch.utils.checkpointtorch.nnr   activationsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   utilsr   r   configuration_cpmantr   
get_loggerr3   loggerModuler   r9   rm   rw   r   r   r   r   r   r   r   r   r   r  __all__r    r    r    r!   <module>   sH   
h1/B] c