o
    ߥiR                     @   s  d Z ddlmZmZmZmZ ddlZddlZddlZddl	Z	ddl
mZ ddlZddlZddlmZ ddlmZ eeZdd	 Zd
d Zdd ZeejjjeedZejjZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd  d ejZ%G d!d" d"ejZ&G d#d$ d$ejZ'G d%d& d&ejZ(G d'd( d(ejZ)G d)d* d*ejZ*G d+d, d,ejZ+G d-d. d.e+Z,dS )/zPyTorch BERT model.     )absolute_importdivisionprint_functionunicode_literalsN)open)nn   )
BertConfigc                 C   s    | d dt | td   S )ab   Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
          ?      ?g       @)torcherfmathsqrtx r   d/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/clip/modeling_bert.pygelu"   s    r   c                 C   s6   d|  dt tdtj | dt | d     S )z Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    r
   r      gHm?   )r   tanhr   r   pipowr   r   r   r   gelu_new+   s   "r   c                 C   s   | t |  S N)r   sigmoidr   r   r   r   swish3   s   r   )r   relur   r   c                       s*   e Zd ZdZ fddZdddZ  ZS )BertEmbeddingszLConstruct the embeddings from word, position and token_type embeddings.
    c                    sl   t t|   tj|j|jdd| _t|j|j| _	t|j
|j| _t|j|jd| _t|j| _d S )Nr   )padding_idxeps)superr   __init__r   	Embedding
vocab_sizehidden_sizeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsBertLayerNormlayer_norm_eps	LayerNormDropouthidden_dropout_probdropoutselfconfig	__class__r   r   r$   E   s   
zBertEmbeddings.__init__Nc           	      C   s   | d}|d u rtj|tj|jd}|d|}|d u r$t|}| |}| 	|}| 
|}|| | }| |}| |}|S )Nr   )dtypedevicer   )sizer   arangelongr9   	unsqueeze	expand_as
zeros_liker(   r*   r,   r/   r2   )	r4   	input_idstoken_type_idsposition_ids
seq_lengthwords_embeddingsr*   r,   
embeddingsr   r   r   forwardT   s   







zBertEmbeddings.forwardNN__name__
__module____qualname____doc__r$   rF   __classcell__r   r   r6   r   r   A   s    r   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	BertSelfAttentionc                    s   t t|   |j|j dkrtd|j|jf |j| _|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t
|j| _d S )Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d))r#   rN   r$   r'   num_attention_heads
ValueErroroutput_attentionsintattention_head_sizeall_head_sizer   Linearquerykeyvaluer0   attention_probs_dropout_probr2   r3   r6   r   r   r$   i   s"   
zBertSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   r   r   )r:   rO   rS   viewpermute)r4   r   new_x_shaper   r   r   transpose_for_scores}   s
   
z&BertSelfAttention.transpose_for_scoresNc                 C   s   |  |}| |}| |}| |}| |}| |}	t||dd}
|
t| j	 }
|d ur8|
| }
t
jdd|
}| |}|d urM|| }t||	}|dddd }| d d | jf }|j| }| jrw||f}|S |f}|S )NrZ   )dimr   r   r   r   )rV   rW   rX   r^   r   matmul	transposer   r   rS   r   Softmaxr2   r\   
contiguousr:   rT   r[   rQ   )r4   hidden_statesattention_mask	head_maskmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr   r   r   rF      sB   








zBertSelfAttention.forwardrG   )rI   rJ   rK   r$   r^   rF   rM   r   r   r6   r   rN   g   s    rN   c                       $   e Zd Z fddZdd Z  ZS )BertSelfOutputc                    sD   t t|   t|j|j| _t|j|jd| _	t
|j| _d S Nr!   )r#   rt   r$   r   rU   r'   denser-   r.   r/   r0   r1   r2   r3   r6   r   r   r$         zBertSelfOutput.__init__c                 C   &   |  |}| |}| || }|S r   rv   r2   r/   r4   re   input_tensorr   r   r   rF         

zBertSelfOutput.forwardrI   rJ   rK   r$   rF   rM   r   r   r6   r   rt          rt   c                       &   e Zd Z fddZdddZ  ZS )BertAttentionc                    s.   t t|   t|| _t|| _t | _d S r   )	r#   r   r$   rN   r4   rt   outputsetpruned_headsr3   r6   r   r   r$      s   

zBertAttention.__init__Nc                 C   s4   |  |||}| |d |}|f|dd   }|S Nr   r   )r4   r   )r4   r{   rf   rg   self_outputsattention_outputrr   r   r   r   rF      s   
zBertAttention.forwardrG   r}   r   r   r6   r   r          r   c                       rs   )BertIntermediatec                    sb   t t|   t|j|j| _t|j	t
s#tjd dkr+t|j	tr+t|j	 | _d S |j	| _d S )Nr   r   )r#   r   r$   r   rU   r'   intermediate_sizerv   
isinstance
hidden_actstrsysversion_infounicodeACT2FNintermediate_act_fnr3   r6   r   r   r$      s   
zBertIntermediate.__init__c                 C   s   |  |}| |}|S r   )rv   r   r4   re   r   r   r   rF      s   

zBertIntermediate.forwardr}   r   r   r6   r   r      s    
r   c                       rs   )
BertOutputc                    sD   t t|   t|j|j| _t|j|j	d| _
t|j| _d S ru   )r#   r   r$   r   rU   r   r'   rv   r-   r.   r/   r0   r1   r2   r3   r6   r   r   r$      rw   zBertOutput.__init__c                 C   rx   r   ry   rz   r   r   r   rF      r|   zBertOutput.forwardr}   r   r   r6   r   r      r~   r   c                       r   )	BertLayerc                    s0   t t|   t|| _t|| _t|| _d S r   )	r#   r   r$   r   	attentionr   intermediater   r   r3   r6   r   r   r$      s   

zBertLayer.__init__Nc           	      C   sB   |  |||}|d }| |}| ||}|f|dd   }|S r   )r   r   r   )	r4   re   rf   rg   attention_outputsr   intermediate_outputlayer_outputrr   r   r   r   rF      s   
zBertLayer.forwardrG   r}   r   r   r6   r   r      r   r   c                       r   )BertEncoderc                    sB   t t|    j| _ j| _t fddt jD | _	d S )Nc                    s   g | ]}t  qS r   )r   ).0_r5   r   r   
<listcomp>	  s    z(BertEncoder.__init__.<locals>.<listcomp>)
r#   r   r$   rQ   output_hidden_statesr   
ModuleListrangenum_hidden_layerslayerr3   r6   r   r   r$     s   
zBertEncoder.__init__Nc           
      C   s   d}d}t | jD ]"\}}| jr||f }||||| }|d }| jr+||d f }q	| jr4||f }|f}	| jr?|	|f }	| jrG|	|f }	|	S )Nr   r   r   )	enumerater   r   rQ   )
r4   re   rf   rg   all_hidden_statesall_attentionsilayer_modulelayer_outputsrr   r   r   r   rF     s(   



zBertEncoder.forwardrG   r}   r   r   r6   r   r     s    r   c                       rs   )
BertPoolerc                    s.   t t|   t|j|j| _t | _d S r   )	r#   r   r$   r   rU   r'   rv   Tanh
activationr3   r6   r   r   r$   '  s   zBertPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )rv   r   )r4   re   first_token_tensorpooled_outputr   r   r   rF   ,  s   

zBertPooler.forwardr}   r   r   r6   r   r   %      r   c                       rs   )BertPredictionHeadTransformc                    sr   t t|   t|j|j| _t|jt	s#t
jd dkr*t|jtr*t|j | _n|j| _t|j|jd| _d S )Nr   r   r!   )r#   r   r$   r   rU   r'   rv   r   r   r   r   r   r   r   transform_act_fnr-   r.   r/   r3   r6   r   r   r$   7  s   
z$BertPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r   )rv   r   r/   r   r   r   r   rF   C  s   


z#BertPredictionHeadTransform.forwardr}   r   r   r6   r   r   5  s    r   c                       rs   )BertLMPredictionHeadc                    sF   t t|   t|| _tj|j|jdd| _	t
t|j| _d S )NF)bias)r#   r   r$   r   	transformr   rU   r'   r&   decoder	Parameterr   zerosr   r3   r6   r   r   r$   L  s   

zBertLMPredictionHead.__init__c                 C   s   |  |}| || j }|S r   )r   r   r   r   r   r   r   rF   W  s   
zBertLMPredictionHead.forwardr}   r   r   r6   r   r   J  s    r   c                       rs   )BertOnlyMLMHeadc                    s   t t|   t|| _d S r   )r#   r   r$   r   predictionsr3   r6   r   r   r$   _  s   zBertOnlyMLMHead.__init__c                 C      |  |}|S r   )r   )r4   sequence_outputprediction_scoresr   r   r   rF   c     
zBertOnlyMLMHead.forwardr}   r   r   r6   r   r   ]      r   c                       rs   )BertOnlyNSPHeadc                    s"   t t|   t|jd| _d S Nr   )r#   r   r$   r   rU   r'   seq_relationshipr3   r6   r   r   r$   j  s   zBertOnlyNSPHead.__init__c                 C   r   r   )r   )r4   r   seq_relationship_scorer   r   r   rF   n  r   zBertOnlyNSPHead.forwardr}   r   r   r6   r   r   h  r   r   c                       rs   )BertPreTrainingHeadsc                    s,   t t|   t|| _t|jd| _d S r   )	r#   r   r$   r   r   r   rU   r'   r   r3   r6   r   r   r$   u  s   
zBertPreTrainingHeads.__init__c                 C   s   |  |}| |}||fS r   )r   r   )r4   r   r   r   r   r   r   r   rF   z  s   

zBertPreTrainingHeads.forwardr}   r   r   r6   r   r   s  r   r   c                       s,   e Zd ZeZdZ fddZdd Z  ZS )BertPreTrainedModelbertc                    s   t t|   || _d S r   )r#   r   r$   r5   r3   r6   r   r   r$     s   
zBertPreTrainedModel.__init__c                 C   s|   t |tjtjfr|jjjd| jjd nt |t	r'|j
j  |jjd t |tjr:|j
dur<|j
j  dS dS dS )z Initialize the weights g        )meanstdr   N)r   r   rU   r%   weightdatanormal_r5   initializer_ranger-   r   zero_fill_)r4   moduler   r   r   _init_weights  s   
z!BertPreTrainedModel._init_weights)	rI   rJ   rK   r	   config_classbase_model_prefixr$   r   rM   r   r   r6   r   r     s
    r   c                       s2   e Zd ZdZ fddZ				dddZ  ZS )	BertModela  
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer)
            of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax,
            used to compute the weighted average in the self-attention heads.

    Examples:
        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        >>> model = BertModel.from_pretrained('bert-base-uncased')
        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        >>> outputs = model(input_ids)
        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    c                    s>   t t| | t|| _t|| _t|| _| 	| j
 d S r   )r#   r   r$   r   rE   r   encoderr   poolerapplyr   r3   r6   r   r   r$     s
   


zBertModel.__init__Nc                 C   s0  |d u r	t |}|d u rt |}|dd}|jt|  jd}d| d }|d url| dkrO|dddd}|	| j
jdddd}n| dkr`|ddd}|jt|  jd}nd g| j
j }| j|||d}| j|||d	}|d }	| |	}
|	|
f|dd   }|S )
Nr   r   )r8   r   g     r   rZ   )rB   rA   )rg   )r   	ones_liker?   r=   tonext
parametersr8   r`   expandr5   r   rE   r   r   )r4   r@   rf   rA   rB   rg   extended_attention_maskembedding_outputencoder_outputsr   r   rr   r   r   r   rF     sZ   


zBertModel.forward)NNNNrH   r   r   r6   r   r     s    r   )-rL   
__future__r   r   r   r   loggingr   osr   ior   jsonr   r   configuration_bertr	   	getLoggerrI   loggerr   r   r   
functionalr   r   r/   r-   Moduler   rN   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sJ   
	&F#