o
    ߥi0                     @   s  d dl Z d dlmZ d dlZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d	d
lmZmZ d	dlmZ e ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZe
jejej dG dd deZ!dS )    N)CrossEntropyLoss)ACT2FN)Models)MODELS)AttentionFillMaskModelOutput)logger)Tasks   )MegatronBertModelMegatronBertPreTrainedModel)MegatronBertConfigc                       2   e Zd Z fddZdejdejfddZ  ZS )#MegatronBertPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S )N)eps)super__init__nnLinearhidden_sizedense
isinstance
hidden_actstrr   transform_act_fn	LayerNormlayer_norm_epsselfconfig	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/megatron_bert/fill_mask.pyr   &   s   
z,MegatronBertPredictionHeadTransform.__init__hidden_statesreturnc                 C   s"   |  |}| |}| |}|S N)r   r   r   r   r#   r!   r!   r"   forward0   s   


z+MegatronBertPredictionHeadTransform.forward__name__
__module____qualname__r   torchTensorr'   __classcell__r!   r!   r   r"   r   $   s    
r   c                       $   e Zd Z fddZdd Z  ZS )MegatronBertLMPredictionHeadc                    sL   t    t|| _tj|j|jdd| _t	t
|j| _| j| j_d S )NF)bias)r   r   r   	transformr   r   r   
vocab_sizedecoder	Parameterr,   zerosr1   r   r   r!   r"   r   :   s   


z%MegatronBertLMPredictionHead.__init__c                 C   s   |  |}| |}|S r%   )r2   r4   r&   r!   r!   r"   r'   H   s   

z$MegatronBertLMPredictionHead.forwardr)   r*   r+   r   r'   r.   r!   r!   r   r"   r0   8   s    r0   c                       r   )MegatronBertOnlyMLMHeadc                    s   t    t|| _d S r%   )r   r   r0   predictionsr   r   r!   r"   r   Q   s   
z MegatronBertOnlyMLMHead.__init__sequence_outputr$   c                 C      |  |}|S r%   )r9   )r   r:   prediction_scoresr!   r!   r"   r'   U      
zMegatronBertOnlyMLMHead.forwardr(   r!   r!   r   r"   r8   O   s    r8   c                       r/   )MegatronBertOnlyNSPHeadc                    s   t    t|jd| _d S N   )r   r   r   r   r   seq_relationshipr   r   r!   r"   r   ]   s   
z MegatronBertOnlyNSPHead.__init__c                 C   r;   r%   )rA   )r   pooled_outputseq_relationship_scorer!   r!   r"   r'   a   r=   zMegatronBertOnlyNSPHead.forwardr7   r!   r!   r   r"   r>   [   s    r>   c                       r/   )MegatronBertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S r?   )r   r   r0   r9   r   r   r   rA   r   r   r!   r"   r   i   s   

z%MegatronBertPreTrainingHeads.__init__c                 C   s   |  |}| |}||fS r%   )r9   rA   )r   r:   rB   r<   rC   r!   r!   r"   r'   n   s   

z$MegatronBertPreTrainingHeads.forwardr7   r!   r!   r   r"   rD   g   s    rD   )module_namec                       sp   e Zd ZddgZddgZdef fddZdd	 Zd
d Z												dddZ		dddZ
  ZS )MegatronBertForMaskedLMpoolerrA   position_idszpredictions.decoder.biasr   c                    s@   t  | |jrtd t|dd| _t|| _| 	  d S )NzsIf you want to use `MegatronBertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.F)add_pooling_layer)
r   r   
is_decoderr   warningr
   bertr8   cls	post_init)r   r   kwargsr   r!   r"   r   |   s   
z MegatronBertForMaskedLM.__init__c                 C   s
   | j jjS r%   rM   r9   r4   )r   r!   r!   r"   get_output_embeddings   s   
z-MegatronBertForMaskedLM.get_output_embeddingsc                 C   s   || j j_d S r%   rP   )r   new_embeddingsr!   r!   r"   set_output_embeddings   s   z-MegatronBertForMaskedLM.set_output_embeddingsNc                 C   s   |dur|n| j j}| j|||||||||
||d}|d }| |}d}|	dur:t }||d| j j|	d}|sP|f|dd  }|durN|f| S |S t|||j|j	|dS )a   
        Args:
            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See
                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
                for details.

                `What are input IDs? <../glossary.html#input-ids>`__
            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                `What are attention masks? <../glossary.html#attention-mask>`__
            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
                1]``:

                - 0 corresponds to a `sentence A` token,
                - 1 corresponds to a `sentence B` token.

                `What are token type IDs? <../glossary.html#token-type-ids>`_
            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                ``[0,config.max_position_embeddings - 1]``.

                `What are position IDs? <../glossary.html#position-ids>`_
            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`,
                `optional`):
                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
                `optional`):
                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
                into associated vectors than the model's internal embedding lookup matrix.
            output_attentions (:obj:`bool`, `optional`):
                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
                returned tensors for more detail.
            output_hidden_states (:obj:`bool`, `optional`):
                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
                for more detail.
            return_dict (:obj:`bool`, `optional`):
                Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
            *optional*):
                Labels for computing the masked language modeling loss. Indices
                should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
                docstring) Tokens with indices set to `-100` are ignored (masked),
                the loss is only computed for the tokens with labels in `[0, ...,
                config.vocab_size]`

        Returns:
            Returns `modelscope.outputs.AttentionFillMaskModelOutput`

        Examples:
            >>> from modelscope.models import Model
            >>> from modelscope.preprocessors import Preprocessor
            >>> model = Model.from_pretrained('damo/nlp_megatronbert_backbone_base_std')
            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_megatronbert_backbone_base_std')
            >>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
        N)
attention_masktoken_type_idsrH   	head_maskinputs_embedsencoder_hidden_statesencoder_attention_maskoutput_attentionsoutput_hidden_statesreturn_dictr   r@   )losslogitsr#   
attentions	input_ids)
r   use_return_dictrL   rM   r   viewr3   r   r#   r`   )r   ra   rT   rU   rH   rV   rW   rX   rY   labelsrZ   r[   r\   outputsr:   r<   masked_lm_lossloss_fctoutputr!   r!   r"   r'      sL   T
zMegatronBertForMaskedLM.forwardc                 K   s   |j }|d }| jjd u rtd||j d df}tj||gdd}tj|df| jjtj|j	d}tj||gdd}||dS )Nr   z.The PAD token should be defined for generationr	   r]   )dim)dtypedevice)ra   rT   )
shaper   pad_token_id
ValueError	new_zerosr,   catfulllongrk   )r   ra   rT   model_kwargsinput_shapeeffective_batch_sizeattention_mask_new_zerosdummy_tokenr!   r!   r"   prepare_inputs_for_generation  s"   


z5MegatronBertForMaskedLM.prepare_inputs_for_generation)NNNNNNNNNNNNr%   )r)   r*   r+   "_keys_to_ignore_on_load_unexpected_keys_to_ignore_on_load_missingr   r   rQ   rS   r'   rx   r.   r!   r!   r   r"   rF   t   s,    
}rF   )"r,   torch.nnr   torch.utils.checkpointr   transformers.activationsr   modelscope.metainfor   modelscope.models.builderr   modelscope.outputsr   modelscope.utilsr   loggingmodelscope.utils.constantr   backboner
   r   configurationr   
get_loggerModuler   r0   r8   r>   rD   register_module	fill_maskmegatron_bertrF   r!   r!   r!   r"   <module>   s(   