o
    eiv                    @   s  d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ee Z!eeddG dd deZ"eeddG dd deZ#eeddG dd deZ$eeddG dd deZ%eeddG dd deZ&eeddG d d! d!eZ'eed"dG d#d$ d$eZ(eed%dG d&d' d'eZ)eed(dG d)d* d*eZ*eed+dG d,d- d-eZ+G d.d/ d/ej,Z-G d0d1 d1ej,Z.G d2d3 d3ej,Z/G d4d5 d5ej,Z0G d6d7 d7ej,Z1G d8d9 d9ej,Z2G d:d; d;ej,Z3G d<d= d=eZ4G d>d? d?ej,Z5G d@dA dAej,Z6G dBdC dCej,Z7G dDdE dEej,Z8eG dFdG dGeZ9edHdG dIdJ dJe9Z:dKdL Z;G dMdN dNej,Z<edOdG dPdQ dQe9Z=edRdG dSdT dTe9Z>edUdG dVdW dWe9Z?edXdG dYdZ dZe9Z@ed[dG d\d] d]e9ZAed^dG d_d` d`e9ZBeG dadb dbe9ZCeG dcdd dde9ZDg deZEdS )fzPyTorch LUKE model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)apply_chunking_to_forward)ModelOutputauto_docstringlogging   )
LukeConfigz3
    Base class for outputs of the LUKE model.
    )custom_introc                   @   >   e Zd ZU dZdZejdB ed< dZe	ejdf dB ed< dS )BaseLukeModelOutputWithPoolingax  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) further processed by a
        Linear layer and a Tanh activation function.
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nentity_last_hidden_state.entity_hidden_states
__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tuple r#   r#   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/luke/modeling_luke.pyr   $   s   
 r   zV
    Base class for model's outputs, with potential hidden states and attentions.
    c                   @   r   )BaseLukeModelOutputa  
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr   .r   r   r#   r#   r#   r$   r%   ;   s   
 	r%   c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZeej dB ed< dZeejd	f dB ed
< dZeejd	f dB ed< dS )LukeMaskedLMOutputa:  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        The sum of masked language modeling (MLM) loss and entity prediction loss.
    mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked language modeling (MLM) loss.
    mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked entity prediction (MEP) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nlossmlm_lossmep_losslogitsentity_logitshidden_states.r   
attentions)r   r   r   r   r'   r   r    r!   r(   r)   r*   r+   r,   r"   r   r-   r#   r#   r#   r$   r&   O   s   
 r&   z2
    Outputs of entity classification models.
    c                   @      e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dZe
ejdf dB ed< dS )	EntityClassificationOutput  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr'   r*   .r,   r   r-   r   r   r   r   r'   r   r    r!   r*   r,   r"   r   r-   r#   r#   r#   r$   r/   q      
 r/   z7
    Outputs of entity pair classification models.
    c                   @   r.   )	EntityPairClassificationOutputr0   Nr'   r*   .r,   r   r-   r1   r#   r#   r#   r$   r3      r2   r3   z7
    Outputs of entity span classification models.
    c                   @   r.   )	EntitySpanClassificationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr'   r*   .r,   r   r-   r1   r#   r#   r#   r$   r4      r2   r4   z4
    Outputs of sentence classification models.
    c                   @   r.   )	LukeSequenceClassifierOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr'   r*   .r,   r   r-   r1   r#   r#   r#   r$   r5      r2   r5   z@
    Base class for outputs of token classification models.
    c                   @   r.   )	LukeTokenClassifierOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr'   r*   .r,   r   r-   r1   r#   r#   r#   r$   r6      r2   r6   z/
    Outputs of question answering models.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eejdf dB ed< dZeejdf dB ed< dZeejdf dB ed	< dS )
 LukeQuestionAnsweringModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr'   start_logits
end_logits.r,   r   r-   )r   r   r   r   r'   r   r    r!   r8   r9   r,   r"   r   r-   r#   r#   r#   r$   r7      s   
 	r7   z,
    Outputs of multiple choice models.
    c                   @   r.   )	LukeMultipleChoiceModelOutputa  
    loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr'   r*   .r,   r   r-   r1   r#   r#   r#   r$   r:     s   
 r:   c                       s:   e Zd ZdZ fddZ				d	ddZdd Z  ZS )
LukeEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _|j| _tj|j|j| jd| _	d S )Npadding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr=   selfconfig	__class__r#   r$   rA   &  s   
zLukeEmbeddings.__init__Nc           	      C   s   |d u r|d urt || j|j}n| |}|d ur!| }n| d d }|d u r8tj|tj| j	jd}|d u rA| 
|}| |}| |}|| | }| |}| |}|S )Ndtypedevice)"create_position_ids_from_input_idsr=   torX   &create_position_ids_from_inputs_embedssizer   zeroslongposition_idsrF   rH   rJ   rK   rO   )	rQ   	input_idstoken_type_idsr_   inputs_embedsinput_shaperH   rJ   
embeddingsr#   r#   r$   forward5  s"   






zLukeEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrU   r   rV   r   )r\   r   aranger=   r^   rX   	unsqueezeexpand)rQ   rb   rc   sequence_lengthr_   r#   r#   r$   r[   V  s   	z5LukeEmbeddings.create_position_ids_from_inputs_embeds)NNNN)r   r   r   r   rA   re   r[   __classcell__r#   r#   rS   r$   r;   !  s    
!r;   c                       sF   e Zd Zdef fddZ	d
dejdejdejdB fdd	Z  ZS )LukeEntityEmbeddingsrR   c                    s   t    || _tj|j|jdd| _|j|jkr$tj	|j|jdd| _
t|j|j| _t|j|j| _tj|j|jd| _t|j| _d S )Nr   r<   Fbiasr>   )r@   rA   rR   r   rB   entity_vocab_sizeentity_emb_sizeentity_embeddingsrD   Linearentity_embedding_denserG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rS   r#   r$   rA   i  s   
zLukeEntityEmbeddings.__init__N
entity_idsr_   ra   c           	      C   s   |d u r	t |}| |}| jj| jjkr| |}| |jdd}|dk	|
d}|| }t j|dd}||jddjdd }| |}|| | }| |}| |}|S )Nr   )minrU   dimgHz>)r   
zeros_likerp   rR   ro   rD   rr   rH   clamptype_asrg   sumrJ   rK   rO   )	rQ   rs   r_   ra   rp   rH   position_embedding_maskrJ   rd   r#   r#   r$   re   w  s   





zLukeEntityEmbeddings.forwardN)	r   r   r   r   rA   r   
LongTensorre   rj   r#   r#   rS   r$   rk   h  s    rk   c                       s2   e Zd Z fddZdd Z		d	ddZ  ZS )
LukeSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _|j	| _	t
|j| j| _t
|j| j| _t
|j| j| _| j	rpt
|j| j| _t
|j| j| _t
|j| j| _t
|j| _d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)r@   rA   rD   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizeuse_entity_aware_attentionr   rq   querykeyvalue	w2e_query	e2w_query	e2e_queryrM   attention_probs_dropout_probrO   rP   rS   r#   r$   rA     s&   

zLukeSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )NrU   r      r   r   )r\   r   r   viewpermute)rQ   xnew_x_shaper#   r#   r$   transpose_for_scores  s   
z&LukeSelfAttention.transpose_for_scoresNFc                 C   s  | d}|d u r|}n	tj||gdd}| | |}| | |}| jr|d ur| | |}	| | |}
| | 	|}| | 
|}|d d d d d |d d f }|d d d d d |d d f }|d d d d |d d d f }|d d d d |d d d f }t|	|dd}t|
|dd}t||dd}t||dd}tj||gdd}tj||gdd}tj||gdd}n| | |}t||dd}|t| j }|d ur|| }tjj|dd}| |}t||}|dddd }|  d d | jf }|j| }|d d d |d d f }|d u r5d }n|d d |d d d f }|rL|||f}|S ||f}|S )Nr   rv   rU   ru   r   r   r   )r\   r   catr   r   r   r   r   r   r   r   matmul	transposemathsqrtr   r   
functionalsoftmaxrO   r   
contiguousr   r   )rQ   word_hidden_statesr   attention_maskoutput_attentions	word_sizeconcat_hidden_states	key_layervalue_layerw2w_query_layerw2e_query_layere2w_query_layere2e_query_layerw2w_key_layere2w_key_layerw2e_key_layere2e_key_layerw2w_attention_scoresw2e_attention_scorese2w_attention_scorese2e_attention_scoresword_attention_scoresentity_attention_scoresattention_scoresquery_layerattention_probscontext_layernew_context_layer_shapeoutput_word_hidden_statesoutput_entity_hidden_statesoutputsr#   r#   r$   re     sT   
    



zLukeSelfAttention.forwardNF)r   r   r   rA   r   re   rj   r#   r#   rS   r$   r     s    	r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )LukeSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr>   )r@   rA   r   rq   rD   denserK   rL   rM   rN   rO   rP   rS   r#   r$   rA        
zLukeSelfOutput.__init__r,   input_tensorreturnc                 C   &   |  |}| |}| || }|S r}   r   rO   rK   rQ   r,   r   r#   r#   r$   re        

zLukeSelfOutput.forwardr   r   r   rA   r   Tensorre   rj   r#   r#   rS   r$   r          $r   c                       s*   e Zd Z fddZ		dddZ  ZS )LukeAttentionc                    s"   t    t|| _t|| _d S r}   )r@   rA   r   rQ   r   outputrP   rS   r#   r$   rA     s   

zLukeAttention.__init__NFc                 C   s   | d}| ||||}|d u r|d }|}ntj|d d dd}tj||gdd}| ||}	|	d d d |d d f }
|d u rFd }n|	d d |d d d f }|
|f|dd   }|S )Nr   r   r   rv   )r\   rQ   r   r   r   )rQ   r   r   r   r   r   self_outputsconcat_self_outputsr   attention_outputword_attention_outputentity_attention_outputr   r#   r#   r$   re     s&   
zLukeAttention.forwardr   r   r   r   rA   re   rj   r#   r#   rS   r$   r     s
    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )LukeIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r}   )r@   rA   r   rq   rD   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrP   rS   r#   r$   rA   9  s
   
zLukeIntermediate.__init__r,   r   c                 C   s   |  |}| |}|S r}   )r   r   rQ   r,   r#   r#   r$   re   A  s   

zLukeIntermediate.forwardr   r#   r#   rS   r$   r   8  s    r   c                       r   )
LukeOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r@   rA   r   rq   r   rD   r   rK   rL   rM   rN   rO   rP   rS   r#   r$   rA   I  r   zLukeOutput.__init__r,   r   r   c                 C   r   r}   r   r   r#   r#   r$   re   O  r   zLukeOutput.forwardr   r#   r#   rS   r$   r   H  r   r   c                       s2   e Zd Z fddZ		d	ddZdd Z  ZS )
	LukeLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S Nr   )
r@   rA   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   rP   rS   r#   r$   rA   W  s   


zLukeLayer.__init__NFc                 C   s   | d}| j||||d}|d u r|d }ntj|d d dd}|dd  }t| j| j| j|}	|	d d d |d d f }
|d u rFd }n|	d d |d d d f }|
|f| }|S )Nr   )r   r   r   rv   )r\   r   r   r   r   feed_forward_chunkr   r   )rQ   r   r   r   r   r   self_attention_outputsconcat_attention_outputr   layer_outputword_layer_outputentity_layer_outputr#   r#   r$   re   _  s(   

zLukeLayer.forwardc                 C   s   |  |}| ||}|S r}   )r   r   )rQ   r   intermediate_outputr   r#   r#   r$   r     s   
zLukeLayer.feed_forward_chunkr   )r   r   r   rA   re   r   rj   r#   r#   rS   r$   r   V  s    
#r   c                       s.   e Zd Z fddZ				dddZ  ZS )	LukeEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r#   )r   ).0_rR   r#   r$   
<listcomp>  s    z(LukeEncoder.__init__.<locals>.<listcomp>F)	r@   rA   rR   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrP   rS   r   r$   rA     s   
 
zLukeEncoder.__init__NFTc                 C   s   |rdnd }|r
dnd }|rdnd }	t | jD ],\}
}|r'||f }||f }|||||}|d }|d ur:|d }|rC|	|d f }	q|rP||f }||f }|s`tdd |||	||fD S t|||	||dS )Nr#   r   r   r   c                 s       | ]	}|d ur|V  qd S r}   r#   r   vr#   r#   r$   	<genexpr>      z&LukeEncoder.forward.<locals>.<genexpr>)last_hidden_stater,   r-   r   r   )	enumerater   r"   r%   )rQ   r   r   r   r   output_hidden_statesreturn_dictall_word_hidden_statesall_entity_hidden_statesall_self_attentionsilayer_modulelayer_outputsr#   r#   r$   re     sL   	



zLukeEncoder.forward)NFFTr   r#   r#   rS   r$   r     s    
r   c                       r   )
LukePoolerc                    s*   t    t|j|j| _t | _d S r}   )r@   rA   r   rq   rD   r   Tanh
activationrP   rS   r#   r$   rA     s   
zLukePooler.__init__r,   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )rQ   r,   first_token_tensorpooled_outputr#   r#   r$   re     s   

zLukePooler.forwardr   r#   r#   rS   r$   r    s    r  c                       $   e Zd Z fddZdd Z  ZS )EntityPredictionHeadTransformc                    sV   t    t|j|j| _t|jt	rt
|j | _n|j| _tj|j|jd| _d S r   )r@   rA   r   rq   rD   ro   r   r   r   r   r	   transform_act_fnrK   rL   rP   rS   r#   r$   rA     s   
z&EntityPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r}   )r   r  rK   r   r#   r#   r$   re     s   


z%EntityPredictionHeadTransform.forwardr   r#   r#   rS   r$   r    s    	r  c                       r  )EntityPredictionHeadc                    sH   t    || _t|| _tj|j|jdd| _	t
t|j| _d S )NFrl   )r@   rA   rR   r  	transformr   rq   ro   rn   decoder	Parameterr   r]   rm   rP   rS   r#   r$   rA     s
   

zEntityPredictionHead.__init__c                 C   s   |  |}| || j }|S r}   )r
  r  rm   r   r#   r#   r$   re     s   
zEntityPredictionHead.forwardr   r#   r#   rS   r$   r	    s    r	  c                   @   s>   e Zd ZU eed< dZdZddgZe	 de
jfddZd	S )
LukePreTrainedModelrR   lukeTr   rk   modulec                 C   s   t |tjr tj|jd| jjd |jdurt	|j dS dS t |tj
rX|jdkr2t	|j ntj|jd| jjd |jdurTt|jddsVt	|j|j  dS dS dS t |tjrlt	|j t|j dS dS )zInitialize the weightsg        )meanstdNr   _is_hf_initializedF)r   r   rq   initnormal_weightrR   initializer_rangerm   zeros_rB   embedding_dimr=   getattrrK   ones_)rQ   r  r#   r#   r$   _init_weights   s    

z!LukePreTrainedModel._init_weightsN)r   r   r   r   r!   base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   no_gradr   Moduler  r#   r#   r#   r$   r    s   
 r  zt
    The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any
    c                       s  e Zd Zd!dedef fddZdd Zdd	 Zd
d Zdd Z	e
												d"dejdB dejdB dejdB dejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dedB deeB fddZdejdejdB fdd Z  ZS )#	LukeModelTrR   add_pooling_layerc                    sN   t  | || _t|| _t|| _t|| _|rt	|nd| _
|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r@   rA   rR   r;   rd   rk   rp   r   encoderr  pooler	post_init)rQ   rR   r"  rS   r#   r$   rA     s   


zLukeModel.__init__c                 C      | j jS r}   rd   rF   rQ   r#   r#   r$   get_input_embeddings+     zLukeModel.get_input_embeddingsc                 C      || j _d S r}   r'  rQ   r   r#   r#   r$   set_input_embeddings.     zLukeModel.set_input_embeddingsc                 C   s   | j j S r}   rp   r(  r#   r#   r$   get_entity_embeddings1  r*  zLukeModel.get_entity_embeddingsc                 C   s   || j _ d S r}   r/  r,  r#   r#   r$   set_entity_embeddings4  r.  zLukeModel.set_entity_embeddingsNr`   r   ra   r_   rs   entity_attention_maskentity_token_type_idsentity_position_idsrb   r   r   r   r   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur*|	dur*td|dur9| || | }n|	durF|	 dd }ntd|\}}|durU|jn|	j}|du retj	||f|d}|du rrtj
|tj|d}|dur|d}|du rtj	||f|d}|du rtj
||ftj|d}| j||||	d}| ||}|du rd}n| |||}| j||||
||d	}|d
 }| jdur| |nd}|s||f|dd  S t|||j|j|j|jdS )uz  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeModel

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
        >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
        # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"

        >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        # Input Wikipedia entities to obtain enriched contextualized representations of word tokens

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entities = [
        ...     "Beyoncé",
        ...     "Los Angeles",
        ... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"

        >>> encoding = tokenizer(
        ...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
        ... )
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timerU   z5You have to specify either input_ids or inputs_embeds)rX   rV   r   )r`   r_   ra   rb   )r   r   r   r   r   )r   pooler_outputr,   r-   r   r   )rR   r   r   use_return_dictr   %warn_if_padding_and_no_attention_maskr\   rX   r   onesr]   r^   rd   get_extended_attention_maskrp   r#  r$  r   r,   r-   r   r   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rb   r   r   r   kwargsrc   
batch_size
seq_lengthrX   entity_seq_lengthword_embedding_outputextended_attention_maskentity_embedding_outputencoder_outputssequence_outputr  r#   r#   r$   re   7  sl   I


zLukeModel.forwardword_attention_maskc                 C   s   |}|durt j||gdd}| dkr$|dddddddf }n| dkr7|ddddddf }n	td|j d|j| jd}d	| t | jj }|S )
ac  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            word_attention_mask (`torch.LongTensor`):
                Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
            entity_attention_mask (`torch.LongTensor`, *optional*):
                Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        NrU   rv   r   r   z&Wrong shape for attention_mask (shape ))rW   g      ?)	r   r   rw   r   shaperZ   rW   finfort   )rQ   rC  r2  r   r?  r#   r#   r$   r9    s   z%LukeModel.get_extended_attention_mask)T)NNNNNNNNNNNN)r   r   r   r   boolrA   r)  r-  r0  r1  r   r   r~   r    r"   r   re   r9  rj   r#   r#   rS   r$   r!    sh    	
 r!  c                 C   s2   |  | }tj|dd|| }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   rv   )ner   r   cumsumrz   r^   )r`   r=   maskincremental_indicesr#   r#   r$   rY     s   rY   c                       s(   e Zd ZdZ fddZdd Z  ZS )
LukeLMHeadz*Roberta Head for masked language modeling.c                    sZ   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _d S r   )r@   rA   r   rq   rD   r   rK   rL   
layer_normrC   r  r  r   r]   rm   rP   rS   r#   r$   rA     s
   
zLukeLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r}   )r   r
   rM  r  )rQ   featuresr:  r   r#   r#   r$   re     s
   


zLukeLMHead.forward)r   r   r   r   rA   re   rj   r#   r#   rS   r$   rL    s    rL  z
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
    c                !       s   e Zd ZdddZ fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
	
	
	
	
	
ddej	d
B dej
d
B dej	d
B dej	d
B dej	d
B dej	d
B dej	d
B dej	d
B dej	d
B dej	d
B dej
d
B ded
B ded
B ded
B deeB fddZ  ZS )LukeForMaskedLMz/luke.entity_embeddings.entity_embeddings.weightzlm_head.decoder.bias)z!entity_predictions.decoder.weightzlm_head.biasc                    s@   t  | t|| _t|| _t|| _t	 | _
|   d S r}   )r@   rA   r!  r  rL  lm_headr	  entity_predictionsr   r   loss_fnr%  rP   rS   r#   r$   rA     s   



zLukeForMaskedLM.__init__c                 C   r&  r}   rP  r  r(  r#   r#   r$   get_output_embeddings*  r*  z%LukeForMaskedLM.get_output_embeddingsc                 C   r+  r}   rS  )rQ   new_embeddingsr#   r#   r$   set_output_embeddings-  r.  z%LukeForMaskedLM.set_output_embeddingsNr`   r   ra   r_   rs   r2  r3  r4  labelsentity_labelsrb   r   r   r   r   c                 K   s,  |dur|n| j j}| j|||||||||||dd}d}d}| |j}|	durD|	|j}	| |d| j j	|	d}|du rD|}d}d}|j
durq| |j
}|
durq| |d| j j|
d}|du rm|}n|| }|stdd ||||||j|j|jfD S t||||||j|j|jdS )aC  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NTr`   r   ra   r_   rs   r2  r3  r4  rb   r   r   r   rU   c                 s   r   r}   r#   r   r#   r#   r$   r     s    
z*LukeForMaskedLM.forward.<locals>.<genexpr>)r'   r(   r)   r*   r+   r,   r   r-   )rR   r6  r  rP  r   rZ   rX   rR  r   rC   r   rQ  rn   r"   r,   r   r-   r&   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rW  rX  rb   r   r   r   r:  r   r'   r(   r*   r)   r+   r#   r#   r$   re   0  sl   1
zLukeForMaskedLM.forwardNNNNNNNNNNNNNN)r   r   r   _tied_weights_keysrA   rT  rV  r   r   r~   r    rG  r"   r&   re   rj   r#   r#   rS   r$   rO    sj    	
rO  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
    token) for entity classification tasks, such as Open Entity.
    c                          e Zd Z fddZe													ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dejdB dejdB dejdB dedB dedB dedB de	e
B fddZ  ZS )LukeForEntityClassificationc                    sJ   t  | t|| _|j| _t|j| _t	|j
|j| _|   d S r}   r@   rA   r!  r  
num_labelsr   rM   rN   rO   rq   rD   
classifierr%  rP   rS   r#   r$   rA     s   
z$LukeForEntityClassification.__init__Nr`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r   c                 K   s   |dur|n| j j}| j|||||||||	||dd}|jdddddf }| |}| |}d}|
durZ|
|j}
|
jdkrJt	j
||
}nt	j
|d|
d|}|smtdd |||j|j|jfD S t|||j|j|jd	S )
u
  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
        >>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: person
        ```NTrY  r   r   rU   c                 s   r   r}   r#   r   r#   r#   r$   r         z6LukeForEntityClassification.forward.<locals>.<genexpr>r'   r*   r,   r   r-   )rR   r6  r  r   rO   r`  rZ   rX   ndimr   r   cross_entropy binary_cross_entropy_with_logitsr   rz   r"   r,   r   r-   r/   rQ   r`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r:  r   feature_vectorr*   r'   r#   r#   r$   re     sF   >


 z#LukeForEntityClassification.forwardNNNNNNNNNNNNN)r   r   r   rA   r   r   r~   r    rG  r"   r/   re   rj   r#   r#   rS   r$   r]    Z    	
r]  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
    tokens) for entity pair classification tasks, such as TACRED.
    c                       s   e Zd Z fddZe													ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dejdB dejdB dejdB dedB dedB dedB de	e
B fddZ  ZS )LukeForEntityPairClassificationc                    sP   t  | t|| _|j| _t|j| _t	|j
d |jd| _|   d S )Nr   Fr^  rP   rS   r#   r$   rA   ,  s   
z(LukeForEntityPairClassification.__init__Nr`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r   c                 K   s  |dur|n| j j}| j|||||||||	||dd}tj|jdddddf |jdddddf gdd}| |}| |}d}|
durk|
|j	}
|
j
dkr[tj||
}ntj|d|
d|}|s~tdd	 |||j|j|jfD S t|||j|j|jd
S )u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityPairClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
        >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: per:cities_of_residence
        ```NTrY  r   r   rv   rU   c                 s   r   r}   r#   r   r#   r#   r$   r     ra  z:LukeForEntityPairClassification.forward.<locals>.<genexpr>rb  )rR   r6  r  r   r   r   rO   r`  rZ   rX   rc  r   r   rd  re  r   rz   r"   r,   r   r-   r3   rf  r#   r#   r$   re   8  sJ   A0


 z'LukeForEntityPairClassification.forwardrh  )r   r   r   rA   r   r   r~   r    rG  r"   r3   re   rj   r#   r#   rS   r$   rj  %  ri  rj  z
    The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
    such as named entity recognition.
    c                #       s   e Zd Z fddZe															ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dedB de	e
B f ddZ  ZS )LukeForEntitySpanClassificationc                    sN   t  | t|| _|j| _t|j| _t	|j
d |j| _|   d S )Nr   r^  rP   rS   r#   r$   rA     s   
z(LukeForEntitySpanClassification.__init__Nr`   r   ra   r_   rs   r2  r3  r4  entity_start_positionsentity_end_positionsrb   rW  r   r   r   r   c                 K   s  |dur|n| j j}| j|||||||||||dd}|jd}|	ddd|}	|	j|jjkr9|	|jj}	t	
|jd|	}|
ddd|}
|
j|jjkrY|
|jj}
t	
|jd|
}t	j|||jgdd}| |}| |}d}|dur||j}|jdkrtj|d| j|d}ntj|d|d|}|stdd	 |||j|j|jfD S t|||j|j|jd
S )u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        entity_start_positions (`torch.LongTensor`):
            The start positions of entities in the word token sequence.
        entity_end_positions (`torch.LongTensor`):
            The end positions of entities in the word token sequence.
        labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
            entropy loss is used for the single-label classification. In this case, labels should contain the indices
            that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length,
            num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
            labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntitySpanClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
        >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

        >>> text = "Beyoncé lives in Los Angeles"
        # List all possible entity spans in the text

        >>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
        >>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
        >>> entity_spans = []
        >>> for i, start_pos in enumerate(word_start_positions):
        ...     for end_pos in word_end_positions[i:]:
        ...         entity_spans.append((start_pos, end_pos))

        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
        >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
        ...     if predicted_class_idx != 0:
        ...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
        Beyoncé PER
        Los Angeles LOC
        ```NTrY  rU   ru   r   rv   c                 s   r   r}   r#   r   r#   r#   r$   r   :  ra  z:LukeForEntitySpanClassification.forward.<locals>.<genexpr>rb  )rR   r6  r  r   r\   rg   rh   rX   rZ   r   gatherr   r   rO   r`  rc  r   r   rd  r   r_  re  rz   r"   r,   r   r-   r4   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rl  rm  rb   rW  r   r   r   r:  r   rD   start_states
end_statesrg  r*   r'   r#   r#   r$   re     sX   O


  z'LukeForEntitySpanClassification.forward)NNNNNNNNNNNNNNN)r   r   r   rA   r   r   r~   r    rG  r"   r4   re   rj   r#   r#   rS   r$   rk    sf    	
rk  z
    The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       r\  )LukeForSequenceClassificationc                    sZ   t  | |j| _t|| _t|jd ur|jn|j| _	t
|j|j| _|   d S r}   r@   rA   r_  r!  r  r   rM   classifier_dropoutrN   rO   rq   rD   r`  r%  rP   rS   r#   r$   rA   P  s   
z&LukeForSequenceClassification.__init__Nr`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r   c                 K   s  |dur|n| j j}| j|||||||||	||dd}|j}| |}| |}d}|
dur|
|j}
| j jdu r^| j	dkrDd| j _n| j	dkrZ|
j
tjksU|
j
tjkrZd| j _nd| j _| j jdkr|t }| j	dkrv|| |
 }n+|||
}n%| j jdkrt }||d| j	|
d}n| j jdkrt }|||
}|std	d
 |||j|j|jfD S t|||j|j|jdS )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTrY  r   
regressionsingle_label_classificationmulti_label_classificationrU   c                 s   r   r}   r#   r   r#   r#   r$   r     ra  z8LukeForSequenceClassification.forward.<locals>.<genexpr>rb  )rR   r6  r  r5  rO   r`  rZ   rX   problem_typer_  rW   r   r^   r   r   squeezer   r   r   r"   r,   r   r-   r5   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r:  r   r  r*   r'   loss_fctr#   r#   r$   re   \  sb   +



"


z%LukeForSequenceClassification.forwardrh  )r   r   r   rA   r   r   r~   r    rG  r"   r5   re   rj   r#   r#   rS   r$   rq  I  ri  rq  z
    The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
    solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
    class.
    c                       r\  )LukeForTokenClassificationc                    s^   t  | |j| _t|dd| _t|jd ur|jn|j| _	t
|j|j| _|   d S NF)r"  rr  rP   rS   r#   r$   rA     s   z#LukeForTokenClassification.__init__Nr`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r   c                 K   s   |dur|n| j j}| j|||||||||	||dd}|j}| |}| |}d}|
durD|
|j}
t }||	d| j
|
	d}|sWtdd |||j|j|jfD S t|||j|j|jdS )aM  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        NTrY  rU   c                 s   r   r}   r#   r   r#   r#   r$   r   $  ra  z5LukeForTokenClassification.forward.<locals>.<genexpr>rb  )rR   r6  r  r   rO   r`  rZ   rX   r   r   r_  r"   r,   r   r-   r6   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r:  r   rB  r*   r'   ry  r#   r#   r$   re     sD   +

z"LukeForTokenClassification.forwardrh  )r   r   r   rA   r   r   r~   r    rG  r"   r6   re   rj   r#   r#   rS   r$   rz    sZ    	
rz  c                !       s   e Zd Z fddZe														ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dejdB dejdB dejdB dejdB dedB dedB dedB de	e
B fddZ  ZS )LukeForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r{  )
r@   rA   r_  r!  r  r   rq   rD   
qa_outputsr%  rP   rS   r#   r$   rA   5  s
   z!LukeForQuestionAnswering.__init__Nr`   r   ra   r_   rs   r2  r3  r4  rb   start_positionsend_positionsr   r   r   r   c                 K   sB  |dur|n| j j}| j|||||||||	||dd}|j}| |}|jddd\}}|d}|d}d}|
dur|durt|
 dkrM|
d}
t| dkrZ|d}|d}|
	d| |	d| t
|d}|||
}|||}|| d	 }|std
d ||||j|j|jfD S t||||j|j|jdS )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        NTrY  r   rU   rv   r   )ignore_indexr   c                 s   r   r}   r#   r   r#   r#   r$   r     s    	z3LukeForQuestionAnswering.forward.<locals>.<genexpr>)r'   r8   r9   r,   r   r-   )rR   r6  r  r   r}  splitrx  lenr\   clamp_r   r"   r,   r   r-   r7   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rb   r~  r  r   r   r   r:  r   rB  r*   r8   r9   
total_lossignored_indexry  
start_lossend_lossr#   r#   r$   re   @  sf   (








z LukeForQuestionAnswering.forwardrZ  )r   r   r   rA   r   r   r~   r    rG  r"   r7   re   rj   r#   r#   rS   r$   r|  3  s`    	
r|  c                       r\  )LukeForMultipleChoicec                    sP   t  | t|| _t|jd ur|jn|j| _t	|j
d| _|   d S r   )r@   rA   r!  r  r   rM   rs  rN   rO   rq   rD   r`  r%  rP   rS   r#   r$   rA     s   
zLukeForMultipleChoice.__init__Nr`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r   c                 K   s  |dur|n| j j}|dur|jd n|	jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|	dure|	d|	d|	dnd}	|durt|d|dnd}|dur|d|dnd}|dur|d|dnd}|dur|d|d|dnd}| j|||||||||	||dd}|j}| |}| |}|d|}d}|
dur|
	|j
}
t }|||
}|stdd |||j|j|jfD S t|||j|j|jd	S )
a^  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rU   ru   TrY  c                 s   r   r}   r#   r   r#   r#   r$   r   5  r   z0LukeForMultipleChoice.forward.<locals>.<genexpr>rb  )rR   r6  rE  r   r\   r  r5  rO   r`  rZ   rX   r   r"   r,   r   r-   r:   )rQ   r`   r   ra   r_   rs   r2  r3  r4  rb   rW  r   r   r   r:  num_choicesr   r  r*   reshaped_logitsr'   ry  r#   r#   r$   re     s|   C


zLukeForMultipleChoice.forwardrh  )r   r   r   rA   r   r   r~   r    rG  r"   r:   re   rj   r#   r#   rS   r$   r    sZ    	
r  )
r]  rj  rk  r  r|  rq  rz  rO  r!  r  )Fr   r   dataclassesr   r   r   torch.nnr   r   r    r   r  activationsr	   r
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_luker   
get_loggerr   loggerr   r%   r&   r/   r3   r4   r5   r6   r7   r:   r   r;   rk   r   r   r   r   r   r   r   r  r  r	  r  r!  rY   rL  rO  r]  rj  rk  rq  rz  r|  r  __all__r#   r#   r#   r$   <module>   s   
G+m*2B R {  wfu !