o
    wi                    @   s  d Z ddlZddlmZ ddlmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ e e!Z"eeddG dd deZ#eeddG dd deZ$eeddG dd deZ%eeddG dd deZ&eeddG dd deZ'eeddG d d! d!eZ(eed"dG d#d$ d$eZ)eed%dG d&d' d'eZ*eed(dG d)d* d*eZ+eed+dG d,d- d-eZ,G d.d/ d/e	j-Z.G d0d1 d1e	j-Z/G d2d3 d3e	j-Z0G d4d5 d5e	j-Z1G d6d7 d7e	j-Z2G d8d9 d9e	j-Z3G d:d; d;e	j-Z4G d<d= d=eZ5G d>d? d?e	j-Z6G d@dA dAe	j-Z7G dBdC dCe	j-Z8G dDdE dEe	j-Z9eG dFdG dGeZ:edHdG dIdJ dJe:Z;dKdL Z<G dMdN dNe	j-Z=edOdG dPdQ dQe:Z>edRdG dSdT dTe:Z?edUdG dVdW dWe:Z@edXdG dYdZ dZe:ZAed[dG d\d] d]e:ZBed^dG d_d` d`e:ZCeG dadb dbe:ZDeG dcdd dde:ZEg deZFdS )fzPyTorch LUKE model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)apply_chunking_to_forward)ModelOutputauto_docstringlogging   )
LukeConfigz3
    Base class for outputs of the LUKE model.
    )custom_introc                   @   >   e Zd ZU dZdZeej ed< dZ	ee
ejdf  ed< dS )BaseLukeModelOutputWithPoolingax  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) further processed by a
        Linear layer and a Tanh activation function.
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nentity_last_hidden_state.entity_hidden_states__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tuple r$   r$   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/luke/modeling_luke.pyr   &   s   
 r   zV
    Base class for model's outputs, with potential hidden states and attentions.
    c                   @   r   )BaseLukeModelOutputa  
    entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
        Sequence of entity hidden-states at the output of the last layer of the model.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr   .r   r   r$   r$   r$   r%   r&   =   s   
 	r&   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeeej  ed< dZeeejd	f  ed
< dZeeejd	f  ed< dS )LukeMaskedLMOutputa:  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        The sum of masked language modeling (MLM) loss and entity prediction loss.
    mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked language modeling (MLM) loss.
    mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Masked entity prediction (MEP) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nlossmlm_lossmep_losslogitsentity_logitshidden_states.r   
attentions)r   r   r   r   r(   r   r    r!   r"   r)   r*   r+   r,   r-   r#   r   r.   r$   r$   r$   r%   r'   Q   s   
 r'   z2
    Outputs of entity classification models.
    c                   @      e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	EntityClassificationOutput  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr(   r+   .r-   r   r.   r   r   r   r   r(   r   r    r!   r"   r+   r-   r#   r   r.   r$   r$   r$   r%   r0   s      
 r0   z7
    Outputs of entity pair classification models.
    c                   @   r/   )	EntityPairClassificationOutputr1   Nr(   r+   .r-   r   r.   r2   r$   r$   r$   r%   r4      r3   r4   z7
    Outputs of entity span classification models.
    c                   @   r/   )	EntitySpanClassificationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr(   r+   .r-   r   r.   r2   r$   r$   r$   r%   r5      r3   r5   z4
    Outputs of sentence classification models.
    c                   @   r/   )	LukeSequenceClassifierOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr(   r+   .r-   r   r.   r2   r$   r$   r$   r%   r6      r3   r6   z@
    Base class for outputs of token classification models.
    c                   @   r/   )	LukeTokenClassifierOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr(   r+   .r-   r   r.   r2   r$   r$   r$   r%   r7      r3   r7   z/
    Outputs of question answering models.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeejdf  ed< dZeeejdf  ed< dZeeejdf  ed	< dS )
 LukeQuestionAnsweringModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr(   start_logits
end_logits.r-   r   r.   )r   r   r   r   r(   r   r    r!   r"   r9   r:   r-   r#   r   r.   r$   r$   r$   r%   r8      s   
 	r8   z,
    Outputs of multiple choice models.
    c                   @   r/   )	LukeMultipleChoiceModelOutputa  
    loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
        Classification loss.
    logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
        *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

        Classification scores (before SoftMax).
    entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
        layer plus the initial entity embedding outputs.
    Nr(   r+   .r-   r   r.   r2   r$   r$   r$   r%   r;     s   
 r;   c                       s:   e Zd ZdZ fddZ				d	ddZdd Z  ZS )
LukeEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _|j| _tj|j|j| jd| _	d S )Npadding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr>   selfconfig	__class__r$   r%   rB   (  s   
zLukeEmbeddings.__init__Nc           	      C   s   |d u r|d urt || j|j}n| |}|d ur!| }n| d d }|d u r8tj|tj| j	jd}|d u rA| 
|}| |}| |}|| | }| |}| |}|S )Ndtypedevice)"create_position_ids_from_input_idsr>   torY   &create_position_ids_from_inputs_embedssizer    zeroslongposition_idsrG   rI   rK   rL   rP   )	rR   	input_idstoken_type_idsr`   inputs_embedsinput_shaperI   rK   
embeddingsr$   r$   r%   forward9  s"   






zLukeEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrV   r   rW   r   )r]   r    aranger>   r_   rY   	unsqueezeexpand)rR   rc   rd   sequence_lengthr`   r$   r$   r%   r\   Z  s   	z5LukeEmbeddings.create_position_ids_from_inputs_embeds)NNNN)r   r   r   r   rB   rf   r\   __classcell__r$   r$   rT   r%   r<   #  s    
!r<   c                       sF   e Zd Zdef fddZ	d
dejdejdeej fdd	Z  Z	S )LukeEntityEmbeddingsrS   c                    s   t    || _tj|j|jdd| _|j|jkr$tj	|j|jdd| _
t|j|j| _t|j|j| _tj|j|jd| _t|j| _d S )Nr   r=   Fbiasr?   )rA   rB   rS   r   rC   entity_vocab_sizeentity_emb_sizeentity_embeddingsrE   Linearentity_embedding_denserH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rT   r$   r%   rB   m  s   
zLukeEntityEmbeddings.__init__N
entity_idsr`   rb   c           	      C   s   |d u r	t |}| |}| jj| jjkr| |}| |jdd}|dk	|
d}|| }t j|dd}||jddjdd }| |}|| | }| |}| |}|S )Nr   )minrV   dimgHz>)r    
zeros_likerq   rS   rp   rE   rs   rI   clamptype_asrh   sumrK   rL   rP   )	rR   rt   r`   rb   rq   rI   position_embedding_maskrK   re   r$   r$   r%   rf   {  s   





zLukeEntityEmbeddings.forwardN)
r   r   r   r   rB   r    
LongTensorr   rf   rk   r$   r$   rT   r%   rl   l  s    rl   c                       4   e Zd Z fddZdd Z			d	ddZ  ZS )
LukeSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _|j	| _	t
|j| j| _t
|j| j| _t
|j| j| _| j	rpt
|j| j| _t
|j| j| _t
|j| j| _t
|j| _d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)rA   rB   rE   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizeuse_entity_aware_attentionr   rr   querykeyvalue	w2e_query	e2w_query	e2e_queryrN   attention_probs_dropout_probrP   rQ   rT   r$   r%   rB     s&   

zLukeSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )NrV   r      r   r	   )r]   r   r   viewpermute)rR   xnew_x_shaper$   r$   r%   transpose_for_scores  s   
z&LukeSelfAttention.transpose_for_scoresNFc                  C   s  | d}|d u r|}n	tj||gdd}| | |}| | |}	| jr|d ur| | |}
| | |}| | 	|}| | 
|}|d d d d d |d d f }|d d d d d |d d f }|d d d d |d d d f }|d d d d |d d d f }t|
|dd}t||dd}t||dd}t||dd}tj||gdd}tj||gdd}tj||gdd}n| | |}t||dd}|t| j }|d ur|| }tjj|dd}| |}|d ur|| }t||	}|dddd }|  d d | jf }|j| }|d d d |d d f }|d u r>d }n|d d |d d d f }|rU|||f}|S ||f}|S )Nr   rw   rV   rv   r	   r   r   )r]   r    catr   r   r   r   r   r   r   r   matmul	transposemathsqrtr   r   
functionalsoftmaxrP   r   
contiguousr   r   ) rR   word_hidden_statesr   attention_mask	head_maskoutput_attentions	word_sizeconcat_hidden_states	key_layervalue_layerw2w_query_layerw2e_query_layere2w_query_layere2e_query_layerw2w_key_layere2w_key_layerw2e_key_layere2e_key_layerw2w_attention_scoresw2e_attention_scorese2w_attention_scorese2e_attention_scoresword_attention_scoresentity_attention_scoresattention_scoresquery_layerattention_probscontext_layernew_context_layer_shapeoutput_word_hidden_statesoutput_entity_hidden_statesoutputsr$   r$   r%   rf     sX   
    




zLukeSelfAttention.forwardNNF)r   r   r   rB   r   rf   rk   r$   r$   rT   r%   r     s    	r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )LukeSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr?   )rA   rB   r   rr   rE   denserL   rM   rN   rO   rP   rQ   rT   r$   r%   rB   
     
zLukeSelfOutput.__init__r-   input_tensorreturnc                 C   &   |  |}| |}| || }|S r~   r   rP   rL   rR   r-   r   r$   r$   r%   rf        

zLukeSelfOutput.forwardr   r   r   rB   r    Tensorrf   rk   r$   r$   rT   r%   r   	      $r   c                       r   )
LukeAttentionc                    s*   t    t|| _t|| _t | _d S r~   )rA   rB   r   rR   r   outputsetpruned_headsrQ   rT   r$   r%   rB     s   


zLukeAttention.__init__c                 C      t dNz4LUKE does not support the pruning of attention headsNotImplementedError)rR   headsr$   r$   r%   prune_heads     zLukeAttention.prune_headsNFc                 C   s   | d}| |||||}|d u r|d }|}	ntj|d d dd}tj||gdd}	| ||	}
|
d d d |d d f }|d u rGd }n|
d d |d d d f }||f|dd   }|S )Nr   r   r   rw   )r]   rR   r    r   r   )rR   r   r   r   r   r   r   self_outputsconcat_self_outputsr   attention_outputword_attention_outputentity_attention_outputr   r$   r$   r%   rf   !  s(   
zLukeAttention.forwardr   )r   r   r   rB   r   rf   rk   r$   r$   rT   r%   r     s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )LukeIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r~   )rA   rB   r   rr   rE   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnrQ   rT   r$   r%   rB   H  s
   
zLukeIntermediate.__init__r-   r   c                 C   s   |  |}| |}|S r~   )r   r   rR   r-   r$   r$   r%   rf   P  s   

zLukeIntermediate.forwardr   r$   r$   rT   r%   r   G  s    r   c                       r   )
LukeOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )rA   rB   r   rr   r   rE   r   rL   rM   rN   rO   rP   rQ   rT   r$   r%   rB   X  r   zLukeOutput.__init__r-   r   r   c                 C   r   r~   r   r   r$   r$   r%   rf   ^  r   zLukeOutput.forwardr   r$   r$   rT   r%   r   W  r   r   c                       s4   e Zd Z fddZ			d	ddZdd Z  ZS )
	LukeLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S Nr   )
rA   rB   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   rQ   rT   r$   r%   rB   f  s   


zLukeLayer.__init__NFc                 C   s   | d}| j|||||d}|d u r|d }ntj|d d dd}|dd  }	t| j| j| j|}
|
d d d |d d f }|d u rGd }n|
d d |d d d f }||f|	 }	|	S )Nr   )r   r   r   rw   )r]   r   r    r   r   feed_forward_chunkr   r   )rR   r   r   r   r   r   r   self_attention_outputsconcat_attention_outputr   layer_outputword_layer_outputentity_layer_outputr$   r$   r%   rf   n  s*   

zLukeLayer.forwardc                 C   s   |  |}| ||}|S r~   )r   r   )rR   r   intermediate_outputr   r$   r$   r%   r     s   
zLukeLayer.feed_forward_chunkr   )r   r   r   rB   rf   r   rk   r$   r$   rT   r%   r   e  s    
%r   c                       s0   e Zd Z fddZ					dddZ  ZS )	LukeEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r$   )r   ).0_rS   r$   r%   
<listcomp>  s    z(LukeEncoder.__init__.<locals>.<listcomp>F)	rA   rB   rS   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrQ   rT   r   r%   rB     s   
 
zLukeEncoder.__init__NFTc                 C   s   |rdnd }|r
dnd }	|rdnd }
t | jD ]7\}}|r'||f }|	|f }	|d ur/|| nd }||||||}|d }|d urE|d }|rN|
|d f }
q|r[||f }|	|f }	|sktdd |||
||	fD S t|||
||	dS )Nr$   r   r   r   c                 s       | ]	}|d ur|V  qd S r~   r$   r   vr$   r$   r%   	<genexpr>      z&LukeEncoder.forward.<locals>.<genexpr>)last_hidden_stater-   r.   r   r   )	enumerater   r#   r&   )rR   r   r   r   r   r   output_hidden_statesreturn_dictall_word_hidden_statesall_entity_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputsr$   r$   r%   rf     sP   




zLukeEncoder.forward)NNFFTr   r   r   rB   rf   rk   r$   r$   rT   r%   r     s    
r   c                       r   )
LukePoolerc                    s*   t    t|j|j| _t | _d S r~   )rA   rB   r   rr   rE   r   Tanh
activationrQ   rT   r$   r%   rB     s   
zLukePooler.__init__r-   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )rR   r-   first_token_tensorpooled_outputr$   r$   r%   rf     s   

zLukePooler.forwardr   r$   r$   rT   r%   r    s    r  c                       $   e Zd Z fddZdd Z  ZS )EntityPredictionHeadTransformc                    sV   t    t|j|j| _t|jt	rt
|j | _n|j| _tj|j|jd| _d S r   )rA   rB   r   rr   rE   rp   r   r   r   r   r
   transform_act_fnrL   rM   rQ   rT   r$   r%   rB     s   
z&EntityPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r~   )r   r  rL   r   r$   r$   r%   rf     s   


z%EntityPredictionHeadTransform.forwardr  r$   r$   rT   r%   r    s    	r  c                       r  )EntityPredictionHeadc                    sH   t    || _t|| _tj|j|jdd| _	t
t|j| _d S )NFrm   )rA   rB   rS   r  	transformr   rr   rp   ro   decoder	Parameterr    r^   rn   rQ   rT   r$   r%   rB     s
   

zEntityPredictionHead.__init__c                 C   s   |  |}| || j }|S r~   )r  r  rn   r   r$   r$   r%   rf     s   
zEntityPredictionHead.forwardr  r$   r$   rT   r%   r    s    r  c                   @   s0   e Zd ZeZdZdZddgZdej	fddZ
dS )	LukePreTrainedModellukeTr   rl   modulec                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rO|jdkr2|jj	  n|jjjd| jjd |jdurM|jj|j 	  dS dS t |tjrd|jj	  |jjd dS dS )zInitialize the weightsg        )meanstdNr         ?)r   r   rr   weightdatanormal_rS   initializer_rangern   zero_rC   embedding_dimr>   rL   fill_)rR   r  r$   r$   r%   _init_weights  s    


z!LukePreTrainedModel._init_weightsN)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr   Moduler'  r$   r$   r$   r%   r    s    r  zt
    The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any
    c                        s  e Zd Zd$dedef fddZdd Zdd	 Zd
d Zdd Z	dd Z
e													d%deej deej deej deej deej deej deej deej deej deej dee dee dee deeef fdd Zd!ejdeej fd"d#Z  ZS )&	LukeModelTrS   add_pooling_layerc                    sN   t  | || _t|| _t|| _t|| _|rt	|nd| _
|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)rA   rB   rS   r<   re   rl   rq   r   encoderr  pooler	post_init)rR   rS   r.  rT   r$   r%   rB   ,  s   


zLukeModel.__init__c                 C      | j jS r~   re   rG   rR   r$   r$   r%   get_input_embeddings=  r   zLukeModel.get_input_embeddingsc                 C      || j _d S r~   r3  rR   r   r$   r$   r%   set_input_embeddings@     zLukeModel.set_input_embeddingsc                 C   s   | j j S r~   rq   r4  r$   r$   r%   get_entity_embeddingsC  r   zLukeModel.get_entity_embeddingsc                 C   s   || j _ d S r~   r:  r7  r$   r$   r%   set_entity_embeddingsF  r9  zLukeModel.set_entity_embeddingsc                 C   r   r   r   )rR   heads_to_pruner$   r$   r%   _prune_headsI  r   zLukeModel._prune_headsNra   r   rb   r`   rt   entity_attention_maskentity_token_type_idsentity_position_idsr   rc   r   r  r  r   c              	   C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|
dur*td|dur9| || | }n|
durF|
 dd }ntd|\}}|durU|jn|
j}|du retj	||f|d}|du rrtj
|tj|d}|dur|d}|du rtj	||f|d}|du rtj
||ftj|d}| |	| j j}	| j||||
d}| ||}|du rd}n| |||}| j||||	|||d	}|d
 }| jdur| |nd}|s||f|dd  S t|||j|j|j|jdS )uz  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeModel

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
        >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
        # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"

        >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        # Input Wikipedia entities to obtain enriched contextualized representations of word tokens

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entities = [
        ...     "Beyoncé",
        ...     "Los Angeles",
        ... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"

        >>> encoding = tokenizer(
        ...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
        ... )
        >>> outputs = model(**encoding)
        >>> word_last_hidden_state = outputs.last_hidden_state
        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timerV   z5You have to specify either input_ids or inputs_embeds)rY   rW   r   )ra   r`   rb   rc   )r   r   r   r  r  r   )r  pooler_outputr-   r.   r   r   )rS   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr]   rY   r    onesr^   r_   get_head_maskr   re   get_extended_attention_maskrq   r/  r0  r   r-   r.   r   r   )rR   ra   r   rb   r`   rt   r?  r@  rA  r   rc   r   r  r  rd   
batch_size
seq_lengthrY   entity_seq_lengthword_embedding_outputextended_attention_maskentity_embedding_outputencoder_outputssequence_outputr  r$   r$   r%   rf   L  sp   I

zLukeModel.forwardword_attention_maskc                 C   s   |}|durt j||gdd}| dkr$|dddddddf }n| dkr7|ddddddf }n	td|j d|j| jd}d	| t | jj }|S )
ac  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            word_attention_mask (`torch.LongTensor`):
                Attention mask for word tokens with ones indicating tokens to attend to, zeros for tokens to ignore.
            entity_attention_mask (`torch.LongTensor`, *optional*):
                Attention mask for entity tokens with ones indicating tokens to attend to, zeros for tokens to ignore.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        NrV   rw   r	   r   z&Wrong shape for attention_mask (shape ))rX   r  )	r    r   rx   r   shaper[   rX   finforu   )rR   rP  r?  r   rL  r$   r$   r%   rG    s   z%LukeModel.get_extended_attention_mask)T)NNNNNNNNNNNNN)r   r   r   r   boolrB   r5  r8  r;  r<  r>  r   r   r    r   r!   r   r#   r   rf   rG  rk   r$   r$   rT   r%   r-  &  sp    	

 r-  c                 C   s2   |  | }tj|dd|| }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   rw   )ner   r    cumsumr{   r_   )ra   r>   maskincremental_indicesr$   r$   r%   rZ     s   rZ   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )
LukeLMHeadz*Roberta Head for masked language modeling.c                    sd   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _| j| j
_d S r   )rA   rB   r   rr   rE   r   rL   rM   
layer_normrD   r  r  r    r^   rn   rQ   rT   r$   r%   rB     s   
zLukeLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r~   )r   r   rZ  r  )rR   featureskwargsr   r$   r$   r%   rf   $  s
   


zLukeLMHead.forwardc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)r  rn   rY   typer4  r$   r$   r%   _tie_weights.  s   zLukeLMHead._tie_weights)r   r   r   r   rB   rf   r_  rk   r$   r$   rT   r%   rY    s
    	
rY  z
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
    c                $       s
  e Zd Zg dZ fddZ fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
dde	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e de	e de	e deeef f ddZ  ZS )LukeForMaskedLM)zlm_head.decoder.weightzlm_head.decoder.biasz!entity_predictions.decoder.weightc                    s@   t  | t|| _t|| _t|| _t	 | _
|   d S r~   )rA   rB   r-  r  rY  lm_headr  entity_predictionsr   r   loss_fnr1  rQ   rT   r$   r%   rB   @  s   



zLukeForMaskedLM.__init__c                    s$   t    | | jj| jjj d S r~   )rA   tie_weights_tie_or_clone_weightsrb  r  r  rq   r4  rT   r$   r%   rd  M  s   
zLukeForMaskedLM.tie_weightsc                 C   r2  r~   ra  r  r4  r$   r$   r%   get_output_embeddingsQ  r   z%LukeForMaskedLM.get_output_embeddingsc                 C   r6  r~   rf  )rR   new_embeddingsr$   r$   r%   set_output_embeddingsT  r9  z%LukeForMaskedLM.set_output_embeddingsNra   r   rb   r`   rt   r?  r@  rA  labelsentity_labelsr   rc   r   r  r  r   c                 C   s.  |dur|n| j j}| j||||||||||||dd}d}d}| |j}|	durE|	|j}	| |d| j j	|	d}|du rE|}d}d}|j
durr| |j
}|
durr| |d| j j|
d}|du rn|}n|| }|stdd ||||||j|j|jfD S t||||||j|j|jdS )aC  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NTra   r   rb   r`   rt   r?  r@  rA  r   rc   r   r  r  rV   c                 s   r   r~   r$   r   r$   r$   r%   r     s    
z*LukeForMaskedLM.forward.<locals>.<genexpr>)r(   r)   r*   r+   r,   r-   r   r.   )rS   rC  r  ra  r  r[   rY   rc  r   rD   r   rb  ro   r#   r-   r   r.   r'   )rR   ra   r   rb   r`   rt   r?  r@  rA  rj  rk  r   rc   r   r  r  r   r(   r)   r+   r*   r,   r$   r$   r%   rf   W  sn   1
zLukeForMaskedLM.forwardNNNNNNNNNNNNNNN)r   r   r   _tied_weights_keysrB   rd  rg  ri  r   r   r    r   r!   rT  r   r#   r'   rf   rk   r$   r$   rT   r%   r`  7  sn    	

r`  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
    token) for entity classification tasks, such as Open Entity.
    c                "          e Zd Z fddZe														ddeej deej deej deej deej d	eej d
eej deej deej deej deej dee	 dee	 dee	 de
eef fddZ  ZS )LukeForEntityClassificationc                    sJ   t  | t|| _|j| _t|j| _t	|j
|j| _|   d S r~   rA   rB   r-  r  
num_labelsr   rN   rO   rP   rr   rE   
classifierr1  rQ   rT   r$   r%   rB     s   
z$LukeForEntityClassification.__init__Nra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   c                 C   s   |dur|n| j j}| j|||||||||	|
||dd}|jdddddf }| |}| |}d}|dur[||j}|jdkrKt	j
||}nt	j
|d|d|}|sntdd |||j|j|jfD S t|||j|j|jd	S )
u
  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
        >>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: person
        ```NTrl  r   r   rV   c                 s   r   r~   r$   r   r$   r$   r%   r   ?      z6LukeForEntityClassification.forward.<locals>.<genexpr>r(   r+   r-   r   r.   )rS   rC  r  r   rP   rs  r[   rY   ndimr   r   cross_entropy binary_cross_entropy_with_logitsr   r{   r#   r-   r   r.   r0   rR   ra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   feature_vectorr+   r(   r$   r$   r%   rf     sH   >


 z#LukeForEntityClassification.forwardNNNNNNNNNNNNNN)r   r   r   rB   r   r   r    r   r!   rT  r   r#   r0   rf   rk   r$   r$   rT   r%   rp    `    	

rp  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
    tokens) for entity pair classification tasks, such as TACRED.
    c                "       s   e Zd Z fddZe														ddeej deej deej deej deej d	eej d
eej deej deej deej deej dee	 dee	 dee	 de
eef fddZ  ZS )LukeForEntityPairClassificationc                    sP   t  | t|| _|j| _t|j| _t	|j
d |jd| _|   d S )Nr   Frq  rQ   rT   r$   r%   rB   U  s   
z(LukeForEntityPairClassification.__init__Nra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   c                 C   s  |dur|n| j j}| j|||||||||	|
||dd}tj|jdddddf |jdddddf gdd}| |}| |}d}|durl||j	}|j
dkr\tj||}ntj|d|d|}|stdd	 |||j|j|jfD S t|||j|j|jd
S )u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
            used for the single-label classification. In this case, labels should contain the indices that should be in
            `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
            loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
            and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntityPairClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
        >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

        >>> text = "Beyoncé lives in Los Angeles."
        >>> entity_spans = [
        ...     (0, 7),
        ...     (17, 28),
        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: per:cities_of_residence
        ```NTrl  r   r   rw   rV   c                 s   r   r~   r$   r   r$   r$   r%   r     rt  z:LukeForEntityPairClassification.forward.<locals>.<genexpr>ru  )rS   rC  r  r    r   r   rP   rs  r[   rY   rv  r   r   rw  rx  r   r{   r#   r-   r   r.   r4   ry  r$   r$   r%   rf   a  sL   A0


 z'LukeForEntityPairClassification.forwardr{  )r   r   r   rB   r   r   r    r   r!   rT  r   r#   r4   rf   rk   r$   r$   rT   r%   r}  N  r|  r}  z
    The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
    such as named entity recognition.
    c                &       s   e Zd Z fddZe																ddeej deej deej deej deej d	eej d
eej deej deej deej deej deej deej dee	 dee	 dee	 de
eef f"ddZ  ZS )LukeForEntitySpanClassificationc                    sN   t  | t|| _|j| _t|j| _t	|j
d |j| _|   d S )Nr	   rq  rQ   rT   r$   r%   rB     s   
z(LukeForEntitySpanClassification.__init__Nra   r   rb   r`   rt   r?  r@  rA  entity_start_positionsentity_end_positionsr   rc   rj  r   r  r  r   c                 C   s  |dur|n| j j}| j||||||||||||dd}|jd}|	ddd|}	|	j|jjkr:|	|jj}	t	
|jd|	}|
ddd|}
|
j|jjkrZ|
|jj}
t	
|jd|
}t	j|||jgdd}| |}| |}d}|dur||j}|jdkrtj|d| j|d}ntj|d|d|}|stdd	 |||j|j|jfD S t|||j|j|jd
S )u  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        entity_start_positions (`torch.LongTensor`):
            The start positions of entities in the word token sequence.
        entity_end_positions (`torch.LongTensor`):
            The end positions of entities in the word token sequence.
        labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
            Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
            entropy loss is used for the single-label classification. In this case, labels should contain the indices
            that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length,
            num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
            labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LukeForEntitySpanClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
        >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

        >>> text = "Beyoncé lives in Los Angeles"
        # List all possible entity spans in the text

        >>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
        >>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
        >>> entity_spans = []
        >>> for i, start_pos in enumerate(word_start_positions):
        ...     for end_pos in word_end_positions[i:]:
        ...         entity_spans.append((start_pos, end_pos))

        >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
        >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
        ...     if predicted_class_idx != 0:
        ...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
        Beyoncé PER
        Los Angeles LOC
        ```NTrl  rV   rv   r   rw   c                 s   r   r~   r$   r   r$   r$   r%   r   e  rt  z:LukeForEntitySpanClassification.forward.<locals>.<genexpr>ru  )rS   rC  r  r  r]   rh   ri   rY   r[   r    gatherr   r   rP   rs  rv  r   r   rw  r   rr  rx  r{   r#   r-   r   r.   r5   )rR   ra   r   rb   r`   rt   r?  r@  rA  r  r  r   rc   rj  r   r  r  r   rE   start_states
end_statesrz  r+   r(   r$   r$   r%   rf     sZ   O


  z'LukeForEntitySpanClassification.forward)NNNNNNNNNNNNNNNN)r   r   r   rB   r   r   r    r   r!   rT  r   r#   r5   rf   rk   r$   r$   rT   r%   r~    sl    	

r~  z
    The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                "       ro  )LukeForSequenceClassificationc                    sZ   t  | |j| _t|| _t|jd ur|jn|j| _	t
|j|j| _|   d S r~   rA   rB   rr  r-  r  r   rN   classifier_dropoutrO   rP   rr   rE   rs  r1  rQ   rT   r$   r%   rB   {  s   
z&LukeForSequenceClassification.__init__Nra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   c                 C   s  |dur|n| j j}| j|||||||||	|
||dd}|j}| |}| |}d}|dur||j}| j jdu r_| j	dkrEd| j _n| j	dkr[|j
tjksV|j
tjkr[d| j _nd| j _| j jdkr}t }| j	dkrw|| | }n+|||}n%| j jdkrt }||d| j	|d}n| j jdkrt }|||}|std	d
 |||j|j|jfD S t|||j|j|jdS )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTrl  r   
regressionsingle_label_classificationmulti_label_classificationrV   c                 s   r   r~   r$   r   r$   r$   r%   r     rt  z8LukeForSequenceClassification.forward.<locals>.<genexpr>ru  )rS   rC  r  rB  rP   rs  r[   rY   problem_typerr  rX   r    r_   r   r   squeezer   r   r   r#   r-   r   r.   r6   )rR   ra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   r  r+   r(   loss_fctr$   r$   r%   rf     sd   +



"


z%LukeForSequenceClassification.forwardr{  )r   r   r   rB   r   r   r    r   r!   rT  r   r#   r6   rf   rk   r$   r$   rT   r%   r  t  r|  r  z
    The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
    solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
    class.
    c                "       ro  )LukeForTokenClassificationc                    s^   t  | |j| _t|dd| _t|jd ur|jn|j| _	t
|j|j| _|   d S NF)r.  r  rQ   rT   r$   r%   rB     s   z#LukeForTokenClassification.__init__Nra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   c                 C   s   |dur|n| j j}| j|||||||||	|
||dd}|j}| |}| |}d}|durE||j}t }||	d| j
|	d}|sXtdd |||j|j|jfD S t|||j|j|jdS )aM  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        NTrl  rV   c                 s   r   r~   r$   r   r$   r$   r%   r   Q  rt  z5LukeForTokenClassification.forward.<locals>.<genexpr>ru  )rS   rC  r  r  rP   rs  r[   rY   r   r   rr  r#   r-   r   r.   r7   )rR   ra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   rO  r+   r(   r  r$   r$   r%   rf     sF   +

z"LukeForTokenClassification.forwardr{  )r   r   r   rB   r   r   r    r   r!   rT  r   r#   r7   rf   rk   r$   r$   rT   r%   r    s`    	

r  c                $       s   e Zd Z fddZe															ddeej deej deej deej deej d	eej d
eej deej deej deej deej deej dee	 dee	 dee	 de
eef f ddZ  ZS )LukeForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r  )
rA   rB   rr  r-  r  r   rr   rE   
qa_outputsr1  rQ   rT   r$   r%   rB   b  s
   z!LukeForQuestionAnswering.__init__Nra   r   rb   r`   rt   r?  r@  rA  r   rc   start_positionsend_positionsr   r  r  r   c                 C   sD  |dur|n| j j}| j|||||||||	|
||dd}|j}| |}|jddd\}}|d}|d}d}|dur|durt| dkrN|d}t| dkr[|d}|d}|	d| |	d| t
|d}|||}|||}|| d	 }|std
d ||||j|j|jfD S t||||j|j|jdS )a  
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        NTrl  r   rV   rw   r   )ignore_indexr   c                 s   r   r~   r$   r   r$   r$   r%   r     s    	z3LukeForQuestionAnswering.forward.<locals>.<genexpr>)r(   r9   r:   r-   r   r.   )rS   rC  r  r  r  splitr  lenr]   clamp_r   r#   r-   r   r.   r8   )rR   ra   r   rb   r`   rt   r?  r@  rA  r   rc   r  r  r   r  r  r   rO  r+   r9   r:   
total_lossignored_indexr  
start_lossend_lossr$   r$   r%   rf   m  sh   (








z LukeForQuestionAnswering.forwardrm  )r   r   r   rB   r   r   r    r   r!   rT  r   r#   r8   rf   rk   r$   r$   rT   r%   r  `  sf    	

r  c                "       ro  )LukeForMultipleChoicec                    sP   t  | t|| _t|jd ur|jn|j| _t	|j
d| _|   d S r   )rA   rB   r-  r  r   rN   r  rO   rP   rr   rE   rs  r1  rQ   rT   r$   r%   rB     s   
zLukeForMultipleChoice.__init__Nra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  r   c                 C   s  |dur|n| j j}|dur|jd n|
jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|
dure|
d|
d|
dnd}
|durt|d|dnd}|dur|d|dnd}|dur|d|dnd}|dur|d|d|dnd}| j|||||||||	|
||dd}|j}| |}| |}|d|}d}|dur|	|j
}t }|||}|stdd |||j|j|jfD S t|||j|j|jd	S )
a^  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
            Indices of entity tokens in the entity vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
        entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
            Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

            - 1 for entity tokens that are **not masked**,
            - 0 for entity tokens that are **masked**.
        entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
            Segment token indices to indicate first and second portions of the entity token inputs. Indices are
            selected in `[0, 1]`:

            - 0 corresponds to a *portion A* entity token,
            - 1 corresponds to a *portion B* entity token.
        entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
            Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rV   rv   Trl  c                 s   r   r~   r$   r   r$   r$   r%   r   d  r  z0LukeForMultipleChoice.forward.<locals>.<genexpr>ru  )rS   rC  rR  r   r]   r  rB  rP   rs  r[   rY   r   r#   r-   r   r.   r;   )rR   ra   r   rb   r`   rt   r?  r@  rA  r   rc   rj  r   r  r  num_choicesr   r  r+   reshaped_logitsr(   r  r$   r$   r%   rf     s~   C


zLukeForMultipleChoice.forwardr{  )r   r   r   rB   r   r   r    r   r!   rT  r   r#   r;   rf   rk   r$   r$   rT   r%   r    s`    	

r  )
rp  r}  r~  r  r  r  r  r`  r-  r  )Gr   r   dataclassesr   typingr   r   r    torch.utils.checkpointr   torch.nnr   r   r   activationsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_luker   
get_loggerr   loggerr   r&   r'   r0   r4   r5   r6   r7   r8   r;   r,  r<   rl   r   r   r   r   r   r   r   r  r  r  r  r-  rZ   rY  r`  rp  r}  r~  r  r  r  r  __all__r$   r$   r$   r%   <module>   s   
I+r04E ] |  xgv "