o
    eit                     @   s8  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z" e#e$Z%G dd dej&Z'	d>dej&dej(dej(dej(dej(dB de)de)fddZ*G dd dej&Z+G d d! d!ej&Z,G d"d# d#ej&Z-G d$d% d%ej&Z.G d&d' d'ej&Z/G d(d) d)eZ0G d*d+ d+ej&Z1eG d,d- d-eZ2eG d.d/ d/e2Z3G d0d1 d1ej&Z4G d2d3 d3ej&Z5eG d4d5 d5e2Z6eed6d7G d8d9 d9eZ7ed:d7G d;d< d<e2Z8g d=Z9dS )?zPyTorch Splinter model.    )Callable)	dataclassN)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )SplinterConfigc                       s`   e Zd ZdZ fddZ				ddejdB dejdB dejdB dejdB d	ef
d
dZ	  Z
S )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd d S )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/splinter/modeling_splinter.pyr!   .   s   

zSplinterEmbeddings.__init__N	input_idstoken_type_idsr   inputs_embedsreturnc           
      C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| |}|| }| |}	||	7 }| 	|}| 
|}|S )Nr   r   dtypedevice)sizer   r1   zeroslongrA   r&   r*   r(   r+   r/   )
r5   r;   r<   r   r=   input_shape
seq_lengthr*   
embeddingsr(   r9   r9   r:   forward<   s"   





zSplinterEmbeddings.forward)NNNN)__name__
__module____qualname____doc__r!   r1   
LongTensorFloatTensortuplerH   __classcell__r9   r9   r7   r:   r   +   s$    r           modulequerykeyvalueattention_maskscalingr/   c           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )N   r   r   )dimr@   )ptrainingr   )r1   matmul	transposer   
functionalsoftmaxfloat32tor@   r/   r[   
contiguous)
rR   rS   rT   rU   rV   rW   r/   kwargsattn_weightsattn_outputr9   r9   r:   eager_attention_forward^   s   
rf   c                       V   e Zd Z fddZ		ddejdejdB dedB dee	 d	e
ej f
d
dZ  ZS )SplinterSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t
|j| _|j| _| jd | _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )r    r!   r$   num_attention_headshasattr
ValueErrorr6   intattention_head_sizeall_head_sizer   LinearrS   rT   rU   r-   attention_probs_dropout_probr/   attention_dropoutrW   r4   r7   r9   r:   r!   v   s"   

zSplinterSelfAttention.__init__NFhidden_statesrV   output_attentionsrc   r>   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}	t| j	j
t}
|
| |||	|f| jsIdn| j| jd|\}}|jg |dR   }|ri||f}|S |f}|S )Nr   r   rX   rQ   )r/   rW   )shapero   rS   viewr]   rT   rU   r   get_interfacer6   _attn_implementationrf   r[   rs   rW   reshaperb   )r5   rt   rV   ru   rc   rE   hidden_shapequery_states
key_statesvalue_statesattention_interfacere   rd   outputsr9   r9   r:   rH      s2   
zSplinterSelfAttention.forwardNFrI   rJ   rK   r!   r1   TensorrN   boolr   r   rO   rH   rP   r9   r9   r7   r:   rh   u   s    rh   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )SplinterSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r    r!   r   rq   r$   denser+   r,   r-   r.   r/   r4   r7   r9   r:   r!         
zSplinterSelfOutput.__init__rt   input_tensorr>   c                 C   &   |  |}| |}| || }|S Nr   r/   r+   r5   rt   r   r9   r9   r:   rH         

zSplinterSelfOutput.forwardrI   rJ   rK   r!   r1   r   rH   rP   r9   r9   r7   r:   r          $r   c                       rg   )SplinterAttentionc                    s"   t    t|| _t|| _d S r   )r    r!   rh   r5   r   outputr4   r7   r9   r:   r!      s   

zSplinterAttention.__init__NFrt   rV   ru   rc   r>   c                 K   s>   | j |f||d|}| |d |}|f|dd   }|S N)rV   ru   r   r   )r5   r   )r5   rt   rV   ru   rc   self_outputsattention_outputr   r9   r9   r:   rH      s   zSplinterAttention.forwardr   r   r9   r9   r7   r:   r      s    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )SplinterIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r    r!   r   rq   r$   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr4   r7   r9   r:   r!      s
   
zSplinterIntermediate.__init__rt   r>   c                 C   s   |  |}| |}|S r   )r   r   )r5   rt   r9   r9   r:   rH      s   

zSplinterIntermediate.forwardr   r9   r9   r7   r:   r      s    r   c                       r   )SplinterOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r    r!   r   rq   r   r$   r   r+   r,   r-   r.   r/   r4   r7   r9   r:   r!      r   zSplinterOutput.__init__rt   r   r>   c                 C   r   r   r   r   r9   r9   r:   rH      r   zSplinterOutput.forwardr   r9   r9   r7   r:   r      r   r   c                       s^   e Zd Z fddZ		ddejdejdB dedB dee	 d	e
ej f
d
dZdd Z  ZS )SplinterLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S )Nr   )
r    r!   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r4   r7   r9   r:   r!      s   


zSplinterLayer.__init__NFrt   rV   ru   rc   r>   c           	      K   sN   | j |f||d|}|d }|dd  }t| j| j| j|}|f| }|S r   )r   r   feed_forward_chunkr   r   )	r5   rt   rV   ru   rc   self_attention_outputsr   r   layer_outputr9   r9   r:   rH      s   
zSplinterLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )r5   r   intermediate_outputr   r9   r9   r:   r     s   
z SplinterLayer.feed_forward_chunkr   )rI   rJ   rK   r!   r1   r   rN   r   r   r   rO   rH   r   rP   r9   r9   r7   r:   r      s     
r   c                       sr   e Zd Z fddZe				ddejdejdB dedB d	edB d
edB de	e
 deej eB fddZ  ZS )SplinterEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r9   )r   ).0ir6   r9   r:   
<listcomp>   s    z,SplinterEncoder.__init__.<locals>.<listcomp>F)	r    r!   r6   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr4   r7   r   r:   r!     s   
 
zSplinterEncoder.__init__NFTrt   rV   ru   output_hidden_statesreturn_dictrc   r>   c                 K   s   |rdnd }|r
dnd }t | jD ]"\}	}
|r||f }|
|||fi |}|d }|r3||d f }q|r;||f }t|||dS )Nr9   r   r   last_hidden_statert   
attentions)	enumerater   r
   )r5   rt   rV   ru   r   r   rc   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputsr9   r9   r:   rH   #  s.   


zSplinterEncoder.forward)NFFT)rI   rJ   rK   r!   r   r1   r   rN   r   r   r   rO   r
   rH   rP   r9   r9   r7   r:   r     s,    r   c                       s.   e Zd ZU eed< dZdZ fddZ  ZS )SplinterPreTrainedModelr6   splinterTc                    s@   t  | t|trt|jt|jj	d 
d d S d S )Nr   r   )r    _init_weightsr   r   initcopy_r   r1   r2   rv   r3   )r5   rR   r7   r9   r:   r   O  s   
&z%SplinterPreTrainedModel._init_weights)	rI   rJ   rK   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   rP   r9   r9   r7   r:   r   I  s
   
 r   c                       s   e Zd ZdZ fddZdd Zdd Zee								dd	e	j
dB d
e	j
dB de	j
dB de	j
dB de	j
dB dedB dedB dedB deeB fddZ  ZS )SplinterModela2  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                    s2   t  | || _t|| _t|| _|   d S r   )r    r!   r6   r   rG   r   encoder	post_initr4   r7   r9   r:   r!   ]  s
   

zSplinterModel.__init__c                 C   s   | j jS r   rG   r&   )r5   r9   r9   r:   get_input_embeddingsg  s   z"SplinterModel.get_input_embeddingsc                 C   s   || j _d S r   r   )r5   rU   r9   r9   r:   set_input_embeddingsj  s   z"SplinterModel.set_input_embeddingsNr;   rV   r<   r   r=   ru   r   r   r>   c	                 K   s0  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|dur*td|dur9| || | }
n|durF| dd }
ntd|
\}}|durU|jn|j}|du retj	||f|d}|du rrtj
|
tj|d}| ||
}| j||||d}| j||||dd	}|d
 }t||j|jdS )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)rA   r?   )r;   r   r<   r=   T)rV   ru   r   r   r   r   )r6   ru   r   use_return_dictrm   %warn_if_padding_and_no_attention_maskrB   rA   r1   onesrC   rD   get_extended_attention_maskrG   r   r
   rt   r   )r5   r;   rV   r<   r   r=   ru   r   r   rc   rE   
batch_sizerF   rA   extended_attention_maskembedding_outputencoder_outputssequence_outputr9   r9   r:   rH   m  sL   
zSplinterModel.forward)NNNNNNNN)rI   rJ   rK   rL   r!   r   r   r   r   r1   r   r   rO   r
   rH   rP   r9   r9   r7   r:   r   U  sD    
	r   c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	SplinterFullyConnectedLayergeluc                    sD   t    || _|| _t| j| j| _t| | _t	| j| _	d S r   )
r    r!   	input_dim
output_dimr   rq   r   r   act_fnr+   )r5   r   r   r   r7   r9   r:   r!     s   

z$SplinterFullyConnectedLayer.__init__inputsr>   c                 C   s"   |  |}| |}| |}|S r   )r   r   r+   )r5   r   rt   r9   r9   r:   rH     s   


z#SplinterFullyConnectedLayer.forward)r   r   r9   r9   r7   r:   r     s    
r   c                       s(   e Zd ZdZ fddZdd Z  ZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    sz   t    t|j|j| _t|j|j| _t|j|j| _t|j|j| _tj	|j|jdd| _
tj	|j|jdd| _d S )NF)bias)r    r!   r   r$   query_start_transformquery_end_transformstart_transformend_transformr   rq   start_classifierend_classifierr4   r7   r9   r:   r!     s   
z'QuestionAwareSpanSelectionHead.__init__c                 C   s   |  \}}}|ddd|}tj|d|d}| |}| |}| |}	| |}
| 	|}|	
ddd}	t||	}| |}|

ddd}
t||
}||fS )Nr   r   )rY   indexr   rX   )rB   	unsqueezerepeatr1   gatherr   r   r   r   r   permuter\   r   )r5   r   	positions_rY   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsrt   start_logits
end_logitsr9   r9   r:   rH     s   





z&QuestionAwareSpanSelectionHead.forward)rI   rJ   rK   rL   r!   rH   rP   r9   r9   r7   r:   r     s    r   c                       s   e Zd Z fddZe											ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB dejdB de	e
B fddZ  ZS )SplinterForQuestionAnsweringc                    4   t  | t|| _t|| _|j| _|   d S r   r    r!   r   r   r   splinter_qassquestion_token_idr   r4   r7   r9   r:   r!     
   

z%SplinterForQuestionAnswering.__init__Nr;   rV   r<   r   r=   start_positionsend_positionsru   r   r   question_positionsr>   c              
   K   s  |
dur|
n| j j}
d}|du r9|dur#tjt|| j dd}ntj|dtj	|j
|jd}|d}d}| j|||||||	|
d}|d }| ||\}}|r_|d	|d	}}|dur}|d	| t|jj  }|d	| t|jj  }d}|dur|durt| d	kr|d}t| d	kr|d}|d	}|d| |d| t|d
}|||}|||}|| d }|
s||f|d	d  }|dur|f| S |S t||||j|jdS )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr   )rY   r   )r@   layoutrA   TrV   r<   r   r=   ru   r   r   r   ignore_indexrX   lossr   r   rt   r   )r6   r   r1   argmaxeqr   rn   rC   rB   rD   r   rA   r   r   r   squeezefinfor@   minlenclamp_r   r   rt   r   )r5   r;   rV   r<   r   r=   r   r   ru   r   r   r   rc   question_positions_were_none"question_position_for_each_exampler   r   r   r   
total_lossignored_indexloss_fct
start_lossend_lossr   r9   r9   r:   rH      sh   $






z$SplinterForQuestionAnswering.forwardNNNNNNNNNNN)rI   rJ   rK   r!   r   r1   r   rM   r   rO   r   rH   rP   r9   r9   r7   r:   r     sN    
	
r   zB
    Class for outputs of Splinter as a span selection model.
    )custom_introc                   @   st   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eej dB ed< dZeej dB ed< dS )SplinterForPreTrainingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-end scores (before SoftMax).
    Nr   r   r   rt   r   )rI   rJ   rK   rL   r   r1   rN   r   r   r   rt   rO   r   r9   r9   r9   r:   r  f  s   
 	r  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       s   e Zd Z fddZe											ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB dejdB de	e
B fddZdejdejfddZ  ZS )SplinterForPreTrainingc                    r   r   r   r4   r7   r9   r:   r!     r   zSplinterForPreTraining.__init__Nr;   rV   r<   r   r=   r   r   ru   r   r   r   r>   c              
   K   s  |
dur|
n| j j}
|du r|dur|durtd|du r&|du r&td|du r/| |}| j|||||||	|
d}|d }| \}}}| ||\}}|d}|dur||d|||}|d| t	
|jj  }|d| t	
|jj  }d}|dur|dur|dtd|d  |dtd|d  t| j jd}|||| |||| }|||| |||| }|| d }|
s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz?question_positions must be specified when inputs_embeds is usedr   r   r   r   rX   r   )r6   r   	TypeError_prepare_question_positionsr   rB   r   r   r3   r1   r   r@   r  r  maxr   r%   rw   r  rt   r   )r5   r;   rV   r<   r   r=   r   r   ru   r   r   r   rc   r   r   r   sequence_lengthrY   r   r   num_questions attention_mask_for_each_questionr  r  r	  r
  r   r9   r9   r:   rH     sf   7


zSplinterForPreTraining.forwardc                 C   s   t || jjk\}}t |}t j|d| f| jjt j	|j
d}t|d|dkd t dd |D }||||f< |S )Nr   r?   z?All samples in the batch must have at least one question token.c                 S   s   g | ]}t |qS r9   )r1   r2   )r   nr9   r9   r:   r     s    zFSplinterForPreTraining._prepare_question_positions.<locals>.<listcomp>)r1   wherer6   r   bincountfullrB   r  r%   rD   rA   r   cat)r5   r;   rowsflat_positionsr  r   colsr9   r9   r:   r    s   
z2SplinterForPreTraining._prepare_question_positionsr  )rI   rJ   rK   r!   r   r1   r   rM   r   rO   r  rH   r  rP   r9   r9   r7   r:   r  }  sP    
	
{r  )r   r  r   r   r   )rQ   ):rL   collections.abcr   dataclassesr   r1   r   torch.nnr    r   r   activationsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_splinterr   
get_loggerrI   loggerModuler   r   floatrf   rh   r   r   r   r   r   r   r   r   r   r   r   r  r  __all__r9   r9   r9   r:   <module>   sv   
:
9'-f&q 