o
    i]                     @   s4  d Z ddlmZ ddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZmZ ddlmZmZmZ ddlmZ e e!Z"G dd dej#Z$		d=dej#dej%dej%dej%deej% de&de&deej% fddZ'G dd dej#Z(G dd  d ej#Z)G d!d" d"ej#Z*G d#d$ d$ej#Z+G d%d& d&ej#Z,G d'd( d(eZ-G d)d* d*ej#Z.eG d+d, d,eZ/eG d-d. d.e/Z0G d/d0 d0ej#Z1G d1d2 d2ej#Z2eG d3d4 d4e/Z3eed5d6G d7d8 d8eZ4ed9d6G d:d; d;e/Z5g d<Z6dS )>zPyTorch Splinter model.    )	dataclass)CallableOptionalUnionN)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringcan_return_tuplelogging   )SplinterConfigc                       s`   e Zd ZdZ fddZ				ddeej deej deej deej d	e	f
d
dZ
  ZS )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _d S )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr   selfconfig	__class__ k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/splinter/modeling_splinter.pyr"   +   s   
zSplinterEmbeddings.__init__N	input_idstoken_type_idsr   inputs_embedsreturnc           
      C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| |}|| }| jdkrS| 	|}	||	7 }| 
|}| |}|S )Nr   r   dtypedevicer    )sizer   r2   zeroslongrC   r'   r+   r   r)   r,   r0   )
r7   r=   r>   r   r?   input_shape
seq_lengthr+   
embeddingsr)   r;   r;   r<   forward<   s$   






zSplinterEmbeddings.forward)NNNN)__name__
__module____qualname____doc__r"   r   r2   
LongTensorFloatTensortuplerJ   __classcell__r;   r;   r9   r<   r   (   s$    r           modulequerykeyvalueattention_maskscalingr0   	head_maskc                 K   s   t ||dd| }	|d ur'|d d d d d d d |jd f }
|	|
 }	tjj|	dt jd|j	}	tjj
|	|| jd}	|d urM|	|dddd }	t |	|}|dd }||	fS )N   r   r   )dimrB   )ptrainingr   )r2   matmul	transposeshaper   
functionalsoftmaxfloat32torB   r0   r_   view
contiguous)rT   rU   rV   rW   rX   rY   r0   rZ   kwargsattn_weightscausal_maskattn_outputr;   r;   r<   eager_attention_forward^   s   &rm   c                       sZ   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
  ZS )SplinterSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t
|j| _|j| _| jd | _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )r!   r"   r%   num_attention_headshasattr
ValueErrorr8   intattention_head_sizeall_head_sizer   LinearrU   rV   rW   r.   attention_probs_dropout_probr0   attention_dropoutrY   r6   r9   r;   r<   r"   {   s"   

zSplinterSelfAttention.__init__NFhidden_statesrX   rZ   output_attentionsr@   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}	| ||dd}
t}| jj	dkrCt
| jj	 }|| ||	|
|f| jsOdn| j| j|d|\}}|jg |dR   }|rp||f}|S |f}|S )Nr   r   r[   eagerrS   )r0   rY   rZ   )rb   ru   rU   rg   ra   rV   rW   rm   r8   _attn_implementationr   r_   ry   rY   reshaperh   )r7   rz   rX   rZ   r{   ri   rG   hidden_shapequery_states
key_statesvalue_statesattention_interfacerl   rj   outputsr;   r;   r<   rJ      s4   	
zSplinterSelfAttention.forwardNNF)rK   rL   rM   r"   r2   Tensorr   rP   boolrQ   rJ   rR   r;   r;   r9   r<   rn   z   s     rn   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )SplinterSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r!   r"   r   rw   r%   denser,   r-   r.   r/   r0   r6   r9   r;   r<   r"         
zSplinterSelfOutput.__init__rz   input_tensorr@   c                 C   &   |  |}| |}| || }|S Nr   r0   r,   r7   rz   r   r;   r;   r<   rJ         

zSplinterSelfOutput.forwardrK   rL   rM   r"   r2   r   rJ   rR   r;   r;   r9   r<   r          $r   c                       sb   e Zd Z fddZdd Z			ddejdeej d	eej d
ee	 de
ej f
ddZ  ZS )SplinterAttentionc                    s*   t    t|| _t|| _t | _d S r   )r!   r"   rn   r7   r   outputsetpruned_headsr6   r9   r;   r<   r"      s   


zSplinterAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r]   )lenr   r7   rq   ru   r   r   rU   rV   rW   r   r   rv   union)r7   headsindexr;   r;   r<   prune_heads   s   zSplinterAttention.prune_headsNFrz   rX   rZ   r{   r@   c           	      K   s@   | j |f|||d|}| |d |}|f|dd   }|S N)rX   rZ   r{   r   r   )r7   r   )	r7   rz   rX   rZ   r{   ri   self_outputsattention_outputr   r;   r;   r<   rJ      s   zSplinterAttention.forwardr   )rK   rL   rM   r"   r   r2   r   r   rP   r   rQ   rJ   rR   r;   r;   r9   r<   r      s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )SplinterIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r!   r"   r   rw   r%   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnr6   r9   r;   r<   r"      s
   
zSplinterIntermediate.__init__rz   r@   c                 C   s   |  |}| |}|S r   )r   r   )r7   rz   r;   r;   r<   rJ      s   

zSplinterIntermediate.forwardr   r;   r;   r9   r<   r      s    r   c                       r   )SplinterOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r!   r"   r   rw   r   r%   r   r,   r-   r.   r/   r0   r6   r9   r;   r<   r"     r   zSplinterOutput.__init__rz   r   r@   c                 C   r   r   r   r   r;   r;   r<   rJ   	  r   zSplinterOutput.forwardr   r;   r;   r9   r<   r     r   r   c                       sb   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
dd Z  ZS )SplinterLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S )Nr   )
r!   r"   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r6   r9   r;   r<   r"     s   


zSplinterLayer.__init__NFrz   rX   rZ   r{   r@   c           
      K   sP   | j |f|||d|}|d }|dd  }t| j| j| j|}	|	f| }|S r   )r   r   feed_forward_chunkr   r   )
r7   rz   rX   rZ   r{   ri   self_attention_outputsr   r   layer_outputr;   r;   r<   rJ     s    
zSplinterLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )r7   r   intermediate_outputr   r;   r;   r<   r   3  s   
z SplinterLayer.feed_forward_chunkr   )rK   rL   rM   r"   r2   r   r   rP   r   rQ   rJ   r   rR   r;   r;   r9   r<   r     s"    
r   c                       sz   e Zd Z fddZe					ddejdeej deej d	ee	 d
ee	 dee	 de
eej ef fddZ  ZS )SplinterEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r;   )r   ).0ir8   r;   r<   
<listcomp>>  s    z,SplinterEncoder.__init__.<locals>.<listcomp>F)	r!   r"   r8   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr6   r9   r   r<   r"   ;  s   
 
zSplinterEncoder.__init__NFTrz   rX   rZ   r{   output_hidden_statesreturn_dictr@   c                 K   s   |rdnd }|r
dnd }	t | jD ].\}
}|r||f }|d ur$||
 nd }|d||||d|}|d }|r?|	|d f }	q|rG||f }t|||	dS )Nr;   )rz   rX   rZ   r{   r   r   last_hidden_staterz   
attentions)	enumerater   r   )r7   rz   rX   rZ   r{   r   r   ri   all_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputsr;   r;   r<   rJ   A  s2   

zSplinterEncoder.forward)NNFFT)rK   rL   rM   r"   r   r2   r   r   rP   r   r   rQ   r   rJ   rR   r;   r;   r9   r<   r   :  s.    	r   c                   @   s&   e Zd ZU eed< dZdZdd ZdS )SplinterPreTrainedModelr8   splinterTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsrS   )meanstdNg      ?)r   r   rw   weightdatanormal_r8   initializer_rangebiaszero_r#   r   r,   fill_)r7   rT   r;   r;   r<   _init_weightsq  s   

z%SplinterPreTrainedModel._init_weightsN)rK   rL   rM   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   r;   r;   r;   r<   r   k  s
   
 r   c                       s   e Zd ZdZ fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
dde
ej de
ej de
ej de
ej de
ej de
ej de
e de
e de
e deeef fddZ  ZS )SplinterModela2  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                    s2   t  | || _t|| _t|| _|   d S r   )r!   r"   r8   r   rI   r   encoder	post_initr6   r9   r;   r<   r"     s
   

zSplinterModel.__init__c                 C   s   | j jS r   rI   r'   )r7   r;   r;   r<   get_input_embeddings  s   z"SplinterModel.get_input_embeddingsc                 C   s   || j _d S r   r   )r7   rW   r;   r;   r<   set_input_embeddings  s   z"SplinterModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r7   heads_to_pruner   r   r;   r;   r<   _prune_heads  s   zSplinterModel._prune_headsNr=   rX   r>   r   rZ   r?   r{   r   r   r@   c
                 C   sB  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	|dur*|dur*td|dur9| || | }
n|durF| dd }
ntd|
\}}|durU|jn|j}|du retj	||f|d}|du rrtj
|
tj|d}| ||
}| || j j}| j||||d}| j|||||dd	}|d
 }t||j|jdS )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)rC   rA   )r=   r   r>   r?   T)rX   rZ   r{   r   r   r   r   )r8   r{   r   use_return_dictrs   %warn_if_padding_and_no_attention_maskrD   rC   r2   onesrE   rF   get_extended_attention_maskget_head_maskr   rI   r   r   rz   r   )r7   r=   rX   r>   r   rZ   r?   r{   r   r   rG   
batch_sizerH   rC   extended_attention_maskembedding_outputencoder_outputssequence_outputr;   r;   r<   rJ     sP   
zSplinterModel.forward)	NNNNNNNNN)rK   rL   rM   rN   r"   r   r   r   r   r   r   r2   r   r   r   rQ   r   rJ   rR   r;   r;   r9   r<   r     sL    
	

r   c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	SplinterFullyConnectedLayergeluc                    sD   t    || _|| _t| j| j| _t| | _t	| j| _	d S r   )
r!   r"   	input_dim
output_dimr   rw   r   r	   act_fnr,   )r7   r   r   r   r9   r;   r<   r"     s   

z$SplinterFullyConnectedLayer.__init__inputsr@   c                 C   s"   |  |}| |}| |}|S r   )r   r   r,   )r7   r   rz   r;   r;   r<   rJ     s   


z#SplinterFullyConnectedLayer.forward)r   r   r;   r;   r9   r<   r     s    
r   c                       s(   e Zd ZdZ fddZdd Z  ZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    sz   t    t|j|j| _t|j|j| _t|j|j| _t|j|j| _tj	|j|jdd| _
tj	|j|jdd| _d S )NF)r   )r!   r"   r   r%   query_start_transformquery_end_transformstart_transformend_transformr   rw   start_classifierend_classifierr6   r9   r;   r<   r"     s   
z'QuestionAwareSpanSelectionHead.__init__c                 C   s   |  \}}}|ddd|}tj|d|d}| |}| |}| |}	| |}
| 	|}|	
ddd}	t||	}| |}|

ddd}
t||
}||fS )Nr   r   )r]   r   r   r[   )rD   	unsqueezerepeatr2   gatherr   r   r   r   r   permuter`   r   )r7   r   	positions_r]   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsrz   start_logits
end_logitsr;   r;   r<   rJ     s   





z&QuestionAwareSpanSelectionHead.forward)rK   rL   rM   rN   r"   rJ   rR   r;   r;   r9   r<   r     s    r   c                       s   e Zd Z fddZe												ddeej deej deej deej deej d	eej d
eej deej dee	 dee	 dee	 deej de
eef fddZ  ZS )SplinterForQuestionAnsweringc                    4   t  | t|| _t|| _|j| _|   d S r   r!   r"   r   r   r   splinter_qassquestion_token_idr   r6   r9   r;   r<   r"   3  
   

z%SplinterForQuestionAnswering.__init__Nr=   rX   r>   r   rZ   r?   start_positionsend_positionsr{   r   r   question_positionsr@   c                 C   s  |dur|n| j j}d}|du r9|dur#tjt|| j dd}ntj|dtj	|j
|jd}|d}d}| j|||||||	|
|d	}|d }| ||\}}|r`|d	|d	}}|dur~|d	| t|jj  }|d	| t|jj  }d}|dur|durt| d	kr|d}t| d	kr|d}|d	}|d| |d| t|d
}|||}|||}|| d }|s||f|d	d  }|dur|f| S |S t||||j|jdS )a  
        token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr   r   r   )rB   layoutrC   TrX   r>   r   rZ   r?   r{   r   r   r   ignore_indexr[   lossr  r  rz   r   )r8   r   r2   argmaxeqr  rt   rE   rD   rF   r  rC   r   r   r  squeezefinforB   minr   clamp_r   r   rz   r   )r7   r=   rX   r>   r   rZ   r?   r	  r
  r{   r   r   r  question_positions_were_none"question_position_for_each_exampler   r   r  r  
total_lossignored_indexloss_fct
start_lossend_lossr   r;   r;   r<   rJ   =  sj   $






z$SplinterForQuestionAnswering.forwardNNNNNNNNNNNN)rK   rL   rM   r"   r   r   r2   r   rO   r   r   rQ   r   rJ   rR   r;   r;   r9   r<   r  1  sT    
	

r  zB
    Class for outputs of Splinter as a span selection model.
    )custom_introc                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )SplinterForPreTrainingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
    start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-start scores (before SoftMax).
    end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
        Span-end scores (before SoftMax).
    Nr  r  r  rz   r   )rK   rL   rM   rN   r  r   r2   rP   r   r  r  rz   rQ   r   r;   r;   r;   r<   r!    s   
 	r!  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       s   e Zd Z fddZe												ddeej deej deej deej deej d	eej d
eej deej dee	 dee	 dee	 deej de
eef fddZdejdejfddZ  ZS )SplinterForPreTrainingc                    r  r   r  r6   r9   r;   r<   r"     r  zSplinterForPreTraining.__init__Nr=   rX   r>   r   rZ   r?   r	  r
  r{   r   r   r  r@   c                 C   s  |dur|n| j j}|du r|dur|durtd|du r&|du r&td|du r/| |}| j|||||||	|
|d	}|d }| \}}}| ||\}}|d}|dur}|d|||}|d| t	
|jj  }|d| t	
|jj  }d}|dur|dur|dtd|d  |dtd|d  t| j jd}|||| |||| }|||| |||| }|| d }|s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `batch_size, num_questions, sequence_length`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedr  r   r   r  r[   r  )r8   r   	TypeError_prepare_question_positionsr   rD   r  r   r4   r2   r  rB   r  r  maxr   r&   rg   r!  rz   r   )r7   r=   rX   r>   r   rZ   r?   r	  r
  r{   r   r   r  r   r   r   sequence_lengthr]   r  r  num_questions attention_mask_for_each_questionr  r  r  r  r   r;   r;   r<   rJ     sh   7


zSplinterForPreTraining.forwardc                 C   sl   t || jjk\}}t |}t j|d| f| jjt j	|j
d}t dd |D }||||f< |S )Nr   rA   c                 S   s   g | ]}t |qS r;   )r2   r3   )r   nr;   r;   r<   r   S  s    zFSplinterForPreTraining._prepare_question_positions.<locals>.<listcomp>)r2   wherer8   r  bincountfullrD   r%  r&   rF   rC   cat)r7   r=   rowsflat_positionsr'  r   colsr;   r;   r<   r$  J  s   
z2SplinterForPreTraining._prepare_question_positionsr  )rK   rL   rM   r"   r   r   r2   r   rO   r   r   rQ   r!  rJ   r$  rR   r;   r;   r9   r<   r"    sV    
	

|r"  )r  r"  r   r   r   )rS   N)7rN   dataclassesr   typingr   r   r   r2   r   torch.nnr   activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   configuration_splinterr   
get_loggerrK   loggerModuler   r   floatrm   rn   r   r   r   r   r   r   r   r   r   r   r  r!  r"  __all__r;   r;   r;   r<   <module>   sx   
=
;.)1v&r 