o
    ei`                     @   st  d Z ddlZddlZddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z! e"e#Z$g dZ%eeddG dd deZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd  d ej'Z-G d!d" d"ej'Z.G d#d$ d$ej'Z/G d%d& d&eZ0G d'd( d(ej'Z1G d)d* d*ej'Z2G d+d, d,ej'Z3G d-d. d.ej'Z4G d/d0 d0ej'Z5eG d1d2 d2eZ6eG d3d4 d4e6Z7ed5dG d6d7 d7e6Z8eG d8d9 d9e6Z9eG d:d; d;e6Z:eG d<d= d=e6Z;g d>Z<dS )?zPyTorch CANINE model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )CanineConfig)   +   ;   =   I   a   g   q                           a  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.
    )custom_introc                   @   sb   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dS )CanineModelOutputWithPoolinga  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
        shallow Transformer encoder).
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
        Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
        weights are trained from the next sentence prediction (classification) objective during pretraining.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
        encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
        config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
        initial input to each Transformer encoder. The hidden states of the shallow encoders have length
        `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
        `config.downsampling_rate`.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
        num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
        config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
        attention softmax, used to compute the weighted average in the self-attention heads.
    Nlast_hidden_statepooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r)   torchFloatTensor__annotations__r*   r+   tupler,    r5   r5   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/canine/modeling_canine.pyr(   0   s   
 	r(   c                       s   e Zd ZdZ fddZdedefddZdededefd	d
Z				ddej	dB dej	dB dej	dB dej
dB dej
f
ddZ  ZS )CanineEmbeddingsz<Construct the character, position and token_type embeddings.c                    s   t    || _|j|j }t|jD ]}d| }t| |t|j	| qt|j	|j| _
t|j|j| _tj|j|jd| _t|j| _| jdt|jddd d S )NHashBucketCodepointEmbedder_epsposition_idsr   F)
persistent)super__init__confighidden_sizenum_hash_functionsrangesetattrr   	Embeddingnum_hash_bucketschar_position_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr1   arangemax_position_embeddingsexpand)selfrA   shard_embedding_sizeiname	__class__r5   r6   r@   X   s   


zCanineEmbeddings.__init__
num_hashesnum_bucketsc                 C   sV   |t tkrtdt t td| }g }|D ]}|d | | }|| q|S )a  
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
        z`num_hashes` must be <= Nr   )len_PRIMES
ValueErrorappend)rT   	input_idsrZ   r[   primesresult_tensorsprimehashedr5   r5   r6   _hash_bucket_tensorsm   s   z%CanineEmbeddings._hash_bucket_tensorsembedding_sizec                 C   sx   || dkrt d| d| d| j|||d}g }t|D ]\}}d| }	t| |	|}
||
 qtj|ddS )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)rZ   r[   r8   r=   dim)r^   re   	enumerategetattrr_   r1   cat)rT   r`   rf   rZ   r[   hash_bucket_tensorsembedding_shardsrV   hash_bucket_idsrW   shard_embeddingsr5   r5   r6   _embed_hash_buckets   s   
z$CanineEmbeddings._embed_hash_bucketsNr`   token_type_idsr;   inputs_embedsreturnc           
      C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u rE| || jj| jj	| jj
}| |}|| }| |}	||	7 }| |}| |}|S )Nr=   r   dtypedevice)sizer;   r1   zeroslongrv   rp   rA   rB   rC   rG   rJ   rH   rK   rO   )
rT   r`   rq   r;   rr   input_shape
seq_lengthrJ   
embeddingsposition_embeddingsr5   r5   r6   forward   s&   




zCanineEmbeddings.forward)NNNN)r-   r.   r/   r0   r@   intre   rp   r1   
LongTensorr2   r~   __classcell__r5   r5   rX   r6   r7   U   s(    r7   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )CharactersToMoleculeszeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                    sJ   t    tj|j|j|j|jd| _t|j | _	tj
|j|jd| _
d S )Nin_channelsout_channelskernel_sizestrider9   )r?   r@   r   Conv1drB   downsampling_rateconvr	   
hidden_act
activationrK   rL   rT   rA   rX   r5   r6   r@      s   
zCharactersToMolecules.__init__char_encodingrs   c                 C   s   |d d ddd d f }t |dd}| |}t |dd}| |}|d d ddd d f }t j||gdd}| |}|S )Nr   r      r=   rg   )r1   	transposer   r   rk   rK   )rT   r   cls_encodingdownsampleddownsampled_truncatedresultr5   r5   r6   r~      s   


zCharactersToMolecules.forward	r-   r.   r/   r0   r@   r1   Tensorr~   r   r5   r5   rX   r6   r      s    r   c                       sD   e Zd ZdZ fddZ	d
dejdejdB dejfdd	Z  ZS )ConvProjectionz
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    c                    s`   t    || _tj|jd |j|jdd| _t|j	 | _
tj|j|jd| _t|j| _d S )Nr   r   r   r9   )r?   r@   rA   r   r   rB   upsampling_kernel_sizer   r	   r   r   rK   rL   rM   rN   rO   r   rX   r5   r6   r@      s   
zConvProjection.__init__Ninputsfinal_seq_char_positionsrs   c           
      C   s   t |dd}| jjd }|d }|| }t||fd}| ||}t |dd}| |}| |}| 	|}|}|d urDt
d|}	|	S )Nr   r   r   z,CanineForMaskedLM is currently not supported)r1   r   rA   r   r   ConstantPad1dr   r   rK   rO   NotImplementedError)
rT   r   r   	pad_totalpad_begpad_endpadr   final_char_seq	query_seqr5   r5   r6   r~      s   


zConvProjection.forwardNr   r5   r5   rX   r6   r      s    r   c                       s^   e Zd Z fddZ		ddejdejdejdB dedB d	eejejdB f f
d
dZ	  Z
S )CanineSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _d S )Nr   rf   zThe hidden size (z6) is not a multiple of the number of attention heads ())r?   r@   rB   num_attention_headshasattrr^   r   attention_head_sizeall_head_sizer   LinearquerykeyvaluerM   attention_probs_dropout_probrO   r   rX   r5   r6   r@     s   

zCanineSelfAttention.__init__NFfrom_tensor	to_tensorattention_maskoutput_attentionsrs   c                 C   sN  |j \}}}| ||d| j| jdd}| ||d| j| jdd}	| ||d| j| jdd}
t	|
|dd}|t
| j }|d url|jdkrhtj|dd}d|  t|jj }|| }tjj|dd}| |}t	||	}|dddd }| d d | jf }|j| }|r||f}|S |f}|S )	Nr=   r   r   r   rg   g      ?r   )shaper   viewr   r   r   r   r   r1   matmulmathsqrtndim	unsqueezefloatfinforu   minr   
functionalsoftmaxrO   permute
contiguousrw   r   )rT   r   r   r   r   
batch_sizer{   _	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr5   r5   r6   r~   -  s<   


zCanineSelfAttention.forwardNF)r-   r.   r/   r@   r1   r   r2   boolr4   r~   r   r5   r5   rX   r6   r     s    r   c                       sF   e Zd Z fddZdeej dejdeejejf fddZ  ZS )CanineSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr9   )r?   r@   r   r   rB   denserK   rL   rM   rN   rO   r   rX   r5   r6   r@   l     
zCanineSelfOutput.__init__r+   input_tensorrs   c                 C   &   |  |}| |}| || }|S r   r   rO   rK   rT   r+   r   r5   r5   r6   r~   r  s   

zCanineSelfOutput.forward	r-   r.   r/   r@   r4   r1   r2   r~   r   r5   r5   rX   r6   r   k  s    r   c                       s   e Zd ZdZ							ddededededed	ef fd
dZ		ddeej	 dej	dB dedB deej	ej	dB f fddZ
  ZS )CanineAttentionav  
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
          attend
        to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
        *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
        positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
        width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
        128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
        **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
        *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
        skip when moving to the next block in `to_tensor`.
    F   always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	           	         sl   t    t|| _t|| _|| _||k rtd||k r"td|| _|| _	|| _
|| _|| _|| _d S )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)r?   r@   r   rT   r   outputlocalr^   r   r   r   r   r   r   	rT   rA   r   r   r   r   r   r   r   rX   r5   r6   r@     s$   



zCanineAttention.__init__Nr+   r   r   rs   c                 C   sB  | j s| ||||}|d }n|jd  }}| }}	g }
| jr(|
d d}nd}t||| jD ]}t||| j }|
||f q1g }| jrO|d|f td|| j	D ]}t||| j
 }|||f qVt|
t|kr{td|
 d|
 dg }g }t|
|D ]v\\}}\}}|d d ||d d f }|	d d ||d d f }|d d ||||f }| jr|d d ||ddf }tj||gdd}|	d d ddd d f }tj||gdd}| ||||}||d  |r||d  qtj|dd}| ||}|f}| j s||dd   }|S |t| }|S )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r   rg   )r   rT   r   r   r_   rD   r   r   r   r   r   r\   r^   zipr   r1   rk   r   r4   )rT   r+   r   r   self_outputsattention_outputfrom_seq_lengthto_seq_lengthr   r   from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr   r5   r5   r6   r~     sf   

zCanineAttention.forwardFFFr   r   r   r   r   )r-   r.   r/   r0   r   r   r@   r4   r1   r2   r~   r   r5   r5   rX   r6   r   {  sB    	#r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )CanineIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r?   r@   r   r   rB   intermediate_sizer   
isinstancer   strr	   intermediate_act_fnr   rX   r5   r6   r@     s
   
zCanineIntermediate.__init__r+   rs   c                 C      |  |}| |}|S r   )r   r   rT   r+   r5   r5   r6   r~        

zCanineIntermediate.forward)r-   r.   r/   r@   r1   r2   r~   r   r5   r5   rX   r6   r     s    r   c                       s<   e Zd Z fddZdeej dejdejfddZ  ZS )CanineOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r?   r@   r   r   r   rB   r   rK   rL   rM   rN   rO   r   rX   r5   r6   r@     r   zCanineOutput.__init__r+   r   rs   c                 C   r   r   r   r   r5   r5   r6   r~     s   

zCanineOutput.forwardr   r5   r5   rX   r6   r     s    (r   c                       sd   e Zd Z fddZ		ddeej dejdB dedB deejejdB f fd	d
Zdd Z	  Z
S )CanineLayerc	           	   	      sH   t    |j| _d| _t||||||||| _t|| _t|| _	d S Nr   )
r?   r@   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r   rX   r5   r6   r@     s   


zCanineLayer.__init__NFr+   r   r   rs   c                 C   sF   | j |||d}|d }|dd  }t| j| j| j|}|f| }|S )N)r   r   r   )r  r   feed_forward_chunkr  r  )rT   r+   r   r   self_attention_outputsr   r   layer_outputr5   r5   r6   r~   /  s   
zCanineLayer.forwardc                 C   s   |  |}| ||}|S r   )r  r   )rT   r   intermediate_outputr	  r5   r5   r6   r  E  s   
zCanineLayer.feed_forward_chunkr   )r-   r.   r/   r@   r4   r1   r2   r   r~   r  r   r5   r5   rX   r6   r    s    
r  c                       st   e Zd Z							d fdd	Z				ddeej dejdB d	edB d
edB dedB deeB fddZ	  Z
S )CanineEncoderFr   c	           	   
      sH   t    | _t fddtjD | _d| _d S )Nc                    s"   g | ]}t  qS r5   )r  ).0r   r   r   r   r   r   rA   r   r   r5   r6   
<listcomp>Z  s    z*CanineEncoder.__init__.<locals>.<listcomp>F)	r?   r@   rA   r   
ModuleListrD   num_hidden_layerslayergradient_checkpointingr   rX   r  r6   r@   L  s   

zCanineEncoder.__init__NTr+   r   r   output_hidden_statesreturn_dictrs   c                 C   s   |rdnd }|r
dnd }t | jD ]\}}	|r||f }|	|||}
|
d }|r/||
d f }q|r7||f }|sEtdd |||fD S t|||dS )Nr5   r   r   c                 s       | ]	}|d ur|V  qd S r   r5   r  vr5   r5   r6   	<genexpr>      z(CanineEncoder.forward.<locals>.<genexpr>)r)   r+   r,   )ri   r  r4   r   )rT   r+   r   r   r  r  all_hidden_statesall_self_attentionsrV   layer_modulelayer_outputsr5   r5   r6   r~   j  s&   

zCanineEncoder.forwardr   )NFFT)r-   r.   r/   r@   r4   r1   r2   r   r   r~   r   r5   r5   rX   r6   r  K  s4    !r  c                       6   e Zd Z fddZdeej dejfddZ  ZS )CaninePoolerc                    s*   t    t|j|j| _t | _d S r   )r?   r@   r   r   rB   r   Tanhr   r   rX   r5   r6   r@     s   
zCaninePooler.__init__r+   rs   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )rT   r+   first_token_tensorpooled_outputr5   r5   r6   r~     s   

zCaninePooler.forwardr   r5   r5   rX   r6   r    s    "r  c                       r  )CaninePredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r   )r?   r@   r   r   rB   r   r   r   r   r	   transform_act_fnrK   rL   r   rX   r5   r6   r@     s   
z&CaninePredictionHeadTransform.__init__r+   rs   c                 C   s"   |  |}| |}| |}|S r   )r   r$  rK   r   r5   r5   r6   r~     s   


z%CaninePredictionHeadTransform.forwardr   r5   r5   rX   r6   r#    s    "	r#  c                       r  )CanineLMPredictionHeadc                    sB   t    t|| _tj|j|jdd| _t	t
|j| _d S )NT)bias)r?   r@   r#  	transformr   r   rB   
vocab_sizedecoder	Parameterr1   rx   r&  r   rX   r5   r6   r@     s   

zCanineLMPredictionHead.__init__r+   rs   c                 C   r   r   )r'  r)  r   r5   r5   r6   r~     r   zCanineLMPredictionHead.forwardr   r5   r5   rX   r6   r%    s    "r%  c                       s:   e Zd Z fddZdeej deej fddZ  ZS )CanineOnlyMLMHeadc                    s   t    t|| _d S r   )r?   r@   r%  predictionsr   rX   r5   r6   r@     s   
zCanineOnlyMLMHead.__init__sequence_outputrs   c                 C   s   |  |}|S r   )r,  )rT   r-  prediction_scoresr5   r5   r6   r~     s   
zCanineOnlyMLMHead.forward)	r-   r.   r/   r@   r4   r1   r   r~   r   r5   r5   rX   r6   r+    s    r+  c                       s.   e Zd ZU eed< dZdZ fddZ  ZS )CaninePreTrainedModelrA   canineTc                    s@   t  | t|trt|jt|jj	d 
d d S d S )Nr=   r<   )r?   _init_weightsr   r7   initcopy_r;   r1   rQ   r   rS   )rT   modulerX   r5   r6   r1    s   
&z#CaninePreTrainedModel._init_weights)	r-   r.   r/   r   r3   base_model_prefixsupports_gradient_checkpointingr1  r   r5   r5   rX   r6   r/    s
   
 r/  c                       s   e Zd Zd fdd	Zdd Zdejdefdd	Zd
ejdedejfddZ	e
								ddejdB dejdB dejdB dejdB dejdB dedB dedB dedB deeB fddZ  ZS )CanineModelTc              
      s   t  | || _t|}d|_t|| _t|ddd|j	|j	|j	|j	d| _
t|| _t|| _t|| _t|| _|rAt|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   TF)r   r   r   r   r   r   r   N)r?   r@   rA   copydeepcopyr  r7   char_embeddingsr  local_transformer_strideinitial_char_encoderr   chars_to_moleculesencoderr   
projectionfinal_char_encoderr  pooler	post_init)rT   rA   add_pooling_layershallow_configrX   r5   r6   r@     s*   






zCanineModel.__init__c                 C   s\   |j d |j d }}|j d }t||d|f }tj||dftj|jd}|| }|S )aP  
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        r   r   )rw   ru   rv   )r   r1   reshaper   onesfloat32rv   )rT   r   to_maskr   r   r   broadcast_onesmaskr5   r5   r6   )_create_3d_attention_mask_from_input_mask  s   
z5CanineModel._create_3d_attention_mask_from_input_maskchar_attention_maskr   c                 C   sF   |j \}}t||d|f}tjj||d| }tj|dd}|S )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   r=   rg   )r   r1   rE  r   	MaxPool1dr   squeeze)rT   rL  r   r   char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_maskr5   r5   r6   _downsample_attention_mask  s   
z&CanineModel._downsample_attention_mask	moleculeschar_seq_lengthrs   c           	      C   sz   | j j}|ddddddf }tj||dd}|ddddddf }|| }tj||| dd}tj||gddS )zDRepeats molecules to make them the same length as the char sequence.Nr   r   )repeatsrh   r=   rg   )rA   r   r1   repeat_interleaverk   )	rT   rT  rU  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeatedr5   r5   r6   _repeat_molecules'  s   zCanineModel._repeat_moleculesNr`   r   rq   r;   rr   r   r  r  c	           "      K   s  |d ur|n| j j}|d ur|n| j j}|rdnd }
|rdnd }|d ur&|n| j j}|d ur6|d ur6td|d urE| || | }n|d urR| d d }ntd|\}}|d ura|jn|j}|d u rqtj	||f|d}|d u r~tj
|tj|d}| ||}| j|| j jd}| |||jd f}| j||||d}| |d ur|n||}| j||||d	}|j}| |}| j|||||d
}|d }| jd ur| |nd }| j||d d}tj||gdd}| |}| j||||d	}|j}|r|r|jn|d }|
|j | |j }
|r)|r|jn|d } ||j |  |j }|s?||f}!|!tdd |
|fD 7 }!|!S t|||
|dS )Nr5   zDYou cannot specify both input_ids and inputs_embeds at the same timer=   z5You have to specify either input_ids or inputs_embeds)rv   rt   )r   )r`   r;   rq   rr   )r   r   r  )r   r   r  r  r   )rU  rg   r   c                 s   r  r   r5   r  r5   r5   r6   r    r  z&CanineModel.forward.<locals>.<genexpr>)r)   r*   r+   r,   )rA   r   r  use_return_dictr^   %warn_if_padding_and_no_attention_maskrw   rv   r1   rF  rx   ry   get_extended_attention_maskrS  r   r   r:  rK  r<  r)   r=  r>  rA  r^  rk   r?  r@  r+   r,   r4   r(   )"rT   r`   r   rq   r;   rr   r   r  r  kwargsr  r  rz   r   r{   rv   extended_attention_maskrR   extended_molecule_attention_maskinput_char_embeddingsrL  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputr"  repeated_moleculesconcatr-  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsr   r5   r5   r6   r~   @  s   



zCanineModel.forward)T)NNNNNNNN)r-   r.   r/   r@   rK  r1   r   r   rS  r^  r   r   r2   r   r4   r(   r~   r   r5   r5   rX   r6   r7    sB    "	r7  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                          e Zd Z fddZe									ddejdB dejdB dejdB dejdB dejdB d	ejdB d
edB dedB dedB de	e
B fddZ  ZS )CanineForSequenceClassificationc                    J   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r   r?   r@   
num_labelsr7  r0  r   rM   rN   rO   r   rB   
classifierrB  r   rX   r5   r6   r@        
z(CanineForSequenceClassification.__init__Nr`   r   rq   r;   rr   labelsr   r  r  rs   c
              
   K   sp  |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|dur| j jdu rU| jdkr;d| j _n| jdkrQ|jtj	ksL|jtj
krQd| j _nd| j _| j jdkrst }| jdkrm|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|	s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rq   r;   rr   r   r  r  r   
regressionsingle_label_classificationmulti_label_classificationr=   r   losslogitsr+   r,   )rA   r_  r0  rO   ru  problem_typert  ru   r1   ry   r   r   rN  r   r   r   r   r+   r,   )rT   r`   r   rq   r;   rr   rw  r   r  r  rb  r   r"  r~  r}  loss_fctr   r5   r5   r6   r~     sT   



"


z'CanineForSequenceClassification.forward	NNNNNNNNN)r-   r.   r/   r@   r   r1   r   r2   r   r4   r   r~   r   r5   r5   rX   r6   rq    sB    	
rq  c                       rp  )CanineForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S r  )r?   r@   r7  r0  r   rM   rN   rO   r   rB   ru  rB  r   rX   r5   r6   r@   4  s
   
z CanineForMultipleChoice.__init__Nr`   r   rq   r;   rr   rw  r   r  r  rs   c
              
   K   sl  |	dur|	n| j j}	|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	d}|d }| |}| |}|d|}d}|durt }|||}|	s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r=   r   rx  r   r|  )rA   r_  r   r   rw   r0  rO   ru  r   r   r+   r,   )rT   r`   r   rq   r;   rr   rw  r   r  r  rb  num_choicesr   r"  r~  reshaped_logitsr}  r  r   r5   r5   r6   r~   >  sJ   ,


zCanineForMultipleChoice.forwardr  )r-   r.   r/   r@   r   r1   r   r2   r   r4   r   r~   r   r5   r5   rX   r6   r  2  sB    
	
r  c                       rp  )CanineForTokenClassificationc                    rr  r   rs  r   rX   r5   r6   r@     rv  z%CanineForTokenClassification.__init__Nr`   r   rq   r;   rr   rw  r   r  r  rs   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|dd  }|durO|f| S |S t|||j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather then input words which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```Nrx  r   r=   r   r|  )rA   r_  r0  rO   ru  r   r   rt  r   r+   r,   )rT   r`   r   rq   r;   rr   rw  r   r  r  rb  r   r-  r~  r}  r  r   r5   r5   r6   r~     s6   0

z$CanineForTokenClassification.forwardr  )r-   r.   r/   r@   r   r1   r   r2   r   r4   r   r~   r   r5   r5   rX   r6   r    sB    	
r  c                       s   e Zd Z fddZe										ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB de	e
B fddZ  ZS )CanineForQuestionAnsweringc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
r?   r@   rt  r7  r0  r   r   rB   
qa_outputsrB  r   rX   r5   r6   r@     s
   
z#CanineForQuestionAnswering.__init__Nr`   r   rq   r;   rr   start_positionsend_positionsr   r  r  rs   c              
   K   s>  |
d ur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d}|d}d }|d ur}|d ur}t| dkrJ|d}t| dkrW|d}|d}|d| |d| t	|d}|||}|||}|| d }|
s||f|dd   }|d ur|f| S |S t
||||j|jdS )	Nrx  r   r   r=   rg   )ignore_indexr   )r}  start_logits
end_logitsr+   r,   )rA   r_  r0  r  splitrN  r\   rw   clamp_r   r   r+   r,   )rT   r`   r   rq   r;   rr   r  r  r   r  r  rb  r   r-  r~  r  r  
total_lossignored_indexr  
start_lossend_lossr   r5   r5   r6   r~     sN   








z"CanineForQuestionAnswering.forward)
NNNNNNNNNN)r-   r.   r/   r@   r   r1   r   r2   r   r4   r   r~   r   r5   r5   rX   r6   r    sH    
	
r  )r  r  rq  r  r  r7  r/  )=r0   r8  r   dataclassesr   r1   r   torch.nnr   r   r    r   r2  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_caniner   
get_loggerr-   loggerr]   r(   Moduler7   r   r   r   r   r   r   r   r  r  r  r#  r%  r+  r/  r7  rq  r  r  r  __all__r5   r5   r5   r6   <module>   sj    
a,8Q{8@  Tf_L