o
    ei,                     @   s^  d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z! e"e#Z$G dd dej%Z&eG dd deZ'G dd dej%Z(G dd dej%Z)G dd dej%Z*G dd dej%Z+G dd dej%Z,G dd dej%Z-G d d! d!ej%Z.G d"d# d#eZ/G d$d% d%ej%Z0G d&d' d'ej%Z1G d(d) d)ej%Z2eG d*d+ d+e'Z3G d,d- d-ej%Z4eG d.d/ d/e'Z5G d0d1 d1ej%Z6ed2d3G d4d5 d5e'Z7eG d6d7 d7e'Z8eG d8d9 d9e'Z9eG d:d; d;e'Z:g d<Z;dS )=zPyTorch ConvBERT model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNget_activation)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )ConvBertConfigc                       sb   e Zd ZdZ fddZ				ddejdB dejdB dejdB dejdB d	ejf
d
dZ  Z	S )ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd | jdtj| j tjddd d S )	N)padding_idxepsposition_idsr   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandzerosr   sizelongselfconfig	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/convbert/modeling_convbert.pyr$   1   s   

zConvBertEmbeddings.__init__N	input_idsr    r   inputs_embedsreturnc                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	| |}
||	 |
 }| |}| |}|S )Nr   r   r    r   r"   device)r8   r   hasattrr    r6   r4   r7   r9   rE   r)   r+   r-   r.   r2   )r;   rA   r    r   rB   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr+   r-   
embeddingsr?   r?   r@   forwardA   s(   






zConvBertEmbeddings.forward)NNNN)
__name__
__module____qualname____doc__r$   r4   
LongTensorFloatTensorrL   __classcell__r?   r?   r=   r@   r   .   s$    r   c                       s6   e Zd ZU eed< dZdZe  fddZ	  Z
S )ConvBertPreTrainedModelr<   convbertTc                    s   t  | t|trt|j dS t|tr+tj|j	d| j
jd t|j dS t|trIt|jt|jjd d t|j dS dS )zInitialize the weights        meanstdr   r   N)r#   _init_weights
isinstanceSeparableConv1Dinitzeros_biasGroupedLinearLayernormal_weightr<   initializer_ranger   copy_r   r4   r5   shaper6   r    )r;   moduler=   r?   r@   rZ   n   s   


"z%ConvBertPreTrainedModel._init_weights)rM   rN   rO   r   __annotations__base_model_prefixsupports_gradient_checkpointingr4   no_gradrZ   rS   r?   r?   r=   r@   rT   h   s   
 rT   c                       6   e Zd ZdZ fddZdejdejfddZ  ZS )r\   zSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    s~   t    tj|||||d dd| _tj||ddd| _tt|d| _	| jj
jjd|jd | jj
jjd|jd d S )N   F)kernel_sizegroupspaddingr_   r   )rm   r_   rV   rW   )r#   r$   r   Conv1d	depthwise	pointwise	Parameterr4   r7   r_   rb   datara   rc   )r;   r<   input_filtersoutput_filtersrm   kwargsr=   r?   r@   r$      s   
zSeparableConv1D.__init__hidden_statesrC   c                 C   s"   |  |}| |}|| j7 }|S N)rq   rr   r_   )r;   rx   xr?   r?   r@   rL         


zSeparableConv1D.forward	rM   rN   rO   rP   r$   r4   TensorrL   rS   r?   r?   r=   r@   r\   |   s    r\   c                       sd   e Zd Z fddZ			ddejdejdB dejdB dedB d	eejejdB f f
d
dZ	  Z
S )ConvBertSelfAttentionc                    s`  t    |j|j dkrt|dstd|j d|j d|j|j }|dk r1|j| _d| _n|| _|j| _|j| _|j| j dkrHtd|j| j d | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t||j| j	| j| _t
| j	| j| j | _t
|j| j	| _t
j| jdgt| jd d dgd	| _t
|j| _d S )
Nr   r'   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsrl   )rm   ro   )r#   r$   hidden_sizenum_attention_headsrF   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   Linearquerykeyvaluer\   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldr0   attention_probs_dropout_probr2   )r;   r<   new_num_attention_headsr=   r?   r@   r$      s<   

zConvBertSelfAttention.__init__NFrx   attention_maskencoder_hidden_statesoutput_attentionsrC   c                 C   s~  |j \}}}|d ur| |}| |}	n
| |}| |}	| |dd}
|
dd}
| |}||d| j| jdd}||d| j| jdd}|	|d| j| jdd}t	
|
|}| |}t	|d| jdg}t	j|dd}| |}t	||d| jg}|dd d}tjj|| jdgd| jd d dgdd}|dd|d| j| j}t	|d| j| jg}t	||}t	|d| jg}t	||dd}|t| j }|d ur|| }tjj|dd}| |}t	||}|dddd }t	||d| j| jg}t	||gd}| d d | j| j d f }|j| }|r:||f}|S |f}|S )	Nr   rl   r   dimr   )rm   dilationro   strider   )re   r   r   r   	transposer   viewr   r   r4   multiplyr   reshaper   softmaxr   r   
contiguous	unsqueezer   
functionalr   matmulmathsqrtr2   permutecatr8   )r;   rx   r   r   r   
batch_sizerH   _mixed_key_layermixed_value_layermixed_key_conv_attn_layermixed_query_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputsr?   r?   r@   rL      sx   







zConvBertSelfAttention.forwardNNFrM   rN   rO   r$   r4   r}   rR   booltuplerL   rS   r?   r?   r=   r@   r~      s     *r~   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )ConvBertSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r#   r$   r   r   r   denser.   r/   r0   r1   r2   r:   r=   r?   r@   r$     s   
zConvBertSelfOutput.__init__rx   input_tensorrC   c                 C   &   |  |}| |}| || }|S ry   r   r2   r.   r;   rx   r   r?   r?   r@   rL        

zConvBertSelfOutput.forwardrM   rN   rO   r$   r4   r}   rL   rS   r?   r?   r=   r@   r     s    $r   c                       sd   e Zd Z fddZ			ddejdejdB dejdB dedB d	eejejdB f f
d
dZ	  Z
S )ConvBertAttentionc                    s"   t    t|| _t|| _d S ry   )r#   r$   r~   r;   r   outputr:   r=   r?   r@   r$   !  s   

zConvBertAttention.__init__NFrx   r   r   r   rC   c                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r;   r   )r;   rx   r   r   r   self_outputsattention_outputr   r?   r?   r@   rL   &  s   zConvBertAttention.forwardr   r   r?   r?   r=   r@   r      s     r   c                       2   e Zd Z fddZdejdejfddZ  ZS )r`   c                    sj   t    || _|| _|| _| j| j | _| j| j | _tt	
| j| j| j| _tt	
|| _d S ry   )r#   r$   
input_sizeoutput_size
num_groupsgroup_in_dimgroup_out_dimr   rs   r4   emptyrb   r_   )r;   r   r   r   r=   r?   r@   r$   9  s   
zGroupedLinearLayer.__init__rx   rC   c                 C   sr   t | d }t|d| j| jg}|ddd}t|| j}|ddd}t||d| j	g}|| j
 }|S )Nr   r   r   rl   )listr8   r4   r   r   r   r   r   rb   r   r_   )r;   rx   r   rz   r?   r?   r@   rL   C  s   
zGroupedLinearLayer.forwardr   r?   r?   r=   r@   r`   8  s    
r`   c                       r   )ConvBertIntermediatec                    sf   t    |jdkrt|j|j| _nt|j|j|jd| _t	|j
tr-t|j
 | _d S |j
| _d S )Nr   r   r   r   )r#   r$   r   r   r   r   intermediate_sizer   r`   r[   
hidden_actstrr	   intermediate_act_fnr:   r=   r?   r@   r$   O  s   

zConvBertIntermediate.__init__rx   rC   c                 C   s   |  |}| |}|S ry   )r   r   r;   rx   r?   r?   r@   rL   \  s   

zConvBertIntermediate.forwardr   r?   r?   r=   r@   r   N  s    r   c                       r   )ConvBertOutputc                    sd   t    |jdkrt|j|j| _nt|j|j|jd| _tj	|j|j
d| _	t|j| _d S )Nr   r   r   )r#   r$   r   r   r   r   r   r   r`   r.   r/   r0   r1   r2   r:   r=   r?   r@   r$   c  s   

zConvBertOutput.__init__rx   r   rC   c                 C   r   ry   r   r   r?   r?   r@   rL   n  r   zConvBertOutput.forwardr   r?   r?   r=   r@   r   b  s    $r   c                       sx   e Zd Z fddZ				ddejdejdB dejdB dejdB d	edB d
eejejdB f fddZ	dd Z
  ZS )ConvBertLayerc                    sn   t    |j| _d| _t|| _|j| _|j| _| jr+| js&t|  dt|| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is added)r#   r$   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr   intermediater   r   r:   r=   r?   r@   r$   v  s   



zConvBertLayer.__init__NFrx   r   r   encoder_attention_maskr   rC   c                 C   s   | j |||d}|d }|dd  }| jr:|d ur:t| ds&td|  d| ||||}	|	d }||	dd   }t| j| j| j|}
|
f| }|S )N)r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r   r   rF   AttributeErrorr   r   feed_forward_chunkr   r   )r;   rx   r   r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputr?   r?   r@   rL     s2   


zConvBertLayer.forwardc                 C   s   |  |}| ||}|S ry   )r   r   )r;   r   intermediate_outputr   r?   r?   r@   r     s   
z ConvBertLayer.feed_forward_chunk)NNNF)rM   rN   rO   r$   r4   r}   rR   r   r   rL   r   rS   r?   r?   r=   r@   r   u  s(    
%r   c                       sx   e Zd Z fddZ						ddejdejdB dejdB d	ejdB d
edB dedB dedB dee	B fddZ
  ZS )ConvBertEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r?   )r   ).0r   r<   r?   r@   
<listcomp>  s    z,ConvBertEncoder.__init__.<locals>.<listcomp>F)	r#   r$   r<   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr:   r=   r   r@   r$     s   
 
zConvBertEncoder.__init__NFTrx   r   r   r   r   output_hidden_statesreturn_dictrC   c                 C   s   |rdnd }|r
dnd }	|r| j jrdnd }
t| jD ]+\}}|r&||f }||||||}|d }|rF|	|d f }	| j jrF|
|d f }
q|rN||f }|s]tdd |||	|
fD S t|||	|
dS )Nr?   r   r   rl   c                 s   s    | ]	}|d ur|V  qd S ry   r?   )r   vr?   r?   r@   	<genexpr>  s    z*ConvBertEncoder.forward.<locals>.<genexpr>)last_hidden_staterx   
attentionscross_attentions)r<   r   	enumerater   r   r   )r;   rx   r   r   r   r   r   r   all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_outputsr?   r?   r@   rL     s>   



zConvBertEncoder.forward)NNNFFT)rM   rN   rO   r$   r4   r}   rR   r   r   r   rL   rS   r?   r?   r=   r@   r     s2    		r   c                       r   )ConvBertPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r   )r#   r$   r   r   r   r   r[   r   r   r	   transform_act_fnr.   r/   r:   r=   r?   r@   r$     s   
z(ConvBertPredictionHeadTransform.__init__rx   rC   c                 C   s"   |  |}| |}| |}|S ry   )r   r  r.   r   r?   r?   r@   rL     r{   z'ConvBertPredictionHeadTransform.forwardr   r?   r?   r=   r@   r    s    	r  c                       sJ   e Zd ZdZdef fddZ	ddejdejdB dejfd	d
Z	  Z
S )ConvBertSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ConvBertConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r<   c                    s   t    t|dd| _| jdkrtt | _t|dr<|j	r<t|dr1|j
r1|jdkr1|j}n|j}t|j|| _t|dd }|rHt|nt | _t | _t|drc|jdkrct|j| _t | _t|d	r{|jdkr}t|j| _d S d S d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r#   r$   getattrr  NotImplementedErrorr   IdentitysummaryrF   r  r  
num_labelsr   r   r
   
activationfirst_dropoutr
  r0   last_dropoutr  )r;   r<   num_classesactivation_stringr=   r?   r@   r$     s&   




z ConvBertSequenceSummary.__init__Nrx   	cls_indexrC   c                 C   s  | j dkr|dddf }ne| j dkr|dddf }nW| j dkr(|jdd}nK| j d	krl|du rItj|d
ddddf |jd d tjd}n|dd}|d| d  |	df }|
d|d}n| j dkrst| |}| |}| |}| |}|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r  Nr   firstr   rX   r   r   r  .r   r!   )r   r  )r  rX   r4   	full_likere   r9   r   r6   r   r8   gathersqueezer  r  r  r  r  )r;   rx   r  r   r?   r?   r@   rL   .  s.   



"




zConvBertSequenceSummary.forwardry   )rM   rN   rO   rP   r   r$   r4   rR   rQ   rL   rS   r?   r?   r=   r@   r    s    r  c                       s   e Zd Z fddZdd Zdd Ze								ddejdB d	ej	dB d
ejdB dejdB dej	dB de
dB de
dB de
dB deeB fddZ  ZS )ConvBertModelc                    sP   t  | t|| _|j|jkrt|j|j| _t	|| _
|| _|   d S ry   )r#   r$   r   rK   r'   r   r   r   embeddings_projectr   encoderr<   	post_initr:   r=   r?   r@   r$   \  s   

zConvBertModel.__init__c                 C   s   | j jS ry   rK   r)   r;   r?   r?   r@   get_input_embeddingsh  s   z"ConvBertModel.get_input_embeddingsc                 C   s   || j _d S ry   r  )r;   r   r?   r?   r@   set_input_embeddingsk  s   z"ConvBertModel.set_input_embeddingsNrA   r   r    r   rB   r   r   r   rC   c	                 K   s`  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u rctj	|
|d}|d u rt
| jdr| jjd d d |f }|||}|}n	tj|
tj|d}| ||
}| j||||d}t
| dr| |}| j|||||d	}|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)rE   r    rD   )rA   r   r    rB   r  )r   r   r   r   )r<   r   r   use_return_dictr   %warn_if_padding_and_no_attention_maskr8   rE   r4   onesrF   rK   r    r6   r7   r9   get_extended_attention_maskr  r  )r;   rA   r   r    r   rB   r   r   r   rw   rG   r   rH   rE   rI   rJ   extended_attention_maskrx   r?   r?   r@   rL   n  sH   


zConvBertModel.forward)NNNNNNNN)rM   rN   rO   r$   r!  r"  r   r4   rQ   rR   r   r   r   rL   rS   r?   r?   r=   r@   r  Z  s@    	r  c                       rk   )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                    s>   t    td| _tj|j|jd| _t|j	|j| _
d S )Ngelur   )r#   r$   r
   r  r   r.   r'   r/   r   r   r   r:   r=   r?   r@   r$     s   

z%ConvBertGeneratorPredictions.__init__generator_hidden_statesrC   c                 C   s"   |  |}| |}| |}|S ry   )r   r  r.   )r;   r*  rx   r?   r?   r@   rL     s   


z$ConvBertGeneratorPredictions.forward)	rM   rN   rO   rP   r$   r4   rR   rL   rS   r?   r?   r=   r@   r(    s    r(  c                       s   e Zd ZddiZ fddZdd Zdd Ze																		dd
ej	d	B dej
d	B dej	d	B dej	d	B dej
d	B dej	d	B ded	B ded	B ded	B deeB fddZ  ZS )ConvBertForMaskedLMzgenerator_lm_head.weightz*convbert.embeddings.word_embeddings.weightc                    s>   t  | t|| _t|| _t|j|j	| _
|   d S ry   )r#   r$   r  rU   r(  generator_predictionsr   r   r'   r&   generator_lm_headr  r:   r=   r?   r@   r$     s
   

zConvBertForMaskedLM.__init__c                 C   s   | j S ry   r-  r   r?   r?   r@   get_output_embeddings  s   z)ConvBertForMaskedLM.get_output_embeddingsc                 C   s
   || _ d S ry   r.  )r;   r)   r?   r?   r@   set_output_embeddings  s   
z)ConvBertForMaskedLM.set_output_embeddingsNrA   r   r    r   rB   labelsr   r   r   rC   c
              
   K   s   |	dur|	n| j j}	| ||||||||	}|d }| |}| |}d}|dur<t }||d| j j|d}|	sR|f|dd  }|durP|f| S |S t	|||j
|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   losslogitsrx   r   )r<   r#  rU   r,  r-  r   r   r   r&   r   rx   r   )r;   rA   r   r    r   rB   r1  r   r   r   rw   r*  generator_sequence_outputprediction_scoresr3  loss_fctr   r?   r?   r@   rL     s6   


zConvBertForMaskedLM.forward	NNNNNNNNN)rM   rN   rO   _tied_weights_keysr$   r/  r0  r   r4   rQ   rR   r   r   r   rL   rS   r?   r?   r=   r@   r+    sH    
	
r+  c                       rk   )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                    sZ   t    t|j|j| _|jd ur|jn|j}t|| _	t|j|j
| _|| _d S ry   )r#   r$   r   r   r   r   classifier_dropoutr1   r0   r2   r  out_projr<   r;   r<   r;  r=   r?   r@   r$     s   

z#ConvBertClassificationHead.__init__rx   rC   c                 K   sR   |d d dd d f }|  |}| |}t| jj |}|  |}| |}|S )Nr   )r2   r   r	   r<   r   r<  )r;   rx   rw   rz   r?   r?   r@   rL     s   



z"ConvBertClassificationHead.forwardr|   r?   r?   r=   r@   r:  	  s    r:  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                          e Zd Z fddZe									ddejdB dejdB dejdB dejdB dejdB d	ejdB d
edB dedB dedB de	e
B fddZ  ZS )!ConvBertForSequenceClassificationc                    s:   t  | |j| _|| _t|| _t|| _|   d S ry   )	r#   r$   r  r<   r  rU   r:  
classifierr  r:   r=   r?   r@   r$   (  s   

z*ConvBertForSequenceClassification.__init__NrA   r   r    r   rB   r1  r   r   r   rC   c
              
   K   sf  |	dur|	n| j j}	| j||||||||	d}|d }| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtjksG|jtj	krLd| j _nd| j _| j jdkrnt
 }| jdkrh|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|	s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r    r   rB   r   r   r   r   r   
regressionsingle_label_classificationmulti_label_classificationr   r2  )r<   r#  rU   rA  problem_typer  r"   r4   r9   r   r   r  r   r   r   r   rx   r   r;   rA   r   r    r   rB   r1  r   r   r   rw   r   sequence_outputr4  r3  r7  r   r?   r?   r@   rL   2  sR   


"


z)ConvBertForSequenceClassification.forwardr8  )rM   rN   rO   r$   r   r4   rQ   rR   r   r   r   rL   rS   r?   r?   r=   r@   r@  !  sB    
	
r@  c                       r?  )ConvBertForMultipleChoicec                    s<   t  | t|| _t|| _t|jd| _	| 
  d S )Nr   )r#   r$   r  rU   r  sequence_summaryr   r   r   rA  r  r:   r=   r?   r@   r$   {  s
   

z"ConvBertForMultipleChoice.__init__NrA   r   r    r   rB   r1  r   r   r   rC   c
              
   K   sl  |	dur|	n| j j}	|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	d}|d }| |}| |}|d|}d}|durt }|||}|	s|f|dd  }|dur|f| S |S t	|||j
|jdS )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r   rB  r   r2  )r<   r#  re   r   r8   rU   rJ  rA  r   r   rx   r   )r;   rA   r   r    r   rB   r1  r   r   r   rw   num_choicesr   rH  pooled_outputr4  reshaped_logitsr3  r7  r   r?   r?   r@   rL     sJ   -


z!ConvBertForMultipleChoice.forwardr8  )rM   rN   rO   r$   r   r4   rQ   rR   r   r   r   rL   rS   r?   r?   r=   r@   rI  y  sB    
	
rI  c                       r?  )ConvBertForTokenClassificationc                    s^   t  | |j| _t|| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S ry   )r#   r$   r  r  rU   r;  r1   r   r0   r2   r   r   rA  r  r=  r=   r?   r@   r$     s   
z'ConvBertForTokenClassification.__init__NrA   r   r    r   rB   r1  r   r   r   rC   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|dd  }|durO|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        NrB  r   r   r   r2  )r<   r#  rU   r2   rA  r   r   r  r   rx   r   rG  r?   r?   r@   rL     s6   

z&ConvBertForTokenClassification.forwardr8  )rM   rN   rO   r$   r   r4   rQ   rR   r   r   r   rL   rS   r?   r?   r=   r@   rN    sB    	
rN  c                       s   e Zd Z fddZe										ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB de	e
B fddZ  ZS )ConvBertForQuestionAnsweringc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S ry   )
r#   r$   r  r  rU   r   r   r   
qa_outputsr  r:   r=   r?   r@   r$   (  s
   
z%ConvBertForQuestionAnswering.__init__NrA   r   r    r   rB   start_positionsend_positionsr   r   r   rC   c              
   K   sF  |
d ur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrN|d}t| dkr[|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|
s||f|dd   }|d ur|f| S |S t||||j|jdS )	NrB  r   r   r   r   )ignore_indexrl   )r3  start_logits
end_logitsrx   r   )r<   r#  rU   rP  splitr  r   lenr8   clampr   r   rx   r   )r;   rA   r   r    r   rB   rQ  rR  r   r   r   rw   r   rH  r4  rT  rU  
total_lossignored_indexr7  
start_lossend_lossr   r?   r?   r@   rL   2  sN   






z$ConvBertForQuestionAnswering.forward)
NNNNNNNNNN)rM   rN   rO   r$   r   r4   rQ   rR   r   r   r   rL   rS   r?   r?   r=   r@   rO  &  sH    
	
rO  )r+  rI  rO  r@  rN  r   r  rT   )<rP   r   collections.abcr   r4   r   torch.nnr   r   r    r   r]   activationsr	   r
   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_convbertr   
get_loggerrM   loggerModuler   rT   r\   r~   r   r   r`   r   r   r   r   r  r  r  r(  r+  r:  r@  rI  rN  rO  __all__r?   r?   r?   r@   <module>   s\    
:|:6cQJRgDL