"""PyTorch MRA model."""

import math

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ... import initialization as init
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
    auto_docstring,
    is_cuda_platform,
    is_kernels_available,
    is_ninja_available,
    is_torch_cuda_available,
    logging,
)
from .configuration_mra import MraConfig


logger = logging.get_logger(__name__)

mra_cuda_kernel = None


def load_cuda_kernels():
    global mra_cuda_kernel
    if not is_kernels_available():
        raise ImportError("kernels is not installed, please install it with `pip install kernels`")

    from ...integrations.hub_kernels import get_kernel

    mra_cuda_kernel = get_kernel("kernels-community/mra")
def sparse_max(sparse_qk_prod, indices, query_num_block, key_num_block):
    """
    Computes maximum values for softmax stability.
    """
    if len(sparse_qk_prod.size()) != 4:
        raise ValueError("sparse_qk_prod must be a 4-dimensional tensor.")

    if len(indices.size()) != 2:
        raise ValueError("indices must be a 2-dimensional tensor.")

    if sparse_qk_prod.size(2) != 32:
        raise ValueError("The size of the second dimension of sparse_qk_prod must be 32.")

    if sparse_qk_prod.size(3) != 32:
        raise ValueError("The size of the third dimension of sparse_qk_prod must be 32.")

    index_vals = sparse_qk_prod.max(dim=-2).values.transpose(-1, -2)
    index_vals = index_vals.contiguous()

    indices = indices.int()
    indices = indices.contiguous()

    max_vals, max_vals_scatter = mra_cuda_kernel.index_max(index_vals, indices, query_num_block, key_num_block)
    max_vals_scatter = max_vals_scatter.transpose(-1, -2)[:, :, None, :]

    return max_vals, max_vals_scatter
def sparse_mask(mask, indices, block_size=32):
    """
    Converts attention mask to a sparse mask for high resolution logits.
    """
    if len(mask.size()) != 2:
        raise ValueError("mask must be a 2-dimensional tensor.")

    if len(indices.size()) != 2:
        raise ValueError("indices must be a 2-dimensional tensor.")

    if mask.shape[0] != indices.shape[0]:
        raise ValueError("mask and indices must have the same size in the zero-th dimension.")

    batch_size, seq_len = mask.shape
    num_block = seq_len // block_size

    batch_idx = torch.arange(indices.size(0), dtype=torch.long, device=indices.device)
    mask = mask.reshape(batch_size, num_block, block_size)
    mask = mask[batch_idx[:, None], (indices % num_block).long(), :]

    return mask
def mm_to_sparse(dense_query, dense_key, indices, block_size=32):
    """
    Performs Sampled Dense Matrix Multiplication.
    """
    batch_size, query_size, dim = dense_query.size()
    _, key_size, dim = dense_key.size()

    if query_size % block_size != 0:
        raise ValueError("query_size (size of first dimension of dense_query) must be divisible by block_size.")

    if key_size % block_size != 0:
        raise ValueError("key_size (size of first dimension of dense_key) must be divisible by block_size.")

    dense_query = dense_query.reshape(batch_size, query_size // block_size, block_size, dim).transpose(-1, -2)
    dense_key = dense_key.reshape(batch_size, key_size // block_size, block_size, dim).transpose(-1, -2)

    if len(dense_query.size()) != 4:
        raise ValueError("dense_query must be a 4-dimensional tensor.")

    if len(dense_key.size()) != 4:
        raise ValueError("dense_key must be a 4-dimensional tensor.")

    if len(indices.size()) != 2:
        raise ValueError("indices must be a 2-dimensional tensor.")

    if dense_query.size(3) != 32:
        raise ValueError("The third dimension of dense_query must be 32.")

    if dense_key.size(3) != 32:
        raise ValueError("The third dimension of dense_key must be 32.")

    dense_query = dense_query.contiguous()
    dense_key = dense_key.contiguous()

    indices = indices.int()
    indices = indices.contiguous()

    return mra_cuda_kernel.mm_to_sparse(dense_query, dense_key, indices)
def sparse_dense_mm(sparse_query, indices, dense_key, query_num_block, block_size=32):
    """
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    """
    batch_size, key_size, dim = dense_key.size()

    if key_size % block_size != 0:
        raise ValueError("key_size (size of first dimension of dense_key) must be divisible by block_size.")

    if sparse_query.size(2) != block_size:
        raise ValueError("The size of the second dimension of sparse_query must be equal to the block_size.")

    if sparse_query.size(3) != block_size:
        raise ValueError("The size of the third dimension of sparse_query must be equal to the block_size.")

    dense_key = dense_key.reshape(batch_size, key_size // block_size, block_size, dim).transpose(-1, -2)

    if len(sparse_query.size()) != 4:
        raise ValueError("sparse_query must be a 4-dimensional tensor.")

    if len(dense_key.size()) != 4:
        raise ValueError("dense_key must be a 4-dimensional tensor.")

    if len(indices.size()) != 2:
        raise ValueError("indices must be a 2-dimensional tensor.")

    if dense_key.size(3) != 32:
        raise ValueError("The size of the third dimension of dense_key must be 32.")

    sparse_query = sparse_query.contiguous()

    indices = indices.int()
    indices = indices.contiguous()
    dense_key = dense_key.contiguous()

    dense_qk_prod = mra_cuda_kernel.sparse_dense_mm(sparse_query, indices, dense_key, query_num_block)
    dense_qk_prod = dense_qk_prod.transpose(-1, -2).reshape(batch_size, query_num_block * block_size, dim)

    return dense_qk_prod


def transpose_indices(indices, dim_1_block, dim_2_block):
    return ((indices % dim_2_block) * dim_1_block + torch.div(indices, dim_2_block, rounding_mode="floor")).long()
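
# Illustrative sketch, not part of the original module: `transpose_indices` maps the
# flattened index of block (i, j) in a `dim_1_block x dim_2_block` grid of blocks to the
# flattened index of block (j, i) in the transposed grid. The helper below is an added
# example and is never called by the model code.
def _demo_transpose_indices():
    # Block (1, 2) in a 4 x 8 grid flattens to 1 * 8 + 2 = 10; the transposed block
    # (2, 1) in the 8 x 4 grid flattens to 2 * 4 + 1 = 9.
    indices = torch.tensor([[10]])
    assert transpose_indices(indices, 4, 8).item() == 9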
class MraSampledDenseMatMul(torch.autograd.Function):
    @staticmethod
    def forward(ctx, dense_query, dense_key, indices, block_size):
        sparse_qk_prod = mm_to_sparse(dense_query, dense_key, indices, block_size)
        ctx.save_for_backward(dense_query, dense_key, indices)
        ctx.block_size = block_size
        return sparse_qk_prod

    @staticmethod
    def backward(ctx, grad):
        dense_query, dense_key, indices = ctx.saved_tensors
        block_size = ctx.block_size
        query_num_block = dense_query.size(1) // block_size
        key_num_block = dense_key.size(1) // block_size
        indices_T = transpose_indices(indices, query_num_block, key_num_block)
        grad_key = sparse_dense_mm(grad.transpose(-1, -2), indices_T, dense_query, key_num_block)
        grad_query = sparse_dense_mm(grad, indices, dense_key, query_num_block)
        return grad_query, grad_key, None, None

    @staticmethod
    def operator_call(dense_query, dense_key, indices, block_size=32):
        return MraSampledDenseMatMul.apply(dense_query, dense_key, indices, block_size)
class MraSparseDenseMatMul(torch.autograd.Function):
    @staticmethod
    def forward(ctx, sparse_query, indices, dense_key, query_num_block):
        sparse_qk_prod = sparse_dense_mm(sparse_query, indices, dense_key, query_num_block)
        ctx.save_for_backward(sparse_query, indices, dense_key)
        ctx.query_num_block = query_num_block
        return sparse_qk_prod

    @staticmethod
    def backward(ctx, grad):
        sparse_query, indices, dense_key = ctx.saved_tensors
        query_num_block = ctx.query_num_block
        key_num_block = dense_key.size(1) // sparse_query.size(-1)
        indices_T = transpose_indices(indices, query_num_block, key_num_block)
        grad_key = sparse_dense_mm(sparse_query.transpose(-1, -2), indices_T, grad, key_num_block)
        grad_query = mm_to_sparse(grad, dense_key, indices)
        return grad_query, None, grad_key, None

    @staticmethod
    def operator_call(sparse_query, indices, dense_key, query_num_block):
        return MraSparseDenseMatMul.apply(sparse_query, indices, dense_key, query_num_block)
|| |f| j| jd}
|
d|	| |||}|||| }|S )
Nr"   rR   r#   r$   r'   r   r;   rV   rW   )r+   r*   r,   sumrB   r?   r@   rA   r=   rY   zerosr<   	index_add)rT   r4   r5   r6   rE   rG   rD   rP   rH   global_idxestempoutputr   r   r    rm      s$   &
zMraReduceSum.operator_callN)rq   rr   rs   rt   rm   r   r   r   r    rv      s    rv   c                 C   s  |   \}}}|| }d}	|durl||||jdd}
| ||||jdd|
dddddf d  }|||||jdd|
dddddf d  }|durk|||||jdd|
dddddf d  }	n5|tj||tj| jd }
| ||||jdd}|||||jdd}|dur|||||jdd}	t||	ddt
| }|jdddj}|dur|d	|
dddddf |
dddddf  d
k    }||
||	fS )z/
    Compute low resolution approximation.
    Nr)   r'   r&   ư>r;   T)r(   keepdims     @g      ?)r+   rB   rw   r?   onesfloatr=   meanmatmulr/   mathsqrtr-   r.   )querykeyrD   rC   valuerE   rF   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxr   r   r    get_low_resolution_logit  s6   :r   c                 C   sT  | j \}}}|dkr3|d }tj||| jd}	tjtj|	| d|d}
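
# Illustrative sketch, not part of the original module: the low resolution logits are
# attention scores between block-averaged queries and keys, so a seq_len x seq_len
# attention grid collapses to (seq_len / block_size) x (seq_len / block_size). This
# sketch runs on CPU since no custom kernel is involved.
def _demo_low_resolution_shapes():
    query = torch.randn(2, 64, 32)  # (batch * num_heads, seq_len, head_dim)
    key = torch.randn(2, 64, 32)
    logit, token_count, row_max, _ = get_low_resolution_logit(query, key, 32)
    assert logit.shape == (2, 2, 2)  # 64 tokens -> 2 blocks per row
    assert token_count.shape == (2, 2)
    assert row_max.shape == (2, 2, 1)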
| |
dddddf d  } |dkrk| ddd|ddf d | ddd|ddf< | ddddd|f d | ddddd|f< tj| |d|ddd	d
}|j}|dkr|j	j
ddj	}| |ddddf k }||fS |dkrd}||fS t| d)zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r#   r=   )diagonalNg     @r)   TF)r(   largestsortedfullr'   sparsez# is not a valid approx_model value.)r>   r?   r   r=   triltriutopkrB   r4   r.   minr   r,   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrE   total_blocks_per_rowrP   offset	temp_maskdiagonal_mask
top_k_valsr4   	thresholdhigh_resolution_maskr   r   r    get_block_idxes7  s.   r   c	           $      C   s  t du rt|  S |  \}	}
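
# Illustrative sketch, not part of the original module: `get_block_idxes` picks the
# `num_blocks` most promising low resolution cells per batch entry. In "sparse" mode
# only the indices are used; in "full" mode an additional mask marks the selected cells
# so their contribution can later be removed from the low resolution branch.
def _demo_get_block_idxes():
    low_resolution_logit = torch.randn(2, 4, 4)
    indices, mask = get_block_idxes(low_resolution_logit, 4, "sparse", 0, 0)
    assert indices.shape == (2, 4) and mask is None
    indices, mask = get_block_idxes(low_resolution_logit, 4, "full", 0, 0)
    assert mask.shape == (2, 4, 4)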
}}|	|
 }|| dkr!td|| }| |||} ||||}||||}|dure| |dddddf  } ||dddddf  }||dddddf  }|dkrvt| ||||\}}}}n(|dkrt  t| |||\}}}}W d   n1 sw   Y  nt	dt  || }t
|||||\}}W d   n1 sw   Y  tj| |||dt| }t||||\}}|| }|dur|dd	t||dddddddf    }t|}t||||}t||||}|dkrt|| d|  |dddddf  }t||dddddddf d	d	|d	|||}|jd
ddddddf d	d	|||}|d	d	|||| } |durq| | } t| | dk  }!||!dddddf  }||! }t|  | dk  }"||"dddddf  }||" }|| |dddddf |dddddf  d  }#n|dkr||dddddf d  }#nt	d|dur|#|dddddf  }#|#|	|
||}#|#S )z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rD   r   r   r)   r'   r}   z-config.approx_mode must be "full" or "sparse")r   r?   
zeros_likerequires_grad_r+   r,   rB   r   no_grad	Exceptionr   r]   rm   r   r   r:   rI   expru   rv   r   repeatrw   r   )$r   r   r   rC   r   r   rD   r   r   rE   num_headrF   r   
meta_batchr   r   r   r   r   rP   low_resolution_logit_normalizedr4   r   high_resolution_logitr8   r9   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layerr   r   r    mra2_attention]  s   




.

"
.
.
 
r   c                       s*   e Zd ZdZ fddZdddZ  ZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|jd |j| _	t|j
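
# Illustrative sketch, not part of the original module: the "full" branch of
# `mra2_attention` merges the high and low resolution partial softmaxes the way a
# streaming softmax does. Partial normalizers computed under different max offsets can
# be combined exactly by rescaling with the exponentiated max difference, which is what
# the `log_correction` terms above implement.
def _demo_streaming_softmax_merge():
    scores = torch.randn(8)
    part_a, part_b = scores[:4], scores[4:]
    max_a, max_b = part_a.max(), part_b.max()
    overall_max = torch.maximum(max_a, max_b)
    norm_a = torch.exp(part_a - max_a).sum() * torch.exp(max_a - overall_max)
    norm_b = torch.exp(part_b - max_b).sum() * torch.exp(max_b - overall_max)
    assert torch.allclose(norm_a + norm_b, torch.exp(scores - overall_max).sum())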
class MraEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory; note the offset of 2
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2)
        self.register_buffer(
            "token_type_ids",
            torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
            persistent=False,
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Setting the token_type_ids to the registered buffer in the constructor where it is all
        # zeros helps users when tracing the model without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class MraSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        kernel_loaded = mra_cuda_kernel is not None
        if is_cuda_platform() and is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
            try:
                load_cuda_kernels()
            except Exception as e:
                logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.num_block = (config.max_position_embeddings // 32) * config.block_per_row
        self.num_block = min(self.num_block, int((config.max_position_embeddings // 32) ** 2))

        self.approx_mode = config.approx_mode
        self.initial_prior_first_n_blocks = config.initial_prior_first_n_blocks
        self.initial_prior_diagonal_n_blocks = config.initial_prior_diagonal_n_blocks

    def forward(self, hidden_states, attention_mask=None):
        batch_size, seq_len, _ = hidden_states.shape

        query_layer = (
            self.query(hidden_states)
            .reshape(batch_size, seq_len, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .reshape(batch_size, seq_len, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .reshape(batch_size, seq_len, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # revert changes made by get_extended_attention_mask
        attention_mask = 1.0 + attention_mask / 10000.0
        attention_mask = (
            attention_mask.squeeze()
            .repeat(1, self.num_attention_heads, 1)
            .reshape(batch_size * self.num_attention_heads, seq_len)
            .int()
        )

        # The CUDA kernels are most efficient with inputs whose size is a multiple of a GPU's
        # warp size (32). Inputs smaller than this are padded with zeros.
        gpu_warp_size = 32

        if self.attention_head_size < gpu_warp_size:
            pad_size = batch_size, self.num_attention_heads, seq_len, gpu_warp_size - self.attention_head_size

            query_layer = torch.cat([query_layer, torch.zeros(pad_size, device=query_layer.device)], dim=-1)
            key_layer = torch.cat([key_layer, torch.zeros(pad_size, device=key_layer.device)], dim=-1)
            value_layer = torch.cat([value_layer, torch.zeros(pad_size, device=value_layer.device)], dim=-1)

        context_layer = mra2_attention(
            query_layer.float(),
            key_layer.float(),
            value_layer.float(),
            attention_mask.float(),
            self.num_block,
            approx_mode=self.approx_mode,
            initial_prior_first_n_blocks=self.initial_prior_first_n_blocks,
            initial_prior_diagonal_n_blocks=self.initial_prior_diagonal_n_blocks,
        )

        if self.attention_head_size < gpu_warp_size:
            context_layer = context_layer[:, :, :, : self.attention_head_size]

        context_layer = context_layer.reshape(batch_size, self.num_attention_heads, seq_len, self.attention_head_size)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer,)

        return outputs
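
# Illustrative sketch, not part of the original module: `get_extended_attention_mask`
# encodes padding as additive biases (0.0 for tokens to keep, -10000.0 for masked ones),
# and `MraSelfAttention.forward` inverts that encoding back into a {0, 1} multiplicative
# mask before handing it to the MRA kernels.
def _demo_mask_round_trip():
    additive = torch.tensor([0.0, -10000.0])
    multiplicative = 1.0 + additive / 10000.0
    assert multiplicative.tolist() == [1.0, 0.0]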
d S Nr   )r   r   r   r   r   denser   r   r   r   r   r   r   r   r    r   i     
zMraSelfOutput.__init__r   input_tensorreturnc                 C   &   |  |}| |}| || }|S r_   r  r   r   r   r   r  r   r   r    rb   o     

zMraSelfOutput.forwardrq   rr   rs   r   r?   Tensorrb   r   r   r   r   r    r  h      $r  c                       r   )MraAttentionc                    s"   t    t|| _t|| _d S r_   )r   r   r   r   r  r|   r   r   r   r    r   w  s   

zMraAttention.__init__Nc                 C   s2   |  ||}| |d |}|f|dd   }|S Nr   r   )r   r|   )r   r   r   self_outputsattention_outputr  r   r   r    rb   |  s   zMraAttention.forwardr_   r  r   r   r   r    r  v  s    r  c                       2   e Zd Z fddZdejdejfddZ  ZS )MraIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r_   )r   r   r   r   r   intermediate_sizer  
isinstance
hidden_actstrr   intermediate_act_fnr   r   r   r    r     s
   
zMraIntermediate.__init__r   r	  c                 C      |  |}| |}|S r_   )r  r  r   r   r   r   r    rb        

zMraIntermediate.forwardr  r   r   r   r    r    s    r  c                       r  )	MraOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r  )r   r   r   r   r  r   r  r   r   r   r   r   r   r   r   r    r     r  zMraOutput.__init__r   r  r	  c                 C   r
  r_   r  r  r   r   r    rb     r  zMraOutput.forwardr  r   r   r   r    r    r  r  c                       s.   e Zd Z fddZdddZdd Z  ZS )	MraLayerc                    sB   t    |j| _d| _t|| _|j| _t|| _t	|| _
d S Nr   )r   r   chunk_size_feed_forwardseq_len_dimr  	attentionadd_cross_attentionr  intermediater  r|   r   r   r   r    r     s   


zMraLayer.__init__Nc                 C   sB   |  ||}|d }|dd  }t| j| j| j|}|f| }|S r  )r$  r   feed_forward_chunkr"  r#  )r   r   r   self_attention_outputsr  r  layer_outputr   r   r    rb     s   
zMraLayer.forwardc                 C   s   |  |}| ||}|S r_   )r&  r|   )r   r  intermediate_outputr)  r   r   r    r'    s   
zMraLayer.feed_forward_chunkr_   )rq   rr   rs   r   rb   r'  r   r   r   r   r    r     s    
	r   c                       s,   e Zd Z fddZ			dddZ  ZS )	
MraEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r   )r   ).0rP   r   r   r    
<listcomp>  s    z'MraEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r   r-  r    r     s   
 
zMraEncoder.__init__NFTc           	      C   st   |rdnd }t | jD ]\}}|r||f }|||}|d }q|r'||f }|s4tdd ||fD S t||dS )Nr   r   c                 s   s    | ]	}|d ur|V  qd S r_   r   )r,  vr   r   r    	<genexpr>  s    z%MraEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )	enumerater2  tupler
   )	r   r   r   output_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputsr   r   r    rb     s   



zMraEncoder.forward)NFTr  r   r   r   r    r+    s    	r+  c                       r  )MraPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r  )r   r   r   r   r   r  r  r  r  r   transform_act_fnr   r   r   r   r   r    r     s   
z#MraPredictionHeadTransform.__init__r   r	  c                 C   s"   |  |}| |}| |}|S r_   )r  r@  r   r  r   r   r    rb     s   


z"MraPredictionHeadTransform.forwardr  r   r   r   r    r?    s    	r?  c                       s$   e Zd Z fddZdd Z  ZS )MraLMPredictionHeadc                    sB   t    t|| _tj|j|jdd| _t	t
|j| _d S )NT)bias)r   r   r?  	transformr   r   r   r   decoder	Parameterr?   rx   rB  r   r   r   r    r     s   

zMraLMPredictionHead.__init__c                 C   r  r_   )rC  rD  r  r   r   r    rb     r  zMraLMPredictionHead.forwardr  r   r   r   r    rA    s    	rA  c                       r  )MraOnlyMLMHeadc                    s   t    t|| _d S r_   )r   r   rA  predictionsr   r   r   r    r     s   
zMraOnlyMLMHead.__init__sequence_outputr	  c                 C   s   |  |}|S r_   )rG  )r   rH  prediction_scoresr   r   r    rb     s   
zMraOnlyMLMHead.forwardr  r   r   r   r    rF    s    rF  c                       s>   e Zd ZU eed< dZdZe de	j
f fddZ  ZS )MraPreTrainedModelr   mraTmodulec                    sj   t  | t|trt|j dS t|tr3t|j	t
|j	jd dd  t|j dS dS )zInitialize the weightsr)   r   r#   N)r   _init_weightsr  rA  initzeros_rB  r   copy_r   r?   r@   r>   r   r   )r   rL  r   r   r    rM    s   

&z MraPreTrainedModel._init_weights)rq   rr   rs   r   __annotations__base_model_prefixsupports_gradient_checkpointingr?   r   r   ModulerM  r   r   r   r   r    rJ    s   
 rJ  c                       s   e Zd Z fddZdd Zdd Ze							ddejdB d	ejdB d
ejdB dejdB dejdB de	dB de	dB de
eB fddZ  ZS )MraModelc                    s2   t  | || _t|| _t|| _|   d S r_   )r   r   r   r   r   r+  encoder	post_initr   r   r   r    r   $  s
   

zMraModel.__init__c                 C   s   | j jS r_   r   r   r   r   r   r    get_input_embeddings.  s   zMraModel.get_input_embeddingsc                 C   s   || j _d S r_   rX  )r   r   r   r   r    set_input_embeddings1  s   zMraModel.set_input_embeddingsNr   r   r   r   r   r9  r:  r	  c                 K   sj  |d ur|n| j j}|d ur|n| j j}|d ur |d ur td|d ur/| || | }	n|d ur<| d d }	ntd|	\}
}|d urK|jn|j}|d u r[tj|
|f|d}|d u rt	| j
drz| j
jd d d |f }||
|}|}n	tj|	tj|d}| ||	}| j
||||d}| j||||d}|d	 }|s|f|d
d   S t||j|j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer)   z5You have to specify either input_ids or inputs_embedsr   r   r;   )r   r   r   r   )r   r9  r:  r   r   )r6  r   
attentionscross_attentions)r   r9  use_return_dictr,   %warn_if_padding_and_no_attention_maskr+   r=   r?   r   r   r   r   r   rx   rA   get_extended_attention_maskrV  r
   r   r\  r]  )r   r   r   r   r   r   r9  r:  kwargsr   rE   r   r=   r   r   extended_attention_maskembedding_outputencoder_outputsrH  r   r   r    rb   4  sV   
zMraModel.forward)NNNNNNN)rq   rr   rs   r   rZ  r[  r   r?   r  boolr8  r
   rb   r   r   r   r   r    rU  "  s:    

rU  c                       s   e Zd ZdddZ fddZdd Zdd	 Ze	
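
# Illustrative sketch, not part of the original module: building a small, randomly
# initialized MraModel straight from a config. This assumes the MraConfig defaults
# (e.g. vocab_size and max_position_embeddings of 512) and a CPU run where the CUDA
# kernel is absent, in which case the attention falls back to zeros, which is still
# enough to exercise the shapes end to end. Loading trained weights would instead use
# `MraModel.from_pretrained(...)`.
def _demo_mra_model_shapes():
    config = MraConfig(hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)
    model = MraModel(config)
    input_ids = torch.randint(0, config.vocab_size, (1, 512))
    outputs = model(input_ids)
    assert outputs.last_hidden_state.shape == (1, 512, config.hidden_size)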
@auto_docstring
class MraForMaskedLM(MraPreTrainedModel):
    _tied_weights_keys = {
        "cls.predictions.decoder.bias": "cls.predictions.bias",
        "cls.predictions.decoder.weight": "mra.embeddings.word_embeddings.weight",
    }

    def __init__(self, config):
        super().__init__(config)

        self.mra = MraModel(config)
        self.cls = MraOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | MaskedLMOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
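
# Illustrative sketch, not part of the original module: the MLM loss only counts
# positions whose label is not -100, matching the `labels` docstring above
# (CrossEntropyLoss ignores index -100 by default).
def _demo_mlm_label_masking():
    logits = torch.randn(1, 4, 10)  # (batch, seq_len, vocab)
    labels = torch.tensor([[3, -100, -100, 7]])  # only positions 0 and 3 contribute
    loss = CrossEntropyLoss()(logits.view(-1, 10), labels.view(-1))
    assert loss.isfinite()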
class MraClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

        self.config = config

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = ACT2FN[self.config.hidden_act](x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
@auto_docstring(
    custom_intro="""
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    """
)
class MraForSequenceClassification(MraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.mra = MraModel(config)
        self.classifier = MraClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | SequenceClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring
class MraForMultipleChoice(MraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.mra = MraModel(config)
        self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | MultipleChoiceModelOutput:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_state = outputs[0]  # (bs * num_choices, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs * num_choices, dim)
        logits = self.classifier(pooled_output)

        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring
class MraForTokenClassification(MraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mra = MraModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MraForQuestionAnswering(MraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.mra = MraModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        start_positions: torch.Tensor | None = None,
        end_positions: torch.Tensor | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | QuestionAnsweringModelOutput:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
__all__ = [
    "MraForMaskedLM",
    "MraForMultipleChoice",
    "MraForQuestionAnswering",
    "MraForSequenceClassification",
    "MraForTokenClassification",
    "MraLayer",
    "MraModel",
    "MraPreTrainedModel",
]