o
    ei                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 G dd dej2Z3G dd dej2Z4G dd dej2Z5	dDdej2dej6dej6dej6d ej6dB d!e7d"e7fd#d$Z8d%d& Z9ed'dEd(d)Z:ee:G d*d+ d+ej2Z;G d,d- d-eZ<e)G d.d/ d/e$Z=e)G d0d1 d1e=Z>G d2d3 d3ej2Z?e)d4d5G d6d7 d7e=Z@e)d8d5G d9d: d:e=ZAe)d;d5G d<d= d=e=ZBe)G d>d? d?e=ZCe)d@d5G dAdB dBe=ZDg dCZEdS )F    N)Callable)Optional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemaybe_autocastmerge_with_config_defaults)capture_outputs   )ModernBertConfigc                       sN   e Zd ZdZdef fddZ	ddejdB dejdB dejfd	d
Z	  Z
S )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                    sR   t    || _tj|j|j|jd| _tj	|j|j
|jd| _t|j| _d S )N)padding_idxepsbias)super__init__r$   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdropselfr$   	__class__ p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/modernbert/modeling_modernbert.pyr*   9   s
   
zModernBertEmbeddings.__init__N	input_idsinputs_embedsreturnc                 C   s6   |d ur|  | |}|S |  | | |}|S N)r6   r3   r/   )r8   r=   r>   hidden_statesr;   r;   r<   forward@   s
   zModernBertEmbeddings.forwardNN)__name__
__module____qualname____doc__r"   r*   torch
LongTensorTensorrB   __classcell__r;   r;   r9   r<   r#   4   s    r#   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    r$   c                    sf   t    || _tj|jt|jd |jd| _	t
|j | _t|j| _tj|j|j|jd| _d S )N   r(   )r)   r*   r$   r   Linearr-   intintermediate_sizemlp_biasWir
   hidden_activationactr4   mlp_dropoutr6   Wor7   r9   r;   r<   r*   Q   s   
 zModernBertMLP.__init__rA   r?   c                 C   s2   |  |jddd\}}| | | || S )NrM   dim)rS   chunkrW   r6   rU   )r8   rA   inputgater;   r;   r<   rB   Y   s   zModernBertMLP.forward)
rD   rE   rF   rG   r"   r*   rH   rJ   rB   rK   r;   r;   r9   r<   rL   J   s    rL   c                       s   e Zd ZU ejed< ddef fddZe				ddedB de	d de
dB d	edB d
edef f
ddZe edddZ  ZS )ModernBertRotaryEmbeddinginv_freqNr$   c                    s   t    |j| _|j| _|| _tt|j| _i | _	| jD ]P}| jj
| }|d u r+q|d | j	|< | j}| j	| dkrCt| j	|  }|| j||d\}}| j| d|dd | j| d| dd t| | d| qd S )	N	rope_typedefault
layer_type	_inv_freqF)
persistent_original_inv_freq_attention_scaling)r)   r*   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr$   listsetlayer_typesr`   rope_parameterscompute_default_rope_parametersr   register_bufferclonesetattr)r8   r$   devicerc   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingr9   r;   r<   r*   a   s&   

z"ModernBertRotaryEmbedding.__init__rs   ztorch.deviceseq_lenrc   r?   ztorch.Tensorc                 C   s^   | j | d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   rM   dtypers   r|   )	rn   getattrr-   num_attention_headsrH   arangeint64tofloat)r$   rs   rx   rc   baserZ   attention_factorr_   r;   r;   r<   ro   x   s   &z9ModernBertRotaryEmbedding.compute_default_rope_parametersc                 C   s  t | | d}t | | d}|d d d d f  |jd dd|j}|d d d d d f  }t|jjtrE|jjdkrE|jjnd}t	|dd	) | |  
dd
}	tj|	|	fdd}
|
 | }|
 | }W d    n1 syw   Y  |j|jd|j|jdfS )Nrd   rg   r   rX   r!   mpscpuF)device_typeenabledrM   rY   r{   )r~   r   expandshaper   rs   
isinstancetypestrr   	transposerH   catcossinr|   )r8   xposition_idsrc   r_   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   r;   r;   r<   rB      s   .&z!ModernBertRotaryEmbedding.forwardr@   NNNN)rD   rE   rF   rH   rJ   __annotations__r"   r*   staticmethodr   rP   r   tupler   ro   no_gradr   rB   rK   r;   r;   r9   r<   r^   ^   s,   
 

#r^           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrM   r   rX   )rZ   r|   )ptrainingr!   )rH   matmulr   r   
functionalsoftmaxfloat32r   r|   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr;   r;   r<   eager_attention_forward   s   
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrX   rM   rY   )r   rH   r   )r   x1x2r;   r;   r<   rotate_half   s   r   rotary_pos_embc                 C   sf   | j }||}||}|  | t|  |  }| | t| |  }||||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r|   	unsqueezer   r   r   )qkr   r   unsqueeze_dimoriginal_dtypeq_embedk_embedr;   r;   r<   apply_rotary_pos_emb   s   

r   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf dB d	ejdB d
e
e de	ejejdB f f
ddZ  ZS )ModernBertAttentiona  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nr$   	layer_idxc                    s   t    || _|| _|j|j dkr td|j d|j d|j| _|j| _|j|j | _	t
j|jd| j	 |j |jd| _|j| dkrN|jd | _nd | _d	| _t
j|j|j|jd| _|jd
krnt
|j| _d S t
 | _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   rN   sliding_attentionr!   Fr   )r)   r*   r$   r   r-   r   
ValueErrorattention_dropoutdeterministic_flash_attnrz   r   rO   attention_biasWqkvrm   sliding_window	is_causalrW   r4   Identityout_dropr8   r$   r   r9   r;   r<   r*      s&   
*zModernBertAttention.__init__rA   position_embeddingsr   r   r?   c                 K   s  |j d d }| |}|jg |dd| jR  }|jdd\}}}	|dd}|dd}|	dd}	|\}
}t|||
|dd\}}t}| jj	dkrSt
| jj	 }|| |||	|f| jr`| jnd	| jd
 | j| jd|\}}|jg |dR   }| | |}||fS )NrX   r   rY   r!   rM   )r   eagerr         )r   r   r   deterministic)r   r   viewrz   unbindr   r   r   r$   _attn_implementationr   r   r   r   r   reshaper   r   rW   )r8   rA   r   r   r   input_shapeqkvquery_states
key_statesvalue_statesr   r   attention_interfacer   r   r;   r;   r<   rB     s:   


zModernBertAttention.forwardr@   rC   )rD   rE   rF   rG   r"   rP   r*   rH   rJ   r   r   r   rB   rK   r;   r;   r9   r<   r      s     	 r   c                       sd   e Zd ZddededB f fddZ		ddejdejdB dejdB d	ee	 d
ejf
ddZ
  ZS )ModernBertEncoderLayerNr$   r   c                    s   t    || _|| _|dkrt | _ntj|j|j	|j
d| _t||d| _tj|j|j	|j
d| _t|| _|j| | _d S )Nr   r&   )r$   r   )r)   r*   r$   r   r   r   	attn_normr0   r-   r1   r2   r   attnmlp_normrL   mlprm   attention_typer   r9   r;   r<   r*   :  s   

zModernBertEncoderLayer.__init__rA   r   r   r   r?   c                 K   sB   | j | |f||d|\}}|| }|| | | }|S )N)r   r   )r   r   r   r   )r8   rA   r   r   r   r   _r;   r;   r<   rB   G  s   
zModernBertEncoderLayer.forwardr@   rC   )rD   rE   rF   r"   rP   r*   rH   rJ   r   r   rB   rK   r;   r;   r9   r<   r   9  s    r   c                   @   sX   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZeedZe dejfdd	Zd
S )ModernBertPreTrainedModelr$   modelTr#   r   )rA   
attentionsr   c                    s  | j j  d u r
d dtjdtf fdd}| j j| j jtd| j j  | j j| j j	d d}t
|tr?||j|d	  d S t
|trV||j|d
  ||j|d  d S t
|trm||j|d
  ||j|d  d S t
|tr|||j|d  d S t
|tr||j|d  d S t
|ttttfr||j|d  d S t
|tjrt|j |jd urt |j d S d S t
|t!r|j"D ]6}|j#}|j$| dkrt%|j$|  }||j |d\}}t&t'|| d| t&t'|| d| qd S d S )Nr   r   stdc                    sN   t j| jd|  |  | d t| tjr#| jd ur%t | j d S d S d S )Nr   )meanr   ab)inittrunc_normal_weightr   r   rO   r(   zeros_)r   r   cutoff_factorr;   r<   init_weighto  s   
z<ModernBertPreTrainedModel._init_weights.<locals>.init_weightg       @r   )inout	embedding	final_outr   r   r   r   ra   rb   rd   rf   )(r$   initializer_cutoff_factorr   Moduler   initializer_rangemathsqrtnum_hidden_layersr-   r   r#   r/   rL   rS   rW   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr0   r   ones_r   r(   r   r^   rm   ro   r`   r   copy_r~   )r8   r   r   stdsrc   ru   rv   r   r;   r   r<   _init_weightsi  sZ   





	


z'ModernBertPreTrainedModel._init_weightsN)rD   rE   rF   r"   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrH   r   r   r   r  r;   r;   r;   r<   r   Y  s   
 r   c                       s   e Zd Zdef fddZdd Zdd Zeee					dd	e
jdB d
e
jdB de
jdB de
jdB dee defddZ  ZS )ModernBertModelr$   c                    sr   t     | _t | _t fddt jD | _	tj
 j j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r;   )r   ).0r   r$   r;   r<   
<listcomp>  s    z,ModernBertModel.__init__.<locals>.<listcomp>r&   r  F)r)   r*   r$   r#   
embeddingsr   
ModuleListranger   layersr0   r-   r1   r2   
final_normr^   
rotary_embgradient_checkpointing	post_initr7   r9   r  r<   r*     s   
zModernBertModel.__init__c                 C   s   | j jS r@   r  r/   r8   r;   r;   r<   get_input_embeddings  s   z$ModernBertModel.get_input_embeddingsc                 C   s   || j _d S r@   r  )r8   r   r;   r;   r<   set_input_embeddings  s   z$ModernBertModel.set_input_embeddingsNr=   r   r   r>   r   r?   c                 K   s  |d u |d uA rt d|d ur|jd n|jd }|d ur!|jn|j}|d u r2tj||dd}| j||d}t| }	tsV| j	||d}
t
d
i |
td
i |
d}	i }| j	jD ]}| |||||< q\| jD ]}||f|	|j ||j d|}qk| |}t|d	S )Nz:You must specify exactly one of input_ids or inputs_embedsr!   rs   r   )r=   r>   )r$   r>   r   )full_attentionr   )r   r   )last_hidden_stater;   )r   r   rs   rH   r   r   r  r   dictr$   r   r   rm   r  r  r   r  r   )r8   r=   r   r   r>   r   rx   rs   rA   attention_mask_mappingmask_kwargsr   rc   encoder_layerr;   r;   r<   rB     s:   


zModernBertModel.forwardr   )rD   rE   rF   r"   r*   r  r  r   r    r   rH   rI   rJ   r   r   r   rB   rK   r;   r;   r9   r<   r    s0    r  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )r   r$   c                    sN   t    || _t|j|j|j| _t|j	 | _
tj|j|j|jd| _d S )Nr&   )r)   r*   r$   r   rO   r-   classifier_biasr   r
   classifier_activationrU   r0   r1   r2   r3   r7   r9   r;   r<   r*     s
   
z!ModernBertPredictionHead.__init__rA   r?   c                 C   s   |  | | |S r@   )r3   rU   r   )r8   rA   r;   r;   r<   rB     s   z ModernBertPredictionHead.forward)	rD   rE   rF   r"   r*   rH   rJ   rB   rK   r;   r;   r9   r<   r     s    r   zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                       s   e Zd ZddiZdef fddZdd Zdejfd	d
Z	e
e					ddejdB dejdB dejdB dejdB dejdB dee deej eB fddZ  ZS )r   zdecoder.weightz&model.embeddings.tok_embeddings.weightr$   c                    s^   t  | || _t|| _t|| _tj|j	|j
|jd| _| jj| _| jj| _|   d S )NrN   )r)   r*   r$   r  r   r   headr   rO   r-   r,   decoder_biasr   sparse_predictionsparse_pred_ignore_indexr  r7   r9   r;   r<   r*     s   



zModernBertForMaskedLM.__init__c                 C   s   | j S r@   r   r  r;   r;   r<   get_output_embeddings  s   z+ModernBertForMaskedLM.get_output_embeddingsnew_embeddingsc                 C   s
   || _ d S r@   r-  )r8   r/  r;   r;   r<   set_output_embeddings  s   
z+ModernBertForMaskedLM.set_output_embeddingsNr=   r   r   r>   labelsr   r?   c                 K   s   | j d||||d|}|d }| jr3|d ur3|d}||jd d}|| jk}	||	 }||	 }| | |}
d }|d urO| j|
|fd| jj	i|}t
||
|j|jdS )Nr=   r   r   r>   r   rX   r,   losslogitsrA   r   r;   )r   r+  r   r   r,  r   r)  loss_functionr$   r,   r   rA   r   )r8   r=   r   r   r>   r1  r   outputsr!  mask_tokensr5  r4  r;   r;   r<   rB     s2   

zModernBertForMaskedLM.forwardNNNNN)rD   rE   rF   _tied_weights_keysr"   r*   r.  r   rO   r0  r   r   rH   rI   rJ   r   r   r   r   rB   rK   r;   r;   r9   r<   r     s6    r   z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                          e Zd Zdef fddZee					ddejdB dej	dB dej	dB dej	dB d	ej	dB d
e
e deej	 eB fddZ  ZS )r   r$   c                    s\   t  | |j| _|| _t|| _t|| _tj	
|j| _t	|j|j| _|   d S r@   )r)   r*   
num_labelsr$   r  r   r   r)  rH   r   r4   classifier_dropoutr6   rO   r-   r  r  r7   r9   r;   r<   r*   F  s   

z,ModernBertForSequenceClassification.__init__Nr=   r   r   r>   r1  r   r?   c                 K   s  | j d||||d|}|d }| jjdkr |dddf }n+| jjdkrK|du r9tj|jdd |jtjd}||d j	d	d
|j	d	dd }| 
|}	| |	}	| |	}
d}|dur| jjdu r| jd	krpd| j_n| jd	kr|jtjks|jtjkrd| j_nd| j_| jjdkrt }| jd	kr||
 | }n+||
|}n%| jjdkrt }||
d| j|d}n| jjdkrt }||
|}t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r2  r   clsNr   rM   r}   rX   r!   rY   TrZ   keepdim
regressionsingle_label_classificationmulti_label_classificationr3  r;   )r   r$   classifier_poolingrH   onesr   rs   boolr   sumr)  r6   r  problem_typer<  r|   longrP   r   squeezer   r   r   r   rA   r   )r8   r=   r   r   r>   r1  r   r7  r!  pooled_outputr5  r4  loss_fctr;   r;   r<   rB   S  s^   




"


z+ModernBertForSequenceClassification.forwardr9  )rD   rE   rF   r"   r*   r   r   rH   rI   rJ   r   r   r   r   rB   rK   r;   r;   r9   r<   r   @  s0    r   zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                       r;  )r   r$   c                    V   t  | |j| _t|| _t|| _tj	|j
| _t|j|j| _|   d S r@   r)   r*   r<  r  r   r   r)  rH   r   r4   r=  r6   rO   r-   r  r  r7   r9   r;   r<   r*     s   

z)ModernBertForTokenClassification.__init__Nr=   r   r   r>   r1  r   r?   c                 K   s   | j d||||d|}|d }| |}| |}| |}	d}
|dur6t }||	d| j|d}
t|
|	|j|j	dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r2  r   NrX   r3  r;   )
r   r)  r6   r  r   r   r<  r   rA   r   )r8   r=   r   r   r>   r1  r   r7  r!  r5  r4  rL  r;   r;   r<   rB     s,   


z(ModernBertForTokenClassification.forwardr9  )rD   rE   rF   r"   r*   r   r   rH   rI   rJ   r   r   r   r   rB   rK   r;   r;   r9   r<   r     0    r   c                       s   e Zd Zdef fddZee					ddejdB dejdB dejdB dejdB d	ejdB d
e	e
 deej eB fddZ  ZS )r  r$   c                    rM  r@   rN  r7   r9   r;   r<   r*     s   

z'ModernBertForQuestionAnswering.__init__Nr=   r   r   start_positionsend_positionsr   r?   c                 K   s   | j |f||d|}|d }| |}| |}| |}	|	jddd\}
}|
d }
|d }d }|d urL|d urL| j|
|||fi |}t||
||j	|j
dS )N)r   r   r   r!   rX   rY   )r4  start_logits
end_logitsrA   r   )r   r)  r6   r  splitrJ  r   r6  r   rA   r   )r8   r=   r   r   rP  rQ  r   r7  r!  r5  rR  rS  r4  r;   r;   r<   rB     s2   


z&ModernBertForQuestionAnswering.forwardr9  )rD   rE   rF   r"   r*   r   r   rH   rJ   r   r   r   r   rB   rK   r;   r;   r9   r<   r    s0    r  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                       r;  )r   r$   c                    sR   t  | || _t|| _t|| _tj	|j
| _t|jd| _|   d S )Nr!   )r)   r*   r$   r  r   r   r)  rH   r   r4   r=  r6   rO   r-   r  r  r7   r9   r;   r<   r*     s   

z$ModernBertForMultipleChoice.__init__Nr=   r   r   r>   r1  r   r?   c                 K   s  |dur	|j d n|j d }|dur|d|dnd}|dur*|d|dnd}|dur9|d|dnd}|durL|d|d|dnd}| jd||||d|}|d }	| jjdkrtj|	j d |	jd}
|dur|j	dd	
|	j}n
tjdtj|	jd
}|	|
|f }	n| jjdkr|jddd}|	|d jdd	| }	| |	}| |}| |}|d|}d}|durt }|||}t|||j|jdS )a&  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        Nr!   rX   r2  r   r>  r  rY   )r|   rs   r   Tr?  r3  r;   )r   r   sizer   r$   rD  rH   r   rs   argmaxr   tensorrI  rG  r   r)  r6   r  r   r   r   rA   r   )r8   r=   r   r   r>   r1  r   num_choicesr7  r!  	indices_0cls_masknum_non_pad_tokensrK  r5  reshaped_logitsr4  rL  r;   r;   r<   rB     sP   



z#ModernBertForMultipleChoice.forwardr9  )rD   rE   rF   r"   r*   r   r   rH   rI   rJ   r   r   r   r   rB   rK   r;   r;   r9   r<   r     rO  r   )r  r   r   r   r   r  r   )r   )r!   )Fr   collections.abcr   typingr   rH   r   torch.nnr   r   r    r	   r   activationsr
   integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   r   utils.output_capturingr    configuration_modernbertr"   r   r#   rL   r^   rJ   r   r   r   r   r   r   r   r  r   r   r   r   r  r   __all__r;   r;   r;   r<   <module>   s    X
Q MEBV64U