o
    ei                     @   sx  d dl Z d dlmZmZ d dlZd dlmZ d dlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 e,7e8Z9G dd deZ:G dd dej;Z<G dd dej;Z=G dd de5Z>edd<d d!Z?ee?G d"d# d#ej;Z@G d$d% d%eZAe+G d&d' d'e&ZBe+G d(d) d)eBZCG d*d+ d+ej;ZDe+d,d-G d.d/ d/eBZEe+d0d-G d1d2 d2eBZFe+d3d-G d4d5 d5eBZGe+G d6d7 d7eBZHe+d8d-G d9d: d:eBZIg d;ZJdS )=    N)LiteralOptional)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)PreTrainedConfiglayer_type_validation)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSRopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )eager_attention_forward)Gemma3RotaryEmbeddingrotate_halfc                F       s  e Zd ZdZdZdgZdddZ fddZ			
																																dJdedB dedB dedB d edB d!edB d"e	dB d#edB d$e
dB d%e
dB d&e
dB d'edB d(edB d)edB d*edB d+edB d,edB d-edB d.e
dB d/ee	 dB d0eed1 ef dB d2edB d3e
dB d4edB d5e
dB d6edB d7ed8 d9e
dB d:edB d;e	dB d<edB d=edB d>edB d?edB d@edB fD fdAdBZdKdCdDZ fdEdFZedGdH ZejdIdH Z  ZS )LModernBertConfigaO  
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ModernBERT-base.
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ModernBertModel`]
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer decoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        local_attention (`int`, *optional*, defaults to 128):
            The window size for local attention.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
            CLS token doesn't attend to all tokens on long sequences.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios. This argument is deprecated and will be removed in a future version.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
modernbertpast_key_valuesg     Ag     @)globallocalc                    s0   |dkr|d urt d d }t || d S )Nreference_compilezThe `reference_compile` argument is deprecated and will be removed in `transformers v5.2.0`Use `torch.compile()` directly on the model instead.)loggerwarning_oncesuper__setattr__)selfnamevalue	__class__ o/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/modernbert/modular_modernbert.pyr0      s   zModernBertConfig.__setattr__             gelu    {Gz?       @h㈵>Fk  j  i          N   Tcls
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshidden_activationmax_position_embeddingsinitializer_rangeinitializer_cutoff_factornorm_eps	norm_biaspad_token_ideos_token_idbos_token_idcls_token_idsep_token_idattention_biasattention_dropoutlayer_typesrope_parametersfull_attentionsliding_attentionlocal_attentionembedding_dropoutmlp_biasmlp_dropoutdecoder_biasclassifier_poolingrG   meanclassifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionsparse_pred_ignore_indexr,   tie_word_embeddingsc#           $         s>  | _ | _| _| _| _|" _| _| _| _| _	| _
| _| _|	 _|
 _| _| _| _| _| _| _| _| _| _| _| _| _| _| _| _|  _|! _ jdvrnt d j d| _!|#"dd _# j!d u r fddt$ j
D  _!t% j! j
 | _&t' j(di |# d S )	Nrf   zQInvalid value for `classifier_pooling`, should be either "cls" or "mean", but is .global_attn_every_n_layersr   c                    s"   g | ]}t | j rd ndqS r_   r^   )boolrp   ).0ir1   r6   r7   
<listcomp>   s    z-ModernBertConfig.__init__.<locals>.<listcomp>r6   ))rT   rV   rU   rW   rX   rn   rI   rO   rJ   rK   rL   rM   rP   rQ   rR   rS   rY   rZ   rN   r`   ra   rb   rc   rd   re   rh   ri   rj   rk   rl   rm   r,   
ValueErrorr[   getrp   ranger   r\   r/   __init__)$r1   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r`   ra   rb   rc   rd   re   rh   ri   rj   rk   rl   rm   r,   rn   kwargsr4   ru   r7   rz      sZ   &


zModernBertConfig.__init__c                 K   s   | dd }ddiddid}| jd ur| jn|| _|d ur.| jd | | jd | | jdd u r=ddi| jd< | jd d| d| jd	  | jdd u r\ddi| jd< | jd d| d
| jd  |   | j|d |S )Nrope_scaling	rope_typedefaultrq   r^   r_   
rope_thetaglobal_rope_thetar*   local_rope_thetar+   )ignore_keys)popr\   updaterx   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)r1   ignore_keys_at_rope_validationr{   r|   default_rope_paramsr6   r6   r7   convert_rope_params_to_dict   s*   

z,ModernBertConfig.convert_rope_params_to_dictc                    s   t   }|dd  |S )Nr,   )r/   to_dictr   )r1   outputr4   r6   r7   r     s   
zModernBertConfig.to_dictc                 C   s
   | j d S )zKHalf-window size: `local_attention` is the total window, so we divide by 2.r#   r`   ru   r6   r6   r7   sliding_window$  s   
zModernBertConfig.sliding_windowc                 C   s   |d | _ dS )z<Set sliding_window by updating local_attention to 2 * value.r#   Nr   r1   r3   r6   r6   r7   r   )  s   )"r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   FrB   rC   rD   rD   rC   FrE   NNrF   rE   FrE   TrG   rE   Fr=   FFrH   NTN)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencer   r0   intstrfloatrr   listdictr   r   rz   r   r   propertyr   setter__classcell__r6   r6   r4   r7   r'   2   s    e
	

 !"#
[
r'   c                       sN   e Zd ZdZdef fddZ	ddejdB dejdB dejfd	d
Z	  Z
S )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                    sR   t    || _tj|j|j|jd| _tj	|j|j
|jd| _t|j| _d S )N)padding_idxepsbias)r/   rz   r   r   	EmbeddingrI   rJ   rT   tok_embeddings	LayerNormrR   rS   normDropoutra   dropr1   r   r4   r6   r7   rz   4  s
   
zModernBertEmbeddings.__init__N	input_idsinputs_embedsreturnc                 C   s6   |d ur|  | |}|S |  | | |}|S r   )r   r   r   )r1   r   r   hidden_statesr6   r6   r7   forward;  s
   zModernBertEmbeddings.forwardNN)r   r   r   r   r'   rz   torch
LongTensorTensorr   r   r6   r6   r4   r7   r   /  s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    r   c                    sf   t    || _tj|jt|jd |jd| _	t
|j | _t|j| _tj|j|j|jd| _d S )Nr#   r   )r/   rz   r   r   LinearrJ   r   rK   rb   Wir
   rN   actr   rc   r   Wor   r4   r6   r7   rz   L  s   
 zModernBertMLP.__init__r   r   c                 C   s2   |  |jddd\}}| | | || S )Nr#   dim)r   chunkr   r   r   )r1   r   inputgater6   r6   r7   r   T  s   zModernBertMLP.forward)
r   r   r   r   r'   rz   r   r   r   r   r6   r6   r4   r7   r   E  s    r   c                       sl   e Zd Zddef fddZe				ddedB ded dedB dedB d	e	d
e
f f
 fddZ  ZS )ModernBertRotaryEmbeddingNr   c                    s   t  || d S r   )r/   rz   )r1   r   devicer4   r6   r7   rz   Z  s   z"ModernBertRotaryEmbedding.__init__r   ztorch.deviceseq_len
layer_typer   ztorch.Tensorc                    s   t  | |||S r   )r/   compute_default_rope_parameters)r   r   r   r   r4   r6   r7   r   ]  s   z9ModernBertRotaryEmbedding.compute_default_rope_parametersr   NNNN)r   r   r   r'   rz   staticmethodr   r   r   tupler   r   r   r6   r6   r4   r7   r   Y  s$    
r   rotary_pos_emb   c                 C   sf   | j }||}||}|  | t|  |  }| | t| |  }||||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )dtype	unsqueezer   r&   to)qkcossinunsqueeze_dimoriginal_dtypeq_embedk_embedr6   r6   r7   apply_rotary_pos_embg  s   

r   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf dB d	ejdB d
e
e de	ejejdB f f
ddZ  ZS )ModernBertAttentiona  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nr   	layer_idxc                    s   t    || _|| _|j|j dkr td|j d|j d|j| _|j| _|j|j | _	t
j|jd| j	 |j |jd| _|j| dkrN|jd | _nd | _d	| _t
j|j|j|jd| _|jd
krnt
|j| _d S t
 | _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   r_   r   FrE   )r/   rz   r   r   rJ   rM   rw   rZ   rk   head_dimr   r   rY   Wqkvr[   r   	is_causalr   r   Identityout_dropr1   r   r   r4   r6   r7   rz     s&   
*zModernBertAttention.__init__r   position_embeddingsattention_maskr{   r   c                 K   s  |j d d }| |}|jg |dd| jR  }|jdd\}}}	|dd}|dd}|	dd}	|\}
}t|||
|dd\}}t}| jj	dkrSt
| jj	 }|| |||	|f| jr`| jnd	| jd
 | j| jd|\}}|jg |dR   }| | |}||fS )Nr   r   r   r   r#   )r   eagerrE         )dropoutscalingr   deterministic)shaper   viewr   unbind	transposer   r$   r   _attn_implementationr   trainingrZ   r   rk   reshape
contiguousr   r   )r1   r   r   r   r{   input_shapeqkvquery_states
key_statesvalue_statesr   r   attention_interfaceattn_outputattn_weightsr6   r6   r7   r     s:   


zModernBertAttention.forwardr   r   )r   r   r   r   r'   r   rz   r   r   r   r   r   r   r   r6   r6   r4   r7   r     s     	 r   c                       sd   e Zd ZddededB f fddZ		ddejdejdB dejdB d	ee	 d
ejf
ddZ
  ZS )ModernBertEncoderLayerNr   r   c                    s   t    || _|| _|dkrt | _ntj|j|j	|j
d| _t||d| _tj|j|j	|j
d| _t|| _|j| | _d S )Nr   r   )r   r   )r/   rz   r   r   r   r   	attn_normr   rJ   rR   rS   r   attnmlp_normr   mlpr[   attention_typer   r4   r6   r7   rz     s   

zModernBertEncoderLayer.__init__r   r   r   r{   r   c                 K   sB   | j | |f||d|\}}|| }|| | | }|S )N)r   r   )r   r   r   r   )r1   r   r   r   r{   r   _r6   r6   r7   r     s   
zModernBertEncoderLayer.forwardr   r   )r   r   r   r'   r   rz   r   r   r   r   r   r   r6   r6   r4   r7   r     s    r   c                   @   sX   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZeedZe dejfdd	Zd
S )ModernBertPreTrainedModelr   modelTr   r   )r   
attentionsmodulec                    s  | j j  d u r
d dtjdtf fdd}| j j| j jtd| j j  | j j| j j	d d}t
|tr?||j|d	  d S t
|trV||j|d
  ||j|d  d S t
|trm||j|d
  ||j|d  d S t
|tr|||j|d  d S t
|tr||j|d  d S t
|ttttfr||j|d  d S t
|tjrt|j |jd urt |j d S d S t
|t!r|j"D ]6}|j#}|j$| dkrt%|j$|  }||j |d\}}t&t'|| d| t&t'|| d| qd S d S )Nr   r  stdc                    sN   t j| jd|  |  | d t| tjr#| jd ur%t | j d S d S d S )NrE   )rg   r  ab)inittrunc_normal_weight
isinstancer   r   r   zeros_)r  r  cutoff_factorr6   r7   init_weight
  s   
z<ModernBertPreTrainedModel._init_weights.<locals>.init_weightr@   r   )inout	embedding	final_outr  r  r  r  r~   )r   	_inv_freq_original_inv_freq)(r   rQ   r   Moduler   rP   mathsqrtrL   rJ   r
  r   r   r   r   r   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   r  ones_r	  r   r  r   r[   r   r}   r   copy_getattr)r1   r  r  stdsr   rope_init_fncurr_inv_freqr   r6   r  r7   _init_weights  sZ   





	


z'ModernBertPreTrainedModel._init_weightsN)r   r   r   r'   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   no_gradr   r  r'  r6   r6   r6   r7   r     s   
 r   c                       s   e Zd Zdef fddZdd Zdd Zeee					dd	e
jdB d
e
jdB de
jdB de
jdB dee defddZ  ZS )ModernBertModelr   c                    sr   t     | _t | _t fddt jD | _	tj
 j j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r6   )r   )rs   r   r   r6   r7   rv   I  s    z,ModernBertModel.__init__.<locals>.<listcomp>r   r3  F)r/   rz   r   r   
embeddingsr   
ModuleListry   rL   layersr   rJ   rR   rS   
final_normr   
rotary_embgradient_checkpointing	post_initr   r4   r3  r7   rz   D  s   
zModernBertModel.__init__c                 C   s   | j jS r   r4  r   ru   r6   r6   r7   get_input_embeddingsP  s   z$ModernBertModel.get_input_embeddingsc                 C   s   || j _d S r   r;  r   r6   r6   r7   set_input_embeddingsS  s   z$ModernBertModel.set_input_embeddingsNr   r   position_idsr   r{   r   c                 K   s  |d u |d uA rt d|d ur|jd n|jd }|d ur!|jn|j}|d u r2tj||dd}| j||d}t| }	tsV| j	||d}
t
d
i |
td
i |
d}	i }| j	jD ]}| |||||< q\| jD ]}||f|	|j ||j d|}qk| |}t|d	S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   r   )r   r   r   r]   )r   r   )last_hidden_stater6   )rw   r   r   r   aranger   r4  r
  r   r   r   r   r[   r8  r6  r   r7  r   )r1   r   r   r>  r   r{   r   r   r   attention_mask_mappingmask_kwargsr   r   encoder_layerr6   r6   r7   r   V  s:   


zModernBertModel.forwardr   )r   r   r   r'   rz   r<  r=  r!   r"   r   r   r   r   r   r   r   r   r   r6   r6   r4   r7   r2  B  s0    r2  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )r  r   c                    sN   t    || _t|j|j|j| _t|j	 | _
tj|j|j|jd| _d S )Nr   )r/   rz   r   r   r   rJ   ri   r  r
   rj   r   r   rR   rS   r   r   r4   r6   r7   rz     s
   
z!ModernBertPredictionHead.__init__r   r   c                 C   s   |  | | |S r   )r   r   r  )r1   r   r6   r6   r7   r     s   z ModernBertPredictionHead.forward)	r   r   r   r'   rz   r   r   r   r   r6   r6   r4   r7   r    s    r  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                       s   e Zd ZddiZdef fddZdd Zdejfd	d
Z	e
e					ddejdB dejdB dejdB dejdB dejdB dee deej eB fddZ  ZS )r  zdecoder.weightz&model.embeddings.tok_embeddings.weightr   c                    s^   t  | || _t|| _t|| _tj|j	|j
|jd| _| jj| _| jj| _|   d S )Nr   )r/   rz   r   r2  r  r  headr   r   rJ   rI   rd   r  rl   rm   r:  r   r4   r6   r7   rz     s   



zModernBertForMaskedLM.__init__c                 C   s   | j S r   r  ru   r6   r6   r7   get_output_embeddings  s   z+ModernBertForMaskedLM.get_output_embeddingsnew_embeddingsc                 C   s
   || _ d S r   rG  )r1   rI  r6   r6   r7   set_output_embeddings  s   
z+ModernBertForMaskedLM.set_output_embeddingsNr   r   r>  r   labelsr{   r   c                 K   s   | j d||||d|}|d }| jr3|d ur3|d}||jd d}|| jk}	||	 }||	 }| | |}
d }|d urO| j|
|fd| jj	i|}t
||
|j|jdS )Nr   r   r>  r   r   r   rI   losslogitsr   r  r6   )r  rl   r   r   rm   r  rF  loss_functionr   rI   r   r   r  )r1   r   r   r>  r   rK  r{   outputsr@  mask_tokensrO  rN  r6   r6   r7   r     s2   

zModernBertForMaskedLM.forwardNNNNN)r   r   r   _tied_weights_keysr'   rz   rH  r   r   rJ  r    r   r   r   r   r   r   r   r   r   r   r6   r6   r4   r7   r    s6    r  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                          e Zd Zdef fddZee					ddejdB dej	dB dej	dB dej	dB d	ej	dB d
e
e deej	 eB fddZ  ZS )r  r   c                    s\   t  | |j| _|| _t|| _t|| _tj	
|j| _t	|j|j| _|   d S r   )r/   rz   
num_labelsr   r2  r  r  rF  r   r   r   rh   r   r   rJ   r   r:  r   r4   r6   r7   rz     s   

z,ModernBertForSequenceClassification.__init__Nr   r   r>  r   rK  r{   r   c                 K   s  | j d||||d|}|d }| jjdkr |dddf }n+| jjdkrK|du r9tj|jdd |jtjd}||d j	d	d
|j	d	dd }| 
|}	| |	}	| |	}
d}|dur| jjdu r| jd	krpd| j_n| jd	kr|jtjks|jtjkrd| j_nd| j_| jjdkrt }| jd	kr||
 | }n+||
|}n%| jjdkrt }||
d| j|d}n| jjdkrt }||
|}t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rL  r   rG   Nrg   r#   )r   r   r   r   r   Tr   keepdim
regressionsingle_label_classificationmulti_label_classificationrM  r6   )r  r   re   r   onesr   r   rr   r   sumrF  r   r   problem_typerV  r   longr   r   squeezer   r   r   r   r   r  )r1   r   r   r>  r   rK  r{   rQ  r@  pooled_outputrO  rN  loss_fctr6   r6   r7   r     s^   




"


z+ModernBertForSequenceClassification.forwardrS  )r   r   r   r'   rz   r    r   r   r   r   r   r   r   r   r   r   r6   r6   r4   r7   r    s0    r  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                       rU  )r  r   c                    V   t  | |j| _t|| _t|| _tj	|j
| _t|j|j| _|   d S r   r/   rz   rV  r2  r  r  rF  r   r   r   rh   r   r   rJ   r   r:  r   r4   r6   r7   rz   <  s   

z)ModernBertForTokenClassification.__init__Nr   r   r>  r   rK  r{   r   c                 K   s   | j d||||d|}|d }| |}| |}| |}	d}
|dur6t }||	d| j|d}
t|
|	|j|j	dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        rL  r   Nr   rM  r6   )
r  rF  r   r   r   r   rV  r   r   r  )r1   r   r   r>  r   rK  r{   rQ  r@  rO  rN  rb  r6   r6   r7   r   H  s,   


z(ModernBertForTokenClassification.forwardrS  )r   r   r   r'   rz   r    r   r   r   r   r   r   r   r   r   r   r6   r6   r4   r7   r  6  0    r  c                       s   e Zd Zdef fddZee					ddejdB dejdB dejdB dejdB d	ejdB d
e	e
 deej eB fddZ  ZS )r  r   c                    rc  r   rd  r   r4   r6   r7   rz   s  s   

z'ModernBertForQuestionAnswering.__init__Nr   r   r>  start_positionsend_positionsr{   r   c                 K   s   | j |f||d|}|d }| |}| |}| |}	|	jddd\}
}|
d }
|d }d }|d urL|d urL| j|
|||fi |}t||
||j	|j
dS )N)r   r>  r   r   r   r   )rN  start_logits
end_logitsr   r  )r  rF  r   r   splitr`  r   rP  r   r   r  )r1   r   r   r>  rf  rg  r{   rQ  r@  rO  rh  ri  rN  r6   r6   r7   r   ~  s2   


z&ModernBertForQuestionAnswering.forwardrS  )r   r   r   r'   rz   r    r   r   r   r   r   r   r   r   r   r6   r6   r4   r7   r  q  s0    r  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                       rU  )r  r   c                    sR   t  | || _t|| _t|| _tj	|j
| _t|jd| _|   d S )Nr   )r/   rz   r   r2  r  r  rF  r   r   r   rh   r   r   rJ   r   r:  r   r4   r6   r7   rz     s   

z$ModernBertForMultipleChoice.__init__Nr   r   r>  r   rK  r{   r   c                 K   s  |dur	|j d n|j d }|dur|d|dnd}|dur*|d|dnd}|dur9|d|dnd}|durL|d|d|dnd}| jd||||d|}|d }	| jjdkrtj|	j d |	jd}
|dur|j	dd	
|	j}n
tjdtj|	jd
}|	|
|f }	n| jjdkr|jddd}|	|d jdd	| }	| |	}| |}| |}|d|}d}|durt }|||}t|||j|jdS )a&  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        Nr   r   rL  r   rG   r?  r   )r   r   rg   TrW  rM  r6   )r   r   sizer  r   re   r   rA  r   argmaxr   tensorr_  r]  r   rF  r   r   r   r   r   r   r  )r1   r   r   r>  r   rK  r{   num_choicesrQ  r@  	indices_0cls_masknum_non_pad_tokensra  rO  reshaped_logitsrN  rb  r6   r6   r7   r     sP   



z#ModernBertForMultipleChoice.forwardrS  )r   r   r   r'   rz   r    r   r   r   r   r   r   r   r   r   r   r6   r6   r4   r7   r    re  r  )r'   r2  r   r  r  r  r  r  )r   )Kr  typingr   r   r   r   torch.nnr   r   r    r	   r  activationsr
   configuration_utilsr   r   integrationsr   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr    r!   utils.output_capturingr"   align.modeling_alignr$   gemma3.modeling_gemma3r%   r&   
get_loggerr   r-   r'   r  r   r   r   r   r   r   r   r2  r  r  r  r  r  r  __all__r6   r6   r6   r7   <module>   sn    
 ~Q MEBV64U