import copy
import math
from contextlib import nullcontext
from typing import Literal, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...configuration_utils import PretrainedConfig
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_flash_attn_2_available, logging
from ...utils.import_utils import is_triton_available
from ..gemma.modeling_gemma import GemmaRotaryEmbedding, apply_rotary_pos_emb

if is_flash_attn_2_available():
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.layers.rotary import RotaryEmbedding
    from flash_attn.ops.triton.rotary import apply_rotary
else:
    RotaryEmbedding = object

logger = logging.get_logger(__name__)


class ModernBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ModernBERT-base.
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ModernBertModel`]
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer decoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        global_rope_theta (`float`, *optional*, defaults to 160000.0):
            The base period of the global RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        global_attn_every_n_layers (`int`, *optional*, defaults to 3):
            The number of layers between global attention layers.
        local_attention (`int`, *optional*, defaults to 128):
            The window size for local attention.
        local_rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the local RoPE embeddings.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
            CLS token doesn't attend to all tokens on long sequences.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios.
        repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
            When True, ModernBertForMaskedLM keeps track of the logits' gradient when repadding for output. This only
            applies when using Flash Attention 2 with passed labels. Otherwise output logits always have a gradient.

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
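
    A variant with a denser global-attention schedule (illustrative values only, not a released
    checkpoint configuration):

    ```python
    >>> configuration = ModernBertConfig(global_attn_every_n_layers=2, local_attention=256)
    >>> configuration.classifier_pooling
    'cls'
    ```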
    """

    model_type = "modernbert"
    attribute_map = {"rope_theta": "global_rope_theta"}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50368,
        hidden_size=768,
        intermediate_size=1152,
        num_hidden_layers=22,
        num_attention_heads=12,
        hidden_activation="gelu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        initializer_cutoff_factor=2.0,
        norm_eps=1e-5,
        norm_bias=False,
        pad_token_id=50283,
        eos_token_id=50282,
        bos_token_id=50281,
        cls_token_id=50281,
        sep_token_id=50282,
        global_rope_theta=160000.0,
        attention_bias=False,
        attention_dropout=0.0,
        global_attn_every_n_layers=3,
        local_attention=128,
        local_rope_theta=10000.0,
        embedding_dropout=0.0,
        mlp_bias=False,
        mlp_dropout=0.0,
        decoder_bias=True,
        classifier_pooling: Literal["cls", "mean"] = "cls",
        classifier_dropout=0.0,
        classifier_bias=False,
        classifier_activation="gelu",
        deterministic_flash_attn=False,
        sparse_prediction=False,
        sparse_pred_ignore_index=-100,
        reference_compile=None,
        repad_logits_with_grad=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.initializer_cutoff_factor = initializer_cutoff_factor
        self.norm_eps = norm_eps
        self.norm_bias = norm_bias
        self.global_rope_theta = global_rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.local_attention = local_attention
        self.local_rope_theta = local_rope_theta
        self.embedding_dropout = embedding_dropout
        self.mlp_bias = mlp_bias
        self.mlp_dropout = mlp_dropout
        self.decoder_bias = decoder_bias
        self.classifier_pooling = classifier_pooling
        self.classifier_dropout = classifier_dropout
        self.classifier_bias = classifier_bias
        self.classifier_activation = classifier_activation
        self.deterministic_flash_attn = deterministic_flash_attn
        self.sparse_prediction = sparse_prediction
        self.sparse_pred_ignore_index = sparse_pred_ignore_index
        self.reference_compile = reference_compile
        self.repad_logits_with_grad = repad_logits_with_grad

        if self.classifier_pooling not in ["cls", "mean"]:
            raise ValueError(
                f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.'
            )

    def to_dict(self):
        output = super().to_dict()
        output.pop("reference_compile", None)
        return output


def _unpad_modernbert_input(
    inputs: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
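
    Example (a minimal sketch with a hand-made 2x3 mask; the tensor values are arbitrary and
    chosen only to show the selection order):

        >>> inputs = torch.arange(6).reshape(2, 3)
        >>> attention_mask = torch.tensor([[1, 1, 0], [1, 1, 1]])
        >>> unpadded, indices, cu_seqlens, max_seqlen, _, _ = _unpad_modernbert_input(inputs, attention_mask)
        >>> unpadded.tolist(), cu_seqlens.tolist(), max_seqlen
        ([0, 1, 3, 4, 5], [0, 2, 5], 3)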
    dimdtypeF)as_tupler   )   r   r   N)sumtorchint32nonzeroflattenintmaxitemr   
functionalpadcumsumry   shapeview)rr   rs   rt   ru   seqlens_in_batchindicesmax_seqlen_in_batch
cu_seqlensunpadded_inputsbatchseqlenrestr   unpadded_position_idsunpadded_labelsrA   rA   re   _unpad_modernbert_input   s   r   r   r   r   c                 C   s   |   dkrtj|| | j| jd}| ||< |||}|S | j^}}tj|| g|R | j| jd}| ||< |j||g|R  }|S )aQ  
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
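
    Example (a minimal sketch continuing the `_unpad_modernbert_input` example above; pad
    positions are filled with zeros):

        >>> padded = _pad_modernbert_output(unpadded, indices, batch=2, seqlen=3)
        >>> padded.tolist()
        [[0, 1, 0], [3, 4, 5]]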
    """
    if inputs.dim() == 1:
        output = torch.zeros(batch * seqlen, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen)
    else:
        _, *rest = inputs.shape
        output = torch.zeros(batch * seqlen, *rest, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen, *rest)

    return padded_inputs


class ApplyRotaryEmbUnpad(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        qkv,
        cos,
        sin,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        # (total_nnz, 3, nheads, headdim)
        qkv = qkv.contiguous()
        total_nnz, _three, _nheads, headdim = qkv.shape
        # qkv must be contiguous so that reshaping to combine the (3, nheads) dimensions
        # yields a view of the same tensor, allowing the in-place rotary update below.
        qk = qkv[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            qk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=False,
            inplace=True,
        )

        ctx.save_for_backward(cos, sin, cu_seqlens)
        ctx.max_seqlen = max_seqlen
        return qkv

    @staticmethod
    def backward(ctx, do):
        cos, sin, cu_seqlens = ctx.saved_tensors
        do = do.contiguous()
        total_nnz, _three, _nheads, headdim = do.shape
        # Same contiguity requirement as in forward: the gradient of (q, k) is rotated
        # in place with the conjugate rotation.
        dqk = do[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            dqk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=False,
            inplace=True,
            conjugate=True,
        )

        return do, None, None, None, None


def apply_rotary_unpadded(
    qkv,
    cos,
    sin,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    """
    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)


class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
    """
    The rotary position embeddings applied directly to unpadded sequences.
    """

    def __init__(
        self,
        dim: int,
        base: float = 10000.0,
        max_seqlen: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        """
        super().__init__(dim=dim, base=base, device=device, interleaved=False)
        self.max_seqlen = max_seqlen

        if max_seqlen is not None and device is not None and dtype is not None:
            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        """
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

        qkv = apply_rotary_unpadded(
            qkv,
            self._cos_cached,
            self._sin_cached,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        return qkv

    def extra_repr(self) -> str:
        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"


class ModernBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = nn.Dropout(config.embedding_dropout)

    @torch.compile(dynamic=True)
    def compiled_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
        return self.drop(self.norm(self.tok_embeddings(input_ids)))

    def forward(
        self, input_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if inputs_embeds is not None:
            hidden_states = self.drop(self.norm(inputs_embeds))
        else:
            hidden_states = (
                self.compiled_embeddings(input_ids)
                if self.config.reference_compile
                else self.drop(self.norm(self.tok_embeddings(input_ids)))
            )
        return hidden_states


class ModernBertMLP(nn.Module):
    """Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    """
|j | _t|j| _tj|j|j|jd| _d S )Nr   r   )rB   rC   r   r   LinearrF   r   rG   rU   Wir   rP   actr   rV   r   Wor   rc   rA   re   rC     s   
 zModernBertMLP.__init__r   rv   c                 C   s2   |  |jddd\}}| | | || S )Nr   rw   ry   )r   chunkr   r   r   )ra   r   inputgaterA   rA   re   r     s   zModernBertMLP.forward)
rj   rk   rl   rm   r    rC   r~   r   r   rq   rA   rA   rc   re   r     s    r   c                   @   s   e Zd ZdS )ModernBertRotaryEmbeddingN)rj   rk   rl   rA   rA   rA   re   r     s    r   FmoduleModernBertAttentionr   sliding_window_maskrR   bsry   output_attentionsc	                 K   s   | j ||d\}
}|ddjdd\}}}t|||
|\}}| jd }t||dd| }|dkr6|}|| }tjj	|dtj
d	|j}tjj|| j| jd
}t||}|dd }||d|}|rn||fS |fS )Nrt   r
   r|   r   r         ࿩rw   rw   rw   rx   )ptraining)
rotary_emb	transposeunbindr   head_dimr~   matmulr   r   softmaxfloat32torz   dropoutrO   r   r   r   )r   r   rs   r   rt   rR   r   ry   r   _kwargsr   r   querykeyvaluescaleattn_weightsattn_outputrA   rA   re   eager_attention_forward   s    
r   r   target_dtypec	                 K   s   ||||d}|j tjtjfv}
|
r1|j }||}t|||| jr$| jnd| j|d}||}nt|||| jr;| jnd| j|d}|	||fS )Nr   r3   )r   r   	dropout_pdeterministicwindow_size)
rz   r~   float16bfloat16r   r   r   rO   r[   r   )r   r   r   r   r   rR   r   ry   r   r   convert_dtype
orig_dtypeattnrA   rA   re   flash_attention_forward%  s.   
r	  c                 K   s   | j ||d\}	}
|ddjdd\}}}t|||	|
\}}|dkr%|}tj|||| jr0| jnd|ddd }|	|d	|}|fS )
Nr   r
   r|   r   r   r   r3   )r  	attn_maskrw   )
r   r   r   r   Fscaled_dot_product_attentionr   rO   r   r   )r   r   rs   r   rt   rR   r   ry   r   r   r   r   r   r   r   rA   rA   re   sdpa_attention_forwardP  s"   r  )flash_attention_2eagersdpac                       sR   e Zd ZdZddedee f fddZ	ddej	d	ee
 d
ej	fddZ  ZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nr   layer_idc                    sb  t    || _|| _|j|j dkr td|j d|j d|j| _|j| _|j| _	|j|j | _
| j
| j	 | _tj|jd| j |jd| _||j dkri|jd |jd f| _|jd urb|jn|j}|j}n	d| _|j}|j}|jd	krt| j
||d
| _nt|}||_t|d| _tj|j|j|jd| _|jdkrt|jnt | _t  | _!d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   r   r   r   r  )ry   r   r   r   r3   )"rB   rC   r   r  rF   rI   r`   rO   r[   	num_headsr   all_head_sizer   r   rN   WqkvrQ   rR   rS   r#   rE   _attn_implementationr   r   copydeepcopyr"   r   r   r   Identityout_dropsetpruned_heads)ra   r   r  r"   rE   config_copyrc   rA   re   rC     s<   



 zModernBertAttention.__init__Fr   r   rv   c              	   K   s   |  |}|jd }| jjdkr|dd| j| j}n||dd| j| j}t| jj | f|| j| j	|| j
|d|}|d }| | |}|f|dd   S )Nr   r  rw   r
   )r   r   rR   r   ry   r   r|   )r  r   r   r  r   r  r   MODERNBERT_ATTENTION_FUNCTIONr   rR   r  r  r   )ra   r   r   rb   r   r   attn_outputsrA   rA   re   r     s(   



zModernBertAttention.forwardr   F)rj   rk   rl   rm   r    r   r   rC   r~   r   boolr   rq   rA   rA   rc   re   r   z  s    	*c                       s   e Zd Zddedee f fddZejdddej	d	ej	fd
dZ
						ddej	deej	 deej	 deej deej	 dee dee d	ej	fddZ  ZS )ModernBertEncoderLayerNr   r  c                    sp   t    || _|dkrt | _ntj|j|j|j	d| _t
||d| _tj|j|j|j	d| _t|| _d S )Nr   r   )r   r  )rB   rC   r   r   r  	attn_normr   rF   rL   rM   r   r  mlp_normr   mlp)ra   r   r  rc   rA   re   rC     s   
zModernBertEncoderLayer.__init__Tr   r   rv   c                 C      |  | |S r   )r&  r%  ra   r   rA   rA   re   compiled_mlp     z#ModernBertEncoderLayer.compiled_mlpFrs   r   rt   r   r   r   c           
   	   C   sf   | j | |||||||d}||d  }| jjr| |n| | |}	||	 }|f|dd   S )Nrs   r   rt   r   r   r   r   r|   )r  r$  r   r^   r)  r&  r%  )
ra   r   rs   r   rt   r   r   r   r   
mlp_outputrA   rA   re   r     s    
	zModernBertEncoderLayer.forwardr   )NNNNNF)rj   rk   rl   r    r   r   rC   r~   r   r   r)  r   r"  r   rq   rA   rA   rc   re   r#    s6    
	r#  c                       s|   e Zd ZU eed< dZdZddgZdZdZ	dZ
dejfdd	Z	dd
ee dedef fddZdd Z fddZ  ZS )ModernBertPreTrainedModelr   modelTr   r#  Fr   c                    sx  | j j  d u r
d dtjdtf fdd}| j j| j jtd| j j  | j j| j j	d d}t
|tr?||j|d	  d S t
|trV||j|d
  ||j|d  d S t
|trm||j|d
  ||j|d  d S t
|tr|||j|d  d S t
|tr||j|d  d S t
|ttttfr||j|d  d S t
|tjr|jjd |jd ur|jj   d S d S d S )Nr
   r   stdc                    sR   t jj| jd|  |  | d t| t jr%| jd ur't j| j d S d S d S )Nr3   )r:   r/  ab)r   inittrunc_normal_weight
isinstancer   r   zeros_)r   r/  cutoff_factorrA   re   init_weight  s   
z<ModernBertPreTrainedModel._init_weights.<locals>.init_weightr-   r   )inout	embedding	final_outr<  r:  r;  r=  g      ?)!r   rK   r   Moduler   rJ   mathsqrtrH   rF   r5  r   r   r   r   r   r   r  ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   r4  datafill_r   zero_)ra   r   r9  stdsrA   r7  re   _init_weights  sH   





	
z'ModernBertPreTrainedModel._init_weightsattn_implementationis_init_checkrv   c              	      sD   z|du r|   rdn|}W n ttfy   Y nw t j||dS )zR
        Checks and dispatches to hhe requested attention implementation.
        Nr  )rO  rP  )_flash_attn_2_can_dispatchr`   ImportErrorrB   %_check_and_adjust_attn_implementation)ra   rO  rP  rc   rA   re   rS  5  s   z?ModernBertPreTrainedModel._check_and_adjust_attn_implementationc                 C   s   | j jdu rd S t| dr!t| jdkr!| j jrtd d| j _| jjdkr4| j jr0td d| j _| jjdkrG| j jrCtd d| j _| j jd u rTt	 | j _d S d S )	NFhf_device_mapr|   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.mpsz|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.cpuz|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
r   r^   hasattrlenrT  loggerwarning_oncer   typer   r   rA   rA   re   _maybe_set_compileL  s.   z,ModernBertPreTrainedModel._maybe_set_compilec                    s<   t  j|i |}| jjdv r| jjrtd d| j_|S )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rB   resize_token_embeddingsr   r^   rY  rZ  )ra   argsrb   model_embedsrc   rA   re   r]  k  s   z1ModernBertPreTrainedModel.resize_token_embeddingsr!  )rj   rk   rl   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr   r>  rN  r   r   r"  rS  r\  r]  rq   rA   rA   rc   re   r-    s&   
 5r-  c                !       s  e Zd Zdef fddZdd Zdd Ze													dd	ee	j
 d
ee	j dee	j dee	j
 dee	j dee	j dee	j dee dee dee dee dee dee deee	jdf ef fddZd
e	jdede	jfddZ  ZS )ModernBertModelr   c                    sf   t     | _t | _t fddt jD | _	tj
 j j jd| _d| _|   d S )Nc                    s   g | ]}t  |qS rA   )r#  ).0r  r  rA   re   
<listcomp>  s    z,ModernBertModel.__init__.<locals>.<listcomp>r   F)rB   rC   r   r   
embeddingsr   
ModuleListrangerH   layersr   rF   rL   rM   
final_normgradient_checkpointing	post_initr   rc   r  re   rC   z  s   
zModernBertModel.__init__c                 C   s   | j jS r   rj  r   r   rA   rA   re   get_input_embeddings  s   z$ModernBertModel.get_input_embeddingsc                 C   s   || j _d S r   rq  )ra   r   rA   rA   re   set_input_embeddings  s   z$ModernBertModel.set_input_embeddingsNr   rs   r   rt   r   r   r   r   
batch_sizeseq_lenr   output_hidden_statesreturn_dictrv   .c              
      s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du |duA r*td|r.dnd}|r4dnd}|   |durD| ||  du rcdu rc|durZ|jdd \ n	|jdd \ |durj|jn|j}|du r|t	j
 f|t	jd}d}| j jdkrdu r|du r|du rd}|du rt	  t||d	^}}}}W d   n1 sw   Y  n#t||d	^}}}}n|du rt	j|d
d}| j||d\}}| j||d}| jD ])}|r||f }||||||||d}|d }|rt|dkr||d f }q|r||f }| |}|r8t| d}|dur7t fdd|D }n#| j jdkr[|dur[|d  dkr[|d}tdd |D }|sjtdd |||fD S t|||dS )  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        self._maybe_set_compile()

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None and seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        repad = False
        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                repad = True
                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask
                    )
        else:
            if position_ids is None:
                position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
            attention_mask, sliding_window_mask = self._update_attention_mask(
                attention_mask, output_attentions=output_attentions
            )

        hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                sliding_window_mask=sliding_window_mask,
                position_ids=position_ids,
                cu_seqlens=cu_seqlens,
                max_seqlen=max_seqlen,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions and len(layer_outputs) > 1:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.final_norm(hidden_states)

        if repad:
            hidden_states = _pad_modernbert_output(
                inputs=hidden_states, indices=indices, batch=batch_size, seqlen=seq_len
            )
            if all_hidden_states is not None:
                all_hidden_states = tuple(
                    _pad_modernbert_output(inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len)
                    for hs in all_hidden_states
                )

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_attention_mask(self, attention_mask: torch.Tensor, output_attentions: bool) -> torch.Tensor:
        if output_attentions:
            if self.config._attn_implementation == "sdpa":
                logger.warning_once(
                    "Outputting attentions is only supported with the 'eager' attention implementation, "
                    'not with "sdpa". Falling back to `attn_implementation="eager"`.'
                )
                self.config._attn_implementation = "eager"
            elif self.config._attn_implementation != "eager":
                logger.warning_once(
                    "Outputting attentions is only supported with the eager attention implementation, "
                    f'not with {self.config._attn_implementation}. Consider setting `attn_implementation="eager"`.'
                    " Setting `output_attentions=False`."
                )

        global_attention_mask = _prepare_4d_attention_mask(attention_mask, self.dtype)

        # Create position indices
        rows = torch.arange(global_attention_mask.shape[2]).unsqueeze(0)
        # Calculate distance between positions
        distance = torch.abs(rows - rows.T)

        # Create sliding window mask (1 = attend, 0 = don't attend)
        window_mask = (
            (distance <= self.config.local_attention // 2).unsqueeze(0).unsqueeze(0).to(attention_mask.device)
        )
        # Combine with existing mask
        sliding_window_mask = global_attention_mask.masked_fill(window_mask.logical_not(), torch.finfo(self.dtype).min)

        return global_attention_mask, sliding_window_mask


class ModernBertPredictionHead(nn.Module):
    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias)
        self.act = ACT2FN[config.classifier_activation]
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(self.act(self.dense(hidden_states)))


@auto_docstring(
    custom_intro="""
    )custom_introc                "       s   e Zd ZdgZdef fddZdd Zdejfdd	Z	e
jd
dde
jde
jfddZe														d!dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee dee dee dee dee dee deee
j ef fdd Z  ZS )"rC  zdecoder.weightr   c                    s^   t  | || _t|| _t|| _tj|j	|j
|jd| _| jj| _| jj| _|   d S )Nr   )rB   rC   r   rg  r.  rA  headr   r   rF   rD   rW   rD  r\   r]   rp  r   rc   rA   re   rC   C  s   



zModernBertForMaskedLM.__init__c                 C   s   | j S r   rD  r   rA   rA   re   get_output_embeddingsP  s   z+ModernBertForMaskedLM.get_output_embeddingsnew_embeddingsc                 C   s
   || _ d S r   r  )ra   r  rA   rA   re   set_output_embeddingsS  s   
z+ModernBertForMaskedLM.set_output_embeddingsTr   ri   rv   c                 C   r'  r   )rD  r  rh   rA   rA   re   compiled_headV  r*  z#ModernBertForMaskedLM.compiled_headNr   rs   r   rt   r   ru   r   r   r   rt  ru  r   rv  rw  c                 K   s  |dur|n| j j}|   | j jdkr|du r|du r|	du r|
du r?|du r?|dur6|jdd \}
}n	|jdd \}
}|durF|jn|j}|du rXtj|
|f|tjd}|du rt	  t
||||d\}}}}	}}W d   n1 syw   Y  nt
||||d\}}}}	}}| j||||||||	|
||||d}|d }| jr|dur|d}||jd d}|| jk}|| }|| }| j jr| |n| | |}d}|dur| j||fd	| j ji|}| j jdkrO| j js|du rt nt	  t|||
|d
}W d   n	1 sw   Y  t|dddurOg }|jD ]"}| dkr=|jd dkr=|d}|t|||
|d
 q't||_|sa|f}|dur_|f| S |S t|||j|jdS )rx  Nr  r   r   )rr   rs   rt   ru   r   rs   r   rt   r   r   r   r   rt  ru  r   rv  rw  r   rw   rD   rz  r   r
   r|   losslogitsr   r  ) r   r  r\  r  r   r   r~   r  r"  r  r   r.  r\   r   r]   r^   r  rD  r  loss_functionrD   r_   r   r   getattrr   ry   squeezeappendr   r   r  )ra   r   rs   r   rt   r   ru   r   r   r   rt  ru  r   rv  rw  rb   r   outputsr  mask_tokensr  r  padded_hidden_statesr|  ri   rA   rA   re   r   Z  s   #


 


zModernBertForMaskedLM.forwardNNNNNNNNNNNNNN)rj   rk   rl   _tied_weights_keysr    rC   r  r   r   r  r~   r   r   r  r   r   r   r   r"  r   r   r   r   rq   rA   rA   rc   re   rC  ;  sj    
	
rC  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                "          e Zd Zdef fddZe														ddeej deej	 deej	 deej	 d	eej	 d
eej	 deej	 deej	 dee
 dee
 dee
 dee dee dee deeej	 ef fddZ  ZS )rE  r   c                    s\   t  | |j| _|| _t|| _t|| _tj	
|j| _t	|j|j| _|   d S r   )rB   rC   
num_labelsr   rg  r.  rA  r  r~   r   r   rX   r   r   rF   rI  rp  r   rc   rA   re   rC     s   

z,ModernBertForSequenceClassification.__init__Nr   rs   r   rt   r   ru   r   r   r   rt  ru  r   rv  rw  rv   c                 K   s\  |dur|n| j j}|   |dur| || |
du r7|du r7|dur.|jdd \}
}n	|jdd \}
}|dur>|jn|j}|du rPtj|
|f|tjd}| j	||||||||	|
||||d}|d }| j j
dkru|dddf }n| j j
dkr||d jd	d
|jd	dd }| |}| |}| |}d}|dur| j jdu r| jd	krd| j _n| jd	kr|jtjks|jtjkrd| j _nd| j _| j jdkrt }| jd	kr|| | }n-|||}n'| j jdkrt }||d| j|d}n| j jdkrt }|||}|s$|f}|dur"|f| S |S t|||j|jdS )aB  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None and seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            last_hidden_state = last_hidden_state[:, 0]
        elif self.config.classifier_pooling == "mean":
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    """
)
class ModernBertForTokenClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class ModernBertForQuestionAnswering(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    """
)
class ModernBertForMultipleChoice(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            indices_0 = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
            if attention_mask is not None:
                cls_mask = attention_mask.argmax(dim=1).to(last_hidden_state.device)
            else:
                cls_mask = torch.tensor(0, dtype=torch.long, device=last_hidden_state.device)
            last_hidden_state = last_hidden_state[indices_0, cls_mask]
        elif self.config.classifier_pooling == "mean":
            num_non_pad_tokens = attention_mask.sum(dim=1, keepdim=True)
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / num_non_pad_tokens

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ModernBertConfig",
    "ModernBertModel",
    "ModernBertPreTrainedModel",
    "ModernBertForMaskedLM",
    "ModernBertForSequenceClassification",
    "ModernBertForTokenClassification",
    "ModernBertForQuestionAnswering",
    "ModernBertForMultipleChoice",
]