o
    	۷i                    @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	  m
Z d dlm	Z	 d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) e$ rd dl*m+Z+ d dl,m-Z- d dl.m/Z/ ne0Z-e%1e2Z3G dd dej4j5Z6		d]deej7 dee8 fddZ9G dd de-Z:G dd de	j;Z<G dd de	j;Z=G d d! d!e	j;Z>d"d# Z?d^d$d%Z@	&d_d'd(d)ej7d*ej7d+ej7d,eejA d-eBe8e8f d.e8d/e8d0eeC d1eeBej7ej7f eBej7 f fd2d3ZDejEfd'd(d)ej7d4e:dej7de8d-eBe8e8f d.e8d/e8d5ejFd1eBej7 fd6d7ZGd'd(d)ej7d*ej7d+ej7d,eejA d-eBe8e8f d.e8d/e8d1eBej7 fd8d9ZHeGeDeHd:ZIG d;d( d(e	j;ZJG d<d= d=eZKe#G d>d? d?e!ZL		d]d@ej7d*ej7d,eej7 dAeej7 d1eBej7ej7ej7e8eej7 eej7 f f
dBdCZMd@ej7dDej7dEe8dFe8d1ej7f
dGdHZNe#G dIdJ dJeLZOG dKdL dLe	j;ZPe#dMdNG dOdP dPeLZQe#dQdNG dRdS dSeLZRe#dTdNG dUdV dVeLZSe#G dWdX dXeLZTe#dYdNG dZd[ d[eLZUg d\ZVdS )`    N)nullcontext)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringis_flash_attn_2_availablelogging)is_triton_available   )ModernBertConfig) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryc                   @   s>   e Zd Ze		ddeej dee fddZedd Z	dS )	ApplyRotaryEmbUnpadN
cu_seqlens
max_seqlenc              
   C   sd   |  }|j\}}}}	|d d d df |d|	}
t|
||d||ddd | ||| || _|S )N   r   FT)seqlen_offsetsr    r!   interleavedinplace)
contiguousshapeviewr   save_for_backwardr!   )ctxqkvcossinr    r!   	total_nnz_three_nheadsheaddimqk r4   h/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/modernbert/modeling_modernbert.pyforward>   s    
zApplyRotaryEmbUnpad.forwardc           
      C   sn   | j \}}}| }|j\}}}}|d d d df |d|}	t|	||d|| jdddd	 |d d d d d d fS )Nr"   r#   r   FT)r$   r    r!   r%   r&   	conjugate)saved_tensorsr'   r(   r)   r   r!   )
r+   dor-   r.   r    r/   r0   r1   r2   dqkr4   r4   r5   backward]   s    zApplyRotaryEmbUnpad.backwardNN)
__name__
__module____qualname__staticmethodr   torchTensorintr6   r;   r4   r4   r4   r5   r   =   s    r   r    r!   c                 C   s   t | ||||S )a  
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r   apply)r,   r-   r.   r    r!   r4   r4   r5   apply_rotary_unpaddedt   s   rE   c                       s   e Zd ZdZ				ddededee deej deej	 f
 fd	d
Z
	ddejdejdee deejeejejf f fddZdefddZ  ZS )!ModernBertUnpaddedRotaryEmbeddingzP
    The rotary position embeddings applied directly to unpadded sequences.
         @Ndimbaser!   devicedtypec                    sR   t  j|||dd || _|dur#|dur%|dur'| j|||d dS dS dS dS )a  
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        F)rH   rI   rJ   r%   NrJ   rK   )super__init__r!   _update_cos_sin_cache)selfrH   rI   r!   rJ   rK   	__class__r4   r5   rN      s
   z*ModernBertUnpaddedRotaryEmbedding.__init__r,   r    returnc                 C   s6   |dur| j ||j|jd t|| j| j||d}|S )z
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        NrL   r    r!   )rO   rJ   rK   rE   _cos_cached_sin_cached)rP   r,   r    r!   r4   r4   r5   r6      s   z)ModernBertUnpaddedRotaryEmbedding.forwardc                 C   s   d| j  d| j d| j S )Nzdim=z, base=z, scale_base=)rH   rI   
scale_baserP   r4   r4   r5   
extra_repr   s   z,ModernBertUnpaddedRotaryEmbedding.extra_repr)rG   NNNN)r=   r>   r?   __doc__rC   floatr   rA   rJ   rK   rN   rB   r   tupler6   strrY   __classcell__r4   r4   rQ   r5   rF      s8    
rF   c                       sp   e Zd ZdZdef fddZejdddejdej	fd	d
Z
	ddeej deej	 dej	fddZ  ZS )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                    sR   t    || _tj|j|j|jd| _tj	|j|j
|jd| _t|j| _d S )N)padding_idxepsbias)rM   rN   ra   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdroprP   ra   rQ   r4   r5   rN      s
   
zModernBertEmbeddings.__init__Tdynamic	input_idsrS   c                 C      |  | | |S rZ   )rq   rn   rj   )rP   ru   r4   r4   r5   compiled_embeddings   s   z(ModernBertEmbeddings.compiled_embeddingsNinputs_embedsc                 C   sH   |d ur|  | |}|S | jjr| |n
|  | | |}|S rZ   )rq   rn   ra   reference_compilerw   rj   )rP   ru   rx   hidden_statesr4   r4   r5   r6      s   zModernBertEmbeddings.forwardr<   )r=   r>   r?   r[   r   rN   rA   compile
LongTensorrB   rw   r   r6   r_   r4   r4   rQ   r5   r`      s    
r`   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    ra   c                    sf   t    || _tj|jt|jd |jd| _	t
|j | _t|j| _tj|j|j|jd| _d S )Nr"   re   )rM   rN   ra   r   Linearrh   rC   intermediate_sizemlp_biasWir
   hidden_activationactro   mlp_dropoutrq   Worr   rQ   r4   r5   rN      s   
 zModernBertMLP.__init__rz   rS   c                 C   s2   |  |jddd\}}| | | || S )Nr"   r#   rH   )r   chunkr   rq   r   )rP   rz   inputgater4   r4   r5   r6      s   zModernBertMLP.forward)
r=   r>   r?   r[   r   rN   rA   rB   r6   r_   r4   r4   rQ   r5   r}      s    r}   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	ModernBertRotaryEmbeddinginv_freqNra   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)rM   rN   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenra   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)rP   ra   rJ   r   rQ   r4   r5   rN      s   
z"ModernBertRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r#   r   mpscpuF)device_typeenabledr"   r   )rK   )r   r\   expandr(   torJ   r   r   r^   rA   autocast	transposecatr-   r   r.   rK   )
rP   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr-   r.   r4   r4   r5   r6   	  s   0&z!ModernBertRotaryEmbedding.forwardrZ   )r=   r>   r?   rA   rB   __annotations__r   rN   no_gradr   r6   r_   r4   r4   rQ   r5   r      s   
 
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr#   r"   r   )r(   rA   r   )r   x1x2r4   r4   r5   rotate_half  s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr-   r.   r   unsqueeze_dimq_embedk_embedr4   r4   r5   apply_rotary_pos_emb   s
   

r   FmoduleModernBertAttentionr,   attention_masksliding_window_maskr   local_attentionbsrH   output_attentionsrS   c	                 K   s   | j ||d\}
}|ddjdd\}}}t|||
|\}}| jd }t||dd| }|dkr6|}|| }tjj	|dtj
d	|j}tjj|| j| jd
}t||}|dd }||d|}|rn||fS |fS )Nr   r	   r   r"   r         ࿩r#   r#   r#   rH   rK   )ptraining)
rotary_embr   unbindr   head_dimrA   matmulr   
functionalsoftmaxfloat32r   rK   dropoutattention_dropoutr   r'   r)   )r   r,   r   r   r   r   r   rH   r   _kwargsr-   r.   querykeyvaluescaleattn_weightsattn_outputr4   r4   r5   eager_attention_forward;  s    
r   r   target_dtypec	                 K   s   ||||d}|j tjtjfv}
|
r1|j }||}t|||| jr$| jnd| j|d}||}nt|||| jr;| jnd| j|d}|	||fS )NrT           )r    r!   	dropout_pdeterministicwindow_size)
rK   rA   float16bfloat16r   r   r   r   deterministic_flash_attnr)   )r   r,   r   r    r!   r   r   rH   r   r   convert_dtype
orig_dtypeattnr4   r4   r5   flash_attention_forward`  s.   
r   c                 K   s   | j ||d\}	}
|ddjdd\}}}t|||	|
\}}|dkr%|}tj|||| jr0| jnd|ddd }|	|d	|}|fS )
Nr   r	   r   r"   r   r   r   )r   	attn_maskr#   )
r   r   r   r   Fscaled_dot_product_attentionr   r   r'   r)   )r   r,   r   r   r   r   r   rH   r   r-   r.   r   r   r   r   r4   r4   r5   sdpa_attention_forward  s"   r   )flash_attention_2eagersdpac                       sR   e Zd ZdZddedee f fddZ	ddej	d	ee
 d
ej	fddZ  ZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nra   layer_idc                    sb  t    || _|| _|j|j dkr td|j d|j d|j| _|j| _|j| _	|j|j | _
| j
| j	 | _tj|jd| j |jd| _||j dkri|jd |jd f| _|jd urb|jn|j}|j}n	d| _|j}|j}|jd	krt| j
||d
| _nt|}||_t|d| _tj|j|j|jd| _|jdkrt|jnt | _t  | _!d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   r~   r"   r   r   )rH   r!   rI   ra   r   )"rM   rN   ra   r   rh   num_attention_heads
ValueErrorr   r   	num_headsr   all_head_sizer   r   attention_biasWqkvglobal_attn_every_n_layersr   local_rope_thetaglobal_rope_thetar   _attn_implementationrF   r   copydeepcopy
rope_thetar   r   ro   Identityout_dropsetpruned_heads)rP   ra   r   r   r   config_copyrQ   r4   r5   rN     s<   



 zModernBertAttention.__init__Frz   r   rS   c              	   K   s   |  |}|jd }| jjdkr|dd| j| j}n||dd| j| j}t| jj | f|| j| j	|| j
|d|}|d }| | |}|f|dd   S )Nr   r   r#   r	   )r,   r   r   r   rH   r   r   )r   r(   ra   r   r)   r   r   MODERNBERT_ATTENTION_FUNCTIONr   r   r   r   r   )rP   rz   r   kwargsr,   r   attn_outputsr4   r4   r5   r6     s(   



zModernBertAttention.forwardrZ   F)r=   r>   r?   r[   r   r   rC   rN   rA   rB   boolr6   r_   r4   r4   rQ   r5   r     s    	*c                       s   e Zd Zddedee f fddZejdddej	d	ej	fd
dZ
						ddej	deej	 deej	 deej deej	 dee dee d	ej	fddZ  ZS )ModernBertEncoderLayerNra   r   c                    sp   t    || _|dkrt | _ntj|j|j|j	d| _t
||d| _tj|j|j|j	d| _t|| _d S )Nr   rc   )ra   r   )rM   rN   ra   r   r   	attn_normrk   rh   rl   rm   r   r   mlp_normr}   mlp)rP   ra   r   rQ   r4   r5   rN     s   
zModernBertEncoderLayer.__init__Trs   rz   rS   c                 C      |  | |S rZ   )r  r  rP   rz   r4   r4   r5   compiled_mlp     z#ModernBertEncoderLayer.compiled_mlpFr   r   r   r    r!   r   c           
   	   C   sf   | j | |||||||d}||d  }| jjr| |n| | |}	||	 }|f|dd   S )Nr   r   r   r    r!   r   r   r   )r   r  ra   ry   r	  r  r  )
rP   rz   r   r   r   r    r!   r   r   
mlp_outputr4   r4   r5   r6     s    
	zModernBertEncoderLayer.forwardrZ   )NNNNNF)r=   r>   r?   r   r   rC   rN   rA   r{   rB   r	  r|   r  r6   r_   r4   r4   rQ   r5   r    s6    
	r  c                       s|   e Zd ZU eed< dZdZddgZdZdZ	dZ
dejfdd	Z	dd
ee dedef fddZdd Z fddZ  ZS )ModernBertPreTrainedModelra   modelTr`   r  Fr   c                    sx  | j j  d u r
d dtjdtf fdd}| j j| j jtd| j j  | j j| j j	d d}t
|tr?||j|d	  d S t
|trV||j|d
  ||j|d  d S t
|trm||j|d
  ||j|d  d S t
|tr|||j|d  d S t
|tr||j|d  d S t
|ttttfr||j|d  d S t
|tjr|jjd |jd ur|jj   d S d S d S )Nr	   r   stdc                    sR   t jj| jd|  |  | d t| t jr%| jd ur't j| j d S d S d S )Nr   )meanr  ab)r   inittrunc_normal_weightr   r   re   zeros_)r   r  cutoff_factorr4   r5   init_weightA  s   
z<ModernBertPreTrainedModel._init_weights.<locals>.init_weightg       @r   )inout	embedding	final_outr  r  r  r  g      ?)!ra   initializer_cutoff_factorr   Moduler\   initializer_rangemathsqrtnum_hidden_layersrh   r   r`   rj   r}   r   r   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierrk   r  datafill_re   zero_)rP   r   r  stdsr4   r  r5   _init_weights<  sH   





	
z'ModernBertPreTrainedModel._init_weightsattn_implementationis_init_checkrS   c              	      sD   z|du r|   rdn|}W n ttfy   Y nw t j||dS )zR
        Checks and dispatches to hhe requested attention implementation.
        Nr   )r2  r3  )_flash_attn_2_can_dispatchr   ImportErrorrM   %_check_and_adjust_attn_implementation)rP   r2  r3  rQ   r4   r5   r6  p  s   z?ModernBertPreTrainedModel._check_and_adjust_attn_implementationc                 C   s   | j jdu rd S t| dr!t| jdkr!| j jrtd d| j _| jjdkr4| j jr0td d| j _| jjdkrG| j jrCtd d| j _| j jd u rTt	 | j _d S d S )	NFhf_device_mapr   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
ra   ry   r   lenr7  loggerwarning_oncerJ   r   r   rX   r4   r4   r5   _maybe_set_compile  s.   z,ModernBertPreTrainedModel._maybe_set_compilec                    s<   t  j|i |}| jjdv r| jjrtd d| j_|S )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rM   resize_token_embeddingsra   ry   r9  r:  )rP   argsr   model_embedsrQ   r4   r5   r<    s   z1ModernBertPreTrainedModel.resize_token_embeddingsr  )r=   r>   r?   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr   r  r1  r   r^   r  r6  r;  r<  r_   r4   r4   rQ   r5   r  2  s&   
 5r  inputslabelsc                 C   s   |j dtjd}tj| dd }t|  }tjj	
tj|dtjdd}|  dkr7|  | }n| j^}	}
}|	|
 }| j|g|R  | }|durV| | nd}|durb| | nd}||||||fS )	a  
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    r#   r   F)as_tupler   )r   r   r"   N)sumrA   int32nonzeroflattenrC   maxitemr   r   padcumsumrH   r(   r)   )rE  r   r   rF  seqlens_in_batchindicesmax_seqlen_in_batchr    unpadded_inputsbatchseqlenrestr(   unpadded_position_idsunpadded_labelsr4   r4   r5   _unpad_modernbert_input  s   rY  rQ  rT  rU  c                 C   s   |   dkrtj|| | j| jd}| ||< |||}|S | j^}}tj|| g|R | j| jd}| ||< |j||g|R  }|S )aQ  
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    r   rK   rJ   )rH   rA   zerosrK   rJ   r)   r(   )rE  rQ  rT  rU  outputpadded_inputs_rV  r4   r4   r5   _pad_modernbert_output  s   
"r_  c                !       s  e Zd Zdef fddZdd Zdd Ze													dd	ee	j
 d
ee	j dee	j dee	j
 dee	j dee	j dee	j dee dee dee dee dee dee deee	jdf ef fddZd
e	jdede	jfddZ  ZS )ModernBertModelra   c                    sf   t     | _t | _t fddt jD | _	tj
 j j jd| _d| _|   d S )Nc                    s   g | ]}t  |qS r4   )r  ).0r   r   r4   r5   
<listcomp>  s    z,ModernBertModel.__init__.<locals>.<listcomp>rc   F)rM   rN   ra   r`   
embeddingsr   
ModuleListranger#  layersrk   rh   rl   rm   
final_normgradient_checkpointing	post_initrr   rQ   r   r5   rN     s   
zModernBertModel.__init__c                 C   s   | j jS rZ   rc  rj   rX   r4   r4   r5   get_input_embeddings  s   z$ModernBertModel.get_input_embeddingsc                 C   s   || j _d S rZ   rj  )rP   r   r4   r4   r5   set_input_embeddings  s   z$ModernBertModel.set_input_embeddingsNru   r   r   r   rx   rQ  r    r!   
batch_sizeseq_lenr   output_hidden_statesreturn_dictrS   .c              
      s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du |duA r*td|r.dnd}|r4dnd}|   |durD| ||  du rcdu rc|durZ|jdd \ n	|jdd \ |durj|jn|j}|du r|t	j
 f|t	jd}d}| j jdkrdu r|du r|du rd}|du rt	  t||d	^}}}}W d   n1 sw   Y  n#t||d	^}}}}n|du rt	j|d
d}| j||d\}}| j||d}| jD ])}|r||f }||||||||d}|d }|rt|dkr||d f }q|r||f }| |}|r8t| d}|dur7t fdd|D }n#| j jdkr[|dur[|d  dkr[|d}tdd |D }|sjtdd |||fD S t|||dS )  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nz:You must specify exactly one of input_ids or inputs_embedsr4   r"   rL   Fr   T)rE  r   rJ   r   )r   )ru   rx   r  r   rE  rQ  rT  rU  c                 3   s     | ]}t | d V  qdS )rs  N)r_  ra  hsrm  rQ  rn  r4   r5   	<genexpr>|  s
    
z*ModernBertModel.forward.<locals>.<genexpr>r#   c                 s   s    | ]}| d V  qdS )r   N)r   rt  r4   r4   r5   rw    s    c                 s   s    | ]	}|d ur|V  qd S rZ   r4   )ra  vr4   r4   r5   rw    s    )last_hidden_staterz   
attentions)ra   r   ro  use_return_dictr   r;  %warn_if_padding_and_no_attention_maskr(   rJ   rA   onesr  r   r   rY  aranger   _update_attention_maskrc  rf  r8  rg  r_  r]   rH   r   )rP   ru   r   r   r   rx   rQ  r    r!   rm  rn  r   ro  rp  all_hidden_statesall_self_attentionsrJ   repadr^  rz   encoder_layerlayer_outputsr4   rv  r5   r6     s   !



	




zModernBertModel.forwardc                 C   s   |r#| j jdkrtd d| j _n| j jdkr#td| j j d t|| j}t|jd 	d}t
||j }|| j jd k	d	d|j}|| t| jj}||fS )Nr   zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r   zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r"   r   )ra   r   r9  r:  r   rK   rA   r~  r(   r   absTr   r   rJ   masked_filllogical_notfinfomin)rP   r   r   global_attention_maskrowsdistancewindow_maskr   r4   r4   r5   r    s&   
"z&ModernBertModel._update_attention_maskNNNNNNNNNNNNN)r=   r>   r?   r   rN   rk  rl  r   r   rA   r|   rB   rC   r  r   r]   r   r6   r  r_   r4   r4   rQ   r5   r`    sb    	
 "r`  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )r$  ra   c                    sN   t    || _t|j|j|j| _t|j	 | _
tj|j|j|jd| _d S )Nrc   )rM   rN   ra   r   r   rh   classifier_biasr%  r
   classifier_activationr   rk   rl   rm   rn   rr   rQ   r4   r5   rN     s
   
z!ModernBertPredictionHead.__init__rz   rS   c                 C   rv   rZ   )rn   r   r%  r  r4   r4   r5   r6     s   z ModernBertPredictionHead.forward)	r=   r>   r?   r   rN   rA   rB   r6   r_   r4   r4   rQ   r5   r$    s    r$  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc                "       s   e Zd ZdgZdef fddZdd Zdejfdd	Z	e
jd
dde
jde
jfddZe														d!dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee
j dee dee dee dee dee dee deee
j ef fdd Z  ZS )"r&  zdecoder.weightra   c                    s^   t  | || _t|| _t|| _tj|j	|j
|jd| _| jj| _| jj| _|   d S )Nr~   )rM   rN   ra   r`  r  r$  headr   r   rh   rg   decoder_biasr'  sparse_predictionsparse_pred_ignore_indexri  rr   rQ   r4   r5   rN     s   



zModernBertForMaskedLM.__init__c                 C   s   | j S rZ   r'  rX   r4   r4   r5   get_output_embeddings  s   z+ModernBertForMaskedLM.get_output_embeddingsnew_embeddingsc                 C   s
   || _ d S rZ   r  )rP   r  r4   r4   r5   set_output_embeddings  s   
z+ModernBertForMaskedLM.set_output_embeddingsTrs   r\  rS   c                 C   r  rZ   )r'  r  )rP   r\  r4   r4   r5   compiled_head  r
  z#ModernBertForMaskedLM.compiled_headNru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  c                 K   s  |dur|n| j j}|   | j jdkr|du r|du r|	du r|
du r?|du r?|dur6|jdd \}
}n	|jdd \}
}|durF|jn|j}|du rXtj|
|f|tjd}|du rt	  t
||||d\}}}}	}}W d   n1 syw   Y  nt
||||d\}}}}	}}| j||||||||	|
||||d}|d }| jr|dur|d}||jd d}|| jk}|| }|| }| j jr| |n| | |}d}|dur| j||fd	| j ji|}| j jdkrO| j js|du rt nt	  t|||
|d
}W d   n	1 sw   Y  t|dddurOg }|jD ]"}| dkr=|jd dkr=|d}|t|||
|d
 q't||_|sa|f}|dur_|f| S |S t|||j|jdS )rq  Nr   r"   rL   )rE  r   r   rF  ru   r   r   r   rx   rQ  r    r!   rm  rn  r   ro  rp  r   r#   rg   rs  rz   r	   r   losslogitsrz   rz  ) ra   r{  r;  r   r(   rJ   rA   r}  r  r   rY  r  r  r)   r  ry   r  r'  r  loss_functionrg   repad_logits_with_gradr   r_  getattrrz   rH   squeezeappendr]   r   rz  )rP   ru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  r   rJ   outputsry  mask_tokensr  r  padded_hidden_statesru  r\  r4   r4   r5   r6     s   #


 


zModernBertForMaskedLM.forwardNNNNNNNNNNNNNN)r=   r>   r?   _tied_weights_keysr   rN   r  r   r   r  rA   r{   rB   r  r   r   r|   rC   r  r   r]   r   r6   r_   r4   r4   rQ   r5   r&    sj    
	
r&  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                "          e Zd Zdef fddZe														ddeej deej	 deej	 deej	 d	eej	 d
eej	 deej	 deej	 dee
 dee
 dee
 dee dee dee deeej	 ef fddZ  ZS )r(  ra   c                    s\   t  | |j| _|| _t|| _t|| _tj	
|j| _t	|j|j| _|   d S rZ   )rM   rN   
num_labelsra   r`  r  r$  r  rA   r   ro   classifier_dropoutrq   r   rh   r,  ri  rr   rQ   r4   r5   rN   _  s   

z,ModernBertForSequenceClassification.__init__Nru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  rS   c                 K   s\  |dur|n| j j}|   |dur| || |
du r7|du r7|dur.|jdd \}
}n	|jdd \}
}|dur>|jn|j}|du rPtj|
|f|tjd}| j	||||||||	|
||||d}|d }| j j
dkru|dddf }n| j j
dkr||d jd	d
|jd	dd }| |}| |}| |}d}|dur| j jdu r| jd	krd| j _n| jd	kr|jtjks|jtjkrd| j _nd| j _| j jdkrt }| jd	kr|| | }n-|||}n'| j jdkrt }||d| j|d}n| j jdkrt }|||}|s$|f}|dur"|f| S |S t|||j|jdS )aB  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr"   rL   r  r   clsr  r#   r   r   TrH   keepdim
regressionsingle_label_classificationmulti_label_classificationr  )ra   r{  r;  r|  r(   rJ   rA   r}  r  r  classifier_poolingr   rH  r  rq   r,  problem_typer  rK   longrC   r   r  r   r)   r   r   rz   rz  )rP   ru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  r   rJ   r  ry  pooled_outputr  r  loss_fctr\  r4   r4   r5   r6   l  s   '





"


z+ModernBertForSequenceClassification.forwardr  )r=   r>   r?   r   rN   r   r   rA   r|   rB   rC   r  r   r]   r   r6   r_   r4   r4   rQ   r5   r(  Y  s`    	
r(  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                "       r  )r*  ra   c                    V   t  | |j| _t|| _t|| _tj	|j
| _t|j|j| _|   d S rZ   rM   rN   r  r`  r  r$  r  rA   r   ro   r  rq   r   rh   r,  ri  rr   rQ   r4   r5   rN     s   

z)ModernBertForTokenClassification.__init__Nru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  rS   c                 C   s   |dur|n| j j}|   | j||||||||	|
||||d}|d }| |}| |}| |}d}|durIt }||d| j	|d}|s_|f|dd  }|dur]|f| S |S t
|||j|jdS )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr  r   r#   r   r  )ra   r{  r;  r  r  rq   r,  r   r)   r  r   rz   rz  )rP   ru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  r  ry  r  r  r  r\  r4   r4   r5   r6     sD   $


z(ModernBertForTokenClassification.forwardr  )r=   r>   r?   r   rN   r   r   rA   r|   rB   rC   r  r   r]   r   r6   r_   r4   r4   rQ   r5   r*    s`    	
r*  c                "       s   e Zd Zdef fddZe													ddeej deej deej deej d	eej d
eej deej deej dee	 dee	 dee	 dee
 dee
 dee
 deeej ef fddZ  ZS )r+  ra   c                    r  rZ   r  rr   rQ   r4   r5   rN   C  s   

z'ModernBertForQuestionAnswering.__init__Nru   r   r   r   start_positionsend_positionsrQ  r    r!   rm  rn  r   ro  rp  rS   c                 K   s  |dur|n| j j}|   | j|||||||	|
||||d}|d }| |}| |}| |}|jddd\}}|d	 }|d	 }d}|dur_|dur_| j
||||fi |}|sv||f|dd  }|durt|f| S |S t||||j|jdS )rq  N)r   r   r   rQ  r    r!   rm  rn  r   ro  rp  r   r   r#   r   )r  start_logits
end_logitsrz   rz  )ra   r{  r;  r  r  rq   r,  splitr  r'   r  r   rz   rz  )rP   ru   r   r   r   r  r  rQ  r    r!   rm  rn  r   ro  rp  r   r  ry  r  r  r  r  r\  r4   r4   r5   r6   N  sH   #


z&ModernBertForQuestionAnswering.forwardr  )r=   r>   r?   r   rN   r   r   rA   rB   rC   r  r   r]   r   r6   r_   r4   r4   rQ   r5   r+  A  s^    	
r+  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                "       r  )r)  ra   c                    sR   t  | || _t|| _t|| _tj	|j
| _t|jd| _|   d S Nr   )rM   rN   ra   r`  r  r$  r  rA   r   ro   r  rq   r   rh   r,  ri  rr   rQ   r4   r5   rN     s   

z$ModernBertForMultipleChoice.__init__Nru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  rS   c                 K   s  |dur|n| j j}|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durV|d|d|dnd}|   | j||||||||	|
||||d}|d }| j jdkrtj	|jd |j
d}|dur|jdd	|j
}n
tjdtj|j
d
}|||f }n| j jdkr|jddd}||d jdd	| }| |}| |}| |}|d|}d}|durt }|||}|s|f|dd  }|dur|f| S |S t|||j|jdS )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr   r#   r  r   r  rr  r   rZ  r  Tr  r  )ra   r{  r(   r)   sizer;  r  r  rA   r~  rJ   argmaxr   tensorr  rH  r   r  rq   r,  r   r   r   rz   rz  )rP   ru   r   r   r   rx   rF  rQ  r    r!   rm  rn  r   ro  rp  r   num_choicesr  ry  	indices_0cls_masknum_non_pad_tokensr  r  reshaped_logitsr  r  r\  r4   r4   r5   r6     sh   &



z#ModernBertForMultipleChoice.forwardr  )r=   r>   r?   r   rN   r   r   rA   r|   rB   rC   r  r   r]   r   r6   r_   r4   r4   rQ   r5   r)    s`    	
r)  )r`  r  r&  r(  r*  r+  r)  r<   r  r  )Wr   r!  
contextlibr   typingr   r   rA   torch.nn.functionalr   r   r   torch.nnr   r   r   activationsr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   r   utils.import_utilsr   configuration_modernbertr   flash_attn.flash_attn_interfacer   flash_attn.layers.rotaryr   flash_attn.ops.triton.rotaryr   object
get_loggerr=   r9  autogradFunctionr   rB   rC   rE   rF   r  r`   r}   r   r   r   r|   r]   r  r   r   rK   r   r   r   r   r  r  rY  r_  r`  r$  r&  r(  r*  r+  r)  __all__r4   r4   r4   r5   <module>   s<   
;
5$
$
	
.
	
+


$O. $
)
 7  Z[z