o
    -i)@                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd  d ej/Z4e	e*d!d"G d#d$ d$ej/Z5G d%d& d&eZ6e*d!d"G d'd( d(ej/e'Z7G d)d* d*ej/Z8e)d+e*d,d-G d.d/ d/ej/Z9dS )0    )IterableN)nn)ModernBertConfig)ACT2FN)support_torch_compile)ModelConfig
VllmConfig)$get_tensor_model_parallel_world_size)EncoderOnlyAttention)QKVParallelLinearRowParallelLinear)DispatchPooler)LambdaPoolerActivation)EmbeddingPoolerHeadSequencePoolerget_seq_pooling_method)pooler_for_token_classify)get_rope)VocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsCrossEncoding)	attn_typedefault_pooling_type)AutoWeightsLoaderWeightsMappermaybe_prefixc                       s\   e Zd Zdef fddZdejdejfddZ	ddejd	ejdB dejfd
dZ  Z	S )ModernBertEmbeddingsconfigc                    sV   t    || _t|j|j| _t|dd pt|dd pd}tj	|j||j
d| _d S )Nnorm_epslayer_norm_epsh㈵>epsbias)super__init__r   r   
vocab_sizehidden_sizetok_embeddingsgetattrr   	LayerNorm	norm_biasnorm)selfr   r$   	__class__ b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/modernbert.pyr'   $   s   

zModernBertEmbeddings.__init__	input_idsreturnc                 C   s
   |  |S N)r*   r/   r4   r2   r2   r3   embed_input_ids1   s   
z$ModernBertEmbeddings.embed_input_idsNinputs_embedsc                 C   s*   |d ur	|  |S | |}|  |}|S r6   )r.   r*   )r/   r4   r9   
embeddingsr2   r2   r3   forward4   s
   


zModernBertEmbeddings.forwardr6   )
__name__
__module____qualname__r   r'   torchTensorr8   r;   __classcell__r2   r2   r0   r3   r   #   s    r   c                       sN   e Zd Z	ddededB def fddZdejd	ejd
ejfddZ	  Z
S )ModernBertAttentionN r   layer_idprefixc           
         sh  t    || _|j| _t }|| _|j| _|j| _| j| dks#J |j|j | _	| j	| j | _
| j	d | _t|j| j	| j|j| dd| _t|dd  }re|| }|j| }d }|dkrd|jd }n"d }||j dkr|jd }|jd ur{|jn|j}	n|j}	d|	d	}t| j	|j|tjd
| _t| j| j	| j| d|d| _t|j|j|j| dd| _d S )Nr   g      z.Wqkvr%   rE   layer_typessliding_attention   default)	rope_type
rope_theta)	head_sizemax_positionrope_parametersdtype.attn)rE   per_layer_sliding_window.Wo)r&   r'   r   r)   r	   rD   deterministic_flash_attnnum_attention_heads	num_headshead_dimall_head_sizescalingr   attention_biasWqkvr+   rO   local_attentionglobal_attn_every_n_layerslocal_rope_thetaglobal_rope_thetar   max_position_embeddingsr?   float16
rotary_embr
   attnr   Wo)
r/   r   rD   rE   tp_sizerG   
layer_typerO   sliding_windowrL   r0   r2   r3   r'   B   sj   





zModernBertAttention.__init__hidden_statesposition_idsr5   c           	      C   s`   |  |\}}|j| jgd dd\}}}| |||\}}| |||}|}| |\}}|S )N   dim)r[   splitrX   rb   rc   rd   )	r/   rh   ri   qkv_qkvattn_outputsr2   r2   r3   r;      s   zModernBertAttention.forward)NrC   )r<   r=   r>   r   intstrr'   r?   r@   r;   rA   r2   r2   r0   r3   rB   A   s     ?rB   c                       s>   e Zd Zd
dedef fddZdejdejfdd	Z  Z	S )ModernBertMLPrC   r   rE   c                    s\   t    || _tj|jt|jd |jd| _	t
 | _t|j|j|j| dd| _d S )NrI   r%   rS   rF   )r&   r'   r   r   Linearr)   ru   intermediate_sizemlp_biasWiGELUactr   rd   )r/   r   rE   r0   r2   r3   r'      s   

zModernBertMLP.__init__rh   r5   c                 C   s0   |  |jddd\}}| | || d S )NrI   rk   rl   r   )r|   chunkrd   r~   )r/   rh   inputgater2   r2   r3   r;      s   zModernBertMLP.forwardrC   )
r<   r=   r>   r   rv   r'   r?   r@   r;   rA   r2   r2   r0   r3   rw      s    rw   c                       sN   e Zd Z	ddedededB f fddZdejd	ejd
ejfddZ	  Z
S )ModernBertLayerrC   Nr   rE   rD   c                    s   t    || _|dkrt | _ntj|j|j|j	d| _t
||| dd| _tj|j|j|j	d| _t|| dd| _d S )Nr   r#   rQ   r   rD   rE   z.mlprE   )r&   r'   r   r   Identity	attn_normr,   r)   r    r-   rB   rc   mlp_normrw   mlp)r/   r   rE   rD   r0   r2   r3   r'      s   
zModernBertLayer.__init__rh   ri   r5   c                 C   s8   | j | ||d}|| }| | |}|| }|S )Nrh   ri   )rc   r   r   r   )r/   rh   ri   rt   
mlp_outputr2   r2   r3   r;      s   
zModernBertLayer.forward)rC   N)r<   r=   r>   r   rv   ru   r'   r?   r@   r;   rA   r2   r2   r0   r3   r      s     r   c                       sD   e Zd Zddedef fddZdejdejdejfd	d
Z  Z	S )ModernBertEncoderLayerrC   vllm_configrE   c                    s8   t    |jj t fddt jD | _d S )Nc                    s$   g | ]}t  | d | dqS )z.layers.r   )r   ).0rD   r   rE   r2   r3   
<listcomp>   s    z3ModernBertEncoderLayer.__init__.<locals>.<listcomp>)	r&   r'   model_config	hf_configr   
ModuleListrangenum_hidden_layerslayers)r/   r   rE   r0   r   r3   r'      s   

zModernBertEncoderLayer.__init__rh   ri   r5   c                 C   s"   t | jD ]	\}}|||}q|S r6   )	enumerater   )r/   rh   ri   ilayerr2   r2   r3   r;      s   zModernBertEncoderLayer.forwardr   )
r<   r=   r>   r   rv   r'   r?   r@   r;   rA   r2   r2   r0   r3   r      s    r   CLS)seq_pooling_typec                       s   e Zd ZeddidZ	ddedef fddZd	ej	d
ej	fddZ
deeeej	f  d
ee fddZ		dd	ej	dej	dedB dej	dB d
ej	f
ddZ  ZS )ModernBertModelzlayers.zencoder_layer.layers.)orig_to_new_prefixrC   r   rE   c                    sR   t    |jj}|| _t|| _t|| dd| _t	j
|j|j|jd| _d S )Nz.encoder_layerr   r#   )r&   r'   r   r   r   r   r:   r   encoder_layerr   r,   r)   r    r-   
final_norm)r/   r   rE   r   r0   r2   r3   r'      s   


zModernBertModel.__init__r4   r5   c                 C      | j |S r6   )r:   r8   r7   r2   r2   r3   r8         zModernBertModel.embed_input_idsweightsc                 C   sl   | j |}t|  }t }|D ]"\}}|dr||vrq|| }t|dt}||| || q|S )Nz.biasweight_loader)	hf_to_vllm_mapperapplydictnamed_parameterssetendswithr+   r   add)r/   r   params_dictloaded_paramsnameloaded_weightparamr   r2   r2   r3   load_weights   s   
zModernBertModel.load_weightsN	positionsintermediate_tensorsr9   c                 C   s8   |d ur|}n| j ||d}| j||d}| |}|S )N)r4   r9   r   )r:   r   r   )r/   r4   r   r   r9   rh   outputsnorm_outputsr2   r2   r3   r;     s   
zModernBertModel.forwardr   NN)r<   r=   r>   r   r   r   rv   r'   r?   r@   r8   r   tupler   r   r   r;   rA   r2   r2   r0   r3   r      s2    $r   c                       s"   e Zd Zdef fddZ  ZS )ModernBertPoolerr   c                    s   |j }|d us	J |j}|j }t jt|t d |j	}tj
|j|j|j|d _t  _tj|j|j|j|d _t| fddt fddd _d S )N)poolingheadrP   )r$   r%   rP   c                    s
     | S r6   )densexr/   r2   r3   <lambda>>  s   
 z+ModernBertPooler.__init__.<locals>.<lambda>c                    s      | S r6   )r.   r~   r   r   r2   r3   r   ?  s    )
head_dtype	projector
activation)pooler_configr   classifier_poolingupperr&   r'   r   r   r   r   ry   r)   classifier_biasr   r}   r~   r,   r    r-   r.   r   r   r   )r/   r   r   r   hf_pooling_typer   r0   r   r3   r'     s6   


zModernBertPooler.__init__)r<   r=   r>   r   r'   rA   r2   r2   r0   r3   r     s    r   c                       s   e Zd ZdZdddedef fddZdejd	ejfd
dZ	de
eeejf  fddZ		ddejdB dejdedB dejdB d	ejf
ddZ  ZS )#ModernBertForSequenceClassificationTrC   r   r   rE   c                   s   t    |jj}|| _t|t|dd| _tj	|j
|j|jjd| _|jj}|d us-J t|j| _tj|| j| jd| _d S )N
modernbertr   rE   r   )r   
classifier)r&   r'   r   r   r   r   r   modelr   ry   r)   
num_labelsr   r   r   r   r   r   for_seq_clspoolerr/   r   rE   r   r   r0   r2   r3   r'   G  s&   

z,ModernBertForSequenceClassification.__init__r4   r5   c                 C   r   r6   r   r8   r7   r2   r2   r3   r8   a  r   z3ModernBertForSequenceClassification.embed_input_idsr   c                    s   g   fdd}| j |  t|  } D ]6\}}|dr0|| }t|dt}||| |drN|d|tdd d    }t|dt}||| qd S )Nc                  3   sD    D ]\} }|  dr| tdd  |fV  q | |f qd S )Nzmodel.)
startswithlenappend)r   weightself_weightsr   r2   r3   weight_filterg  s   
zGModernBertForSequenceClassification.load_weights.<locals>.weight_filterr   r   r   zpooling.r   )r   r   r   r   r   r+   r   r   )r/   r   r   r   r   r   r   r   r2   r   r3   r   d  s   



z0ModernBertForSequenceClassification.load_weightsNr   r   r9   c                 C   s   | j |||dS )N)r4   r9   r   )r   )r/   r4   r   r   r9   r2   r2   r3   r;   |  s
   z+ModernBertForSequenceClassification.forwardr   )r<   r=   r>   is_pooling_modelr   rv   r'   r?   r@   r8   r   r   r   
LongTensorr   r;   rA   r2   r2   r0   r3   r   C  s$    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )ModernBertPredictionHeadc                    s\   t    || _tj|j|j|jd| _t|j	 | _
tj|jt|ddt|ddd| _d S )Nrx   r    r"   r-   Tr#   )r&   r'   r   r   ry   r)   r   r   r   classifier_activationr~   r,   r+   r.   )r/   r   r0   r2   r3   r'     s   


z!ModernBertPredictionHead.__init__rh   r5   c                 C   s   |  | | |S r6   )r.   r~   r   )r/   rh   r2   r2   r3   r;     s   z ModernBertPredictionHead.forward)r<   r=   r>   r'   r?   r@   r;   rA   r2   r2   r0   r3   r     s    r   encoder_onlyALL)tok_pooling_typec                       s   e Zd ZdZdddedef fddZdejd	ejfd
dZ	de
eeejf  fddZ		ddejdB dejdedB dejdB d	ejf
ddZ  ZS ) ModernBertForTokenClassificationTrC   r   r   rE   c                   s|   t    |jj}|jj| _|j| _t|t|dd| _t	|| _
tj|j|j| jd| _|jj}|d us7J t|| _d S )Nr   r   r   )r&   r'   r   r   r   r   r   r   r   r   r   r   ry   r)   r   r   r   r   r   r0   r2   r3   r'     s   



z)ModernBertForTokenClassification.__init__r4   r5   c                 C   r   r6   r   r7   r2   r2   r3   r8     r   z0ModernBertForTokenClassification.embed_input_idsr   c                 C   s   t | dgd}||}|S )Ndrop)skip_prefixes)r   r   )r/   r   loaderr   r2   r2   r3   r     s   
z-ModernBertForTokenClassification.load_weightsNr   r   r9   c                 C   s2   | j ||||d}| |}|| j}| |S )N)r4   r   r9   r   )r   r   tor   r   )r/   r4   r   r   r9   rh   r2   r2   r3   r;     s   

z(ModernBertForTokenClassification.forwardr   )r<   r=   r>   r   r   rv   r'   r?   r@   r8   r   r   r   r   r;   rA   r2   r2   r0   r3   r     s$    	r   ):collections.abcr   r?   r   transformersr   transformers.activationsr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   ;vllm.model_executor.layers.attention.encoder_only_attentionr
   !vllm.model_executor.layers.linearr   r   !vllm.model_executor.layers.poolerr   -vllm.model_executor.layers.pooler.activationsr   )vllm.model_executor.layers.pooler.seqwiser   r   r   )vllm.model_executor.layers.pooler.tokwiser   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   interfaces_baser   r   utilsr   r   r   Moduler   rB   rw   r   r   r   r   r   r   r   r2   r2   r2   r3   <module>   sH   N";(F