o
    i                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ee6Z7e8e9 j:Z;G dd dej<Z=G dd dej<Z>G dd dej<Z?G dd dej<Z@G d d! d!ej<ZAed"d# d$G d%d& d&ej<ZBed'd# d$G d(d) d)ej<ZCed*d# d$G d+d, d,ej<e/ZDG d-d. d.ej<ZEdS )/    )IterableN)nn)Gemma3nTextConfig)support_torch_compile)CacheConfig
VllmConfig)$get_tensor_model_parallel_world_size)get_forward_context)init_logger)_ACTIVATION_REGISTRY
GeluAndMulGeluAndMulSparse)	Attention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)KVSharingFastPrefillMetadata   )SupportsQuant)AutoWeightsLoaderextract_layer_indexis_pp_missing_parametermake_layersmaybe_prefixc                       s   e Zd ZdZdededededededef fd	d
Zde	j
de	j
fddZde	j
de	j
fddZde	j
de	j
fddZde	j
de	j
de	j
fddZ  ZS )Gemma3nAltUpa  Alternating updates (Altup)
    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.
    See more in the research paper:
    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    hidden_sizerms_norm_epsaltup_num_inputsaltup_coef_clipaltup_active_idxquant_configprefixc                    s   t    || _|| _|| _t||d|| ddd| _t||d d|| ddd| _t||d|| ddd| _t	||d| _
tj|d | jjjd	| _ttj|tjd	| _d S )
NFz.correction_coefsbiasr*   r+   return_bias   z.prediction_coefsz.modality_routerr%   epsg      dtype)super__init__r'   r)   r(   r   correction_coefsprediction_coefsmodality_routerr   router_normtorchtensorweightr3   router_input_scaler   	Parameterzerosfloat32correct_output_scale)selfr%   r&   r'   r(   r)   r*   r+   	__class__ X/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/gemma3n.pyr5   M   sL   


zGemma3nAltUp.__init__xreturnc                 C   s.   |  || j }| |}t| |S N)r9   r=   r8   r:   tanhfloattype_as)rB   rG   router_inputsroutedrE   rE   rF   _compute_router_modalities   s   
z'Gemma3nAltUp._compute_router_modalities	correctedc                 C   s   | | j| j  |S rI   )rL   rA   )rB   rP   rE   rE   rF   scale_corrected_output   s   z#Gemma3nAltUp.scale_corrected_outputhidden_statesc                 C   sj   |  || j }| |}|d| j| jddd}t|ddd|}|ddd}||7 }| S )Nr   r/   r   )	rO   r)   r7   reshaper'   permuter:   matmul
contiguous)rB   rR   
modalities	all_coefsall_coefs_TpredictionsrE   rE   rF   predict   s   

zGemma3nAltUp.predictr[   	activatedc                 C   s^   |  |}||| j  }|| jdd}| |d }|jd}t||}||7 }|	 S )Nr         ?rS   )
rO   r)   repeatr'   r6   T	unsqueezer:   mulrW   )rB   r[   r]   rX   
innovationrY   rP   rE   rE   rF   correct   s   
zGemma3nAltUp.correct)__name__
__module____qualname____doc__intrK   r   strr5   r:   TensorrO   rQ   r\   rd   __classcell__rE   rE   rC   rF   r$   D   s6    3r$   c                       sZ   e Zd ZdZdddededededB ded	df fd
dZde	j
d	e	j
fddZ  ZS )Gemma3nLaurelBlockz Learned Augmented Residual LayerN)r*   r%   laurel_rankr&   r*   r+   rH   c                   sT   t    t||d|| ddd| _t||d|| ddd| _t||d| _d S )NFz.linear_leftr,   z.linear_rightr0   )r4   r5   r   linear_leftr   linear_rightr   post_laurel_norm)rB   r%   rn   r&   r*   r+   rC   rE   rF   r5      s*   
	zGemma3nLaurelBlock.__init__rG   c                 C   s&   |  |}| |}| |}|| S rI   )ro   rp   rq   )rB   rG   laurel_xnormed_laurel_xrE   rE   rF   forward   s   


zGemma3nLaurelBlock.forward)re   rf   rg   rh   ri   rK   r   rj   r5   r:   rk   rt   rl   rE   rE   rC   rF   rm      s"     rm   c                       s\   e Zd Z			ddedededededB d	ed
df fddZdej	d
ej	fddZ
  ZS )
Gemma3nMLP        N r%   intermediate_sizehidden_activationactivation_sparsityr*   r+   rH   c                    s~   t    t||gd d|| dd| _t||d|| dd| _|dkr*td|dkr7t|d	d
| _	d S td	d| _	d S )Nr/   Fz.gate_up_projr-   r*   r+   z
.down_projgelu_pytorch_tanhzGemma3 uses `gelu_pytorch_tanh` as the hidden activation function. Please set `hidden_act` and `hidden_activation` to `gelu_pytorch_tanh`.rv   rJ   )rz   approximate)r}   )
r4   r5   r   gate_up_projr   	down_proj
ValueErrorr   r   act_fn)rB   r%   rx   ry   rz   r*   r+   rC   rE   rF   r5      s4   
	
zGemma3nMLP.__init__rG   c                 C   s*   |  |\}}| |}| |\}}|S rI   )r~   r   r   )rB   rG   gate_up_rE   rE   rF   rt     s   
zGemma3nMLP.forward)rv   Nrw   )re   rf   rg   ri   rj   rK   r   r5   r:   rk   rt   rl   rE   rE   rC   rF   ru      s(    'ru   c                       sr   e Zd Z			ddedededededed	edB d
edB deddf fddZde	j
de	j
de	j
fddZ  ZS )Gemma3nAttentionNrw   configr%   	num_headsnum_kv_headshead_dimmax_position_embeddingscache_configr*   r+   rH   c
                    s@  t    || _|| _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr2| j|
 dks1J n	|
| j dks;J td| j|
 | _	|| _
| j| j
 | _| j	| j
 | _t|| j
| j| j|j||	 dd| _t| j| j
 ||j||	 dd| _t| j
|jd| _t| j
|jd| _t| j
|jdd| _t|	}|j| }|d	k}|r|jnd | _||jv r|j| }n|j }|r|j|d
< |j|j }||k| _d }| jr| jd urdnd}|| }|dkrd|	v r|	 dd }nt!d|	 d| d| d}t"| j
||dd| _#t$| j| j
d| j	||| j||	 dd	| _%d S )Nr   r   z	.qkv_projr{   z.o_projr0   F)r%   r1   
has_weightsliding_attention
rope_thetar/   z.layers.z0Unexpected prefix format for Gemma3nAttention: 'zc'. The prefix is expected to contain '.layers.' to correctly determine the KV sharing target layer.z.self_attn.attnT)max_positionrope_parametersis_neox_styler^   z.attn)	r   	head_sizescaler   r   r*   per_layer_sliding_windowkv_sharing_target_layer_namer+   )&r4   r5   r   r%   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizer   attention_biasqkv_projr   o_projr   r&   q_normk_normv_normr    layer_typessliding_windowr   copyrope_local_base_freqnum_hidden_layersnum_kv_shared_layersis_kv_sharedsplitr   r   
rotary_embr   attn)rB   r   r%   r   r   r   r   r   r*   r+   tp_size	layer_idx
layer_type
is_slidingr   first_kv_shared_layer_idxr   offsetkv_shared_layer_indexparam_name_before_layersrC   rE   rF   r5     s   

	







zGemma3nAttention.__init__	positionsrR   c                 K   s   |  |\}}|j| j| j| jgdd\}}}|d| j| jf}| |}|dd}|d| j	| jf}| 
|}|dd}|d| j	| jf}| |}|dd}| |||\}}| |||}	| |	\}
}|
S )NrS   dim)r   r   r   r   	unflattenr   r   r   flattenr   r   r   r   r   r   )rB   r   rR   kwargsqkvr   qkvattn_outputoutputrE   rE   rF   rt     s    


zGemma3nAttention.forwardNNrw   )re   rf   rg   r   ri   r   r   rj   r5   r:   rk   rt   rl   rE   rE   rC   rF   r     s@    		
}r   c                       sn   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	de
ej	ej	f fddZ  ZS )Gemma3nDecoderLayerNrw   r   r   r*   r+   rH   c                    s|  t    t|tsJ |j| _|jsJ t|j|j|j	|j
|j|| dd| _t||j|j|j|j|j||| dd	| _t|j|jt| |j||jt| | dd| _t|j|j|j|| dd| _t|j|jd	|| d
d	d| _t|j|jd	|| dd	d| _t|j|jd| _ t|j|jd| _!t|j|jd| _"t|j|jd| _#t|j|jd| _$t%|j | _&d S )Nz.altup)r%   r&   r'   r(   r)   r*   r+   z
.self_attn)	r   r%   r   r   r   r   r   r*   r+   z.mlp)r%   rx   ry   r*   rz   r+   z.laurel)r%   rn   r&   r*   r+   Fz.per_layer_input_gater,   z.per_layer_projectionr1   )'r4   r5   
isinstancer   r)   altup_correct_scaler$   r%   r&   r'   r(   altupr   num_attention_headsnum_key_value_headsr   r   	self_attnru   rx   r    ry   activation_sparsity_patternmlprm   rn   laurelr   hidden_size_per_layer_inputper_layer_input_gateper_layer_projectionr   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormpost_per_layer_input_normr   r   )rB   r   r   r*   r+   rC   rE   rF   r5     s   

	
zGemma3nDecoderLayer.__init__r   rR   per_layer_inputc                 K   s   | j |}|| j }| |}| |}| jd||d|}	| |	}	|	| }
|
| tt	d }| 
|}| |}| |}|| }| j ||}|| j }| j |}| |}| |}t||}| |}| |}|dd   |7  < |S )N)r   rR          @r   rE   )r   r\   r)   r   r   r   r   r:   sqrtr;   r   r   r   rd   rQ   r   r   rb   r   r   )rB   r   rR   r   r   r[   active_predictionactive_prediction_normedlaurel_outputr   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionrE   rE   rF   rt     s6   











zGemma3nDecoderLayer.forwardr   )re   rf   rg   r   r   r   rj   r5   r:   rk   tuplert   rl   rE   rE   rC   rF   r     s0    _r   c                 C      | j jS rI   r   kv_sharing_fast_prefillvllm_configrE   rE   rF   <lambda>?      r   )	enable_ifc                       s   e Zd ZdZdddededee def fdd	Z	d
e
jde
jfddZde
jde
jdB de
jfddZd
e
jde
jfddZde
jde
jfddZ		dd
e
jdB de
jde
jdB de
jdB dee
je
jf f
ddZ  ZS )Gemma3nSelfDecoderz:
    Includes altup embedding and self decoder layers
    rw   r+   r   r+   decoder_layerslayer_idx_startc             	      s@  t    || _|| _|jj  | _|jt j	 j
 dd| _tj j
d | jjjd| _t j j j  dd| _tj jd | jjjd| _t j
 j j ddd dd	| _t j jd
| _ttd| jjj| _tj j
d | jjjd| _t  fddt!d| jj"D | _#d S )Nz.embed_tokens)r*   r+         ?r2   z.per_layer_embed_tokensFTz.per_layer_model_projectionr-   gather_outputr.   r*   r+   r0   r   c                    4   g | ]}t  j jd dd  d|d  dqS )FTz.altup_projections.r   r   r   r%   .0idxr   r+   r*   rE   rF   
<listcomp>      
z/Gemma3nSelfDecoder.__init__.<locals>.<listcomp>r   )$r4   r5   r   r   model_config	hf_configr   r*   r   
vocab_sizer%   embed_tokensr:   r;   r<   r3   embed_scalevocab_size_per_layer_inputr   r   embed_tokens_per_layerembed_scale_per_layerr   per_layer_model_projectionr   r&   per_layer_projection_normrsqrttoper_layer_input_scaleper_layer_projection_scaler   
ModuleListranger'   altup_projectionsrB   r   r+   r   r   rC   r   rF   r5   F  sf   


	

zGemma3nSelfDecoder.__init__	input_idsrH   c                 C   s<   t |dk|| jjk }t ||t |}| || j S )Nr   )r:   logical_andr   r   where
zeros_liker   r   )rB   r  per_layer_inputs_maskper_layer_inputs_tokensrE   rE   rF   get_per_layer_input_embeddings  s   z1Gemma3nSelfDecoder.get_per_layer_input_embeddingshidden_states_0per_layer_inputsNc                 C   sd   |  |}|jg |jd d | jj| jjR  }| |}|d ur.|| }|| j9 }|S |}|S )NrS   )r  rT   shaper   r   r   r  r  )rB   r  r  r   rE   rE   rF   get_per_layer_inputs  s   


z'Gemma3nSelfDecoder.get_per_layer_inputsc                 C   s   |  || j S rI   )r   r   rB   r  rE   rE   rF   embed_input_ids  s   z"Gemma3nSelfDecoder.embed_input_idsc              	   C   s   |g| j j }tj|d dddd }td| j jD ]+}| j|d  || ||< tj|| d dddd }||  |t|t 9  < qtj|dd}|S )Nr/   rS   Tr   keepdimr   r   r   )	r   r'   r:   meanr  r	  maximumEPSstack)rB   r  rR   target_magnitudeinew_magnituderE   rE   rF   altup_embed  s   zGemma3nSelfDecoder.altup_embedr   inputs_embedsc              
   K   s   |d ur|}n|  |}| ||}| |}|ddd}t| jD ]\}	}
|	| j }|
d|||d d |d d f d|}q#|ddd}||fS Nr/   r   r   )r   rR   r   rE   )r  r  r!  rU   	enumerater   r   )rB   r  r   r"  r  r   r  adjusted_per_layer_inputsrR   r   layerr   rE   rE   rF   rt     s&   


zGemma3nSelfDecoder.forwardNN)re   rf   rg   rh   r   rj   listr   ri   r5   r:   rk   r  r  r  r!  r   rt   rl   rE   rE   rC   rF   r   >  sF    H
r   c                 C   r   rI   r   r   rE   rE   rF   r     r   c                	       s^   e Zd ZdZdddededee def fdd	Z	d
e
jde
jde
jde
jfddZ  ZS )Gemma3nCrossDecoderz
    Cross-decoder layers
    rw   r   r   r+   r   r   c                   s   t    || _|| _d S rI   )r4   r5   r   r   r
  rC   rE   rF   r5     s   

zGemma3nCrossDecoder.__init__r   rR   r  rH   c              
   K   sf   | ddd}t| jD ]\}}|| j }|d|||d d |d d f d|}q| ddd}|S r#  )rU   r$  r   r   )rB   r   rR   r  r   r   r&  r   rE   rE   rF   rt     s   
zGemma3nCrossDecoder.forward)re   rf   rg   rh   r   rj   r(  r   ri   r5   r:   rk   rt   rl   rE   rE   rC   rF   r)    s*    r)  c                 C   s
   | j j S rI   r   r   rE   rE   rF   r     s   
 c                       sZ  e Zd Zdddedef fddZedd Zd	ej	d
ej	fddZ
d	ej	d
ej	fddZ		d d	ej	dB dej	dej	dB dej	dB d
ej	f
ddZ		d d	ej	dB dej	dej	dB dej	dB d
ej	f
ddZdej	d
ej	fddZ			d!d	ej	dB dej	dej	dB dedB dej	dB d
ej	eB fddZdeeeej	f  d
ee fddZ  ZS )"Gemma3nTextModelrw   r   r   r+   c                   s  t    |jj|j |j| _| _tfddt	d| jj
D | _tj fdd dd\| _| _| _jj }dd	lm} |d
 t| d| jd | dd| _W d    n1 slw   Y  |d t| d| j|d  |d| _W d    n1 sw   Y  tjjd| _ j| _| jr|jj}t | ! j"}t#j$|t#j%|d| _&t#j$|j| jj
f| j'j(j)|d| _*t#j$|| jj| jj+f| j'j(j)|d| _,d S d S )Nc                    r   )FTz.altup_unembed_projections.r   r   r   r   r   rE   rF   r      r   z-Gemma3nTextModel.__init__.<locals>.<listcomp>r   c                    s   t  | dS )Nr   )r   r   )r   r   r*   rE   rF   r   1  s    z+Gemma3nTextModel.__init__.<locals>.<lambda>z.layersr   r   )set_model_tagself_decoderz.self_decoder)r   r+   r   r   cross_decoderz.cross_decoderr   r3   device)-r4   r5   r   r   r   r*   r   r   r  r  r'   altup_unembed_projectionsr"   r   start_layer	end_layerlayersr   vllm.compilation.backendsr+  r   r,  r)  r-  r   r%   r&   normr   fast_prefill_enabledscheduler_configmax_num_batched_tokensnext
parametersr/  r:   r?   int64r   r   r<   r3   rR   r   r  )rB   r   r+   r   r+  max_num_tokensr/  rC   )r   r   r+   r*   rF   r5     sx   


	



zGemma3nTextModel.__init__c                 C   r   rI   )r,  r   )rB   rE   rE   rF   r   n  s   zGemma3nTextModel.embed_tokensr  rH   c                 C      | j |S rI   )r,  r  r  rE   rE   rF   r  r     z/Gemma3nTextModel.get_per_layer_input_embeddingsc                 C   r=  rI   )r,  r  r  rE   rE   rF   r  u  r>  z Gemma3nTextModel.embed_input_idsNr   r"  r  c                 K   s  d\}}t  j}| jr+|d ur+t|tsJ || jd jjj }	t|	t	r+|	j
}|	j}|d}
| jd |
 | | jd|| jd |
 ||d|\}}|d u r_tj|d|j|jd}| }|d}| jd | ||  | jd | ||  | jd | ||  | jd| jd | | jd | | jd | d|}|d ur|dksJ |d | ||d | < |S |}|S )Nr'  rS   r   r  r   r"  r  r.  r   rR   r  rE   )r	   attn_metadatar6  r   dictr3  r   r   
layer_namer   logits_indices_paddednum_logits_indicessizer   copy_r,  r:   aranger3   r/  clonerR   r  r-  )rB   r  r   r"  r  r   rD  rE  rA  layer_attn_metadata
batch_sizeself_decoder_hidden_statesper_layer_inputs_adjustedrR   num_padded_logits_indicescross_decoder_hidden_statesrE   rE   rF   fast_prefill_forwardx  sf   




z%Gemma3nTextModel.fast_prefill_forwardc                 K   s:   | j d||||d|\}}| jd|||d|}|S )Nr?  r@  rE   )r,  r-  )rB   r  r   r"  r  r   rR   rE   rE   rF   normal_forward  s    
zGemma3nTextModel.normal_forwardrR   c              	   C   s   t j|d d dddd }td| jjD ]3}| j|d  |d|f |d|f< t j|d|f d dddd }|d|f  |t |t 9  < qt j|dd	}|S )
N).r   r/   rS   Tr  r   r   .r   )r:   r  r  r   r'   r0  r  r  )rB   rR   r  r  r   rE   rE   rF   altup_unembed  s   
zGemma3nTextModel.altup_unembedintermediate_tensorsc                 K   sL   | j r| j||||fi |}n| j||||fi |}| |}| |S rI   )r6  rP  rQ  rR  r5  rB   r  r   r  rS  r"  r   rR   rE   rE   rF   rt     s&   	

zGemma3nTextModel.forwardweightsc                 C   sh  g d}t |  }t }|D ]\}}|ds'|ds'|ds'd| }| jd urM| j| }rM|| }t|dt}	|d }|	|| || q|D ]3\}
}}||vrYqOd|v r^qO|	||
}|
d	rn||vrnqOt|| rtqO|| }|j}	|	|||  n)|
d	r||vrqt||}|d u rqt|| rq|| }t|dt}	|	|| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )r~   	gate_projr   )r~   up_projr   r3  r0  r5  zself_decoder.weight_loaderr   r	  z.bias)rB  named_parametersset
startswithr*   get_cache_scalegetattrr   addreplaceendswithr!   r[  r   )rB   rU  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr[  
param_name
shard_nameshard_idrE   rE   rF   load_weights  s\   







zGemma3nTextModel.load_weightsr'  )NNN)re   rf   rg   r   rj   r5   propertyr   r:   rk   r  r  rP  rQ  rR  r   rt   r   r   r]  rn  rl   rE   rE   rC   rF   r*    sl    W

Q


,r*  c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ	dddddejdB dejdejdB de
dB dejdB deje
B fddZdejdejdB fddZdeeeejf  dee fddZ  ZS )Gemma3nForCausalLM)rV  rW  rX  rY  rZ  )r   r~   rw   r   r   r+   c                   sJ   |j j}t   || _|j| _t|t|dd| _t	|j
|jd| _d S )Nmodel)r   r+   )soft_cap)r   r   r4   r5   r   r   r*  r#   rq  r   r   final_logit_softcappinglogits_processor)rB   r   r+   r   rC   rE   rF   r5   \  s   

zGemma3nForCausalLM.__init__r  rH   c                 C   r=  rI   )rq  r  r  rE   rE   rF   r  i  r>  z"Gemma3nForCausalLM.embed_input_idsNr  rS  r"  r   r  rS  r"  c                K   s    | j ||f|||d|}|S )Nru  )rq  rT  rE   rE   rF   rt   l  s   
zGemma3nForCausalLM.forwardrR   c                 C   s   |  | jj|}|S rI   )rt  rq  r   )rB   rR   logitsrE   rE   rF   compute_logits  s   z!Gemma3nForCausalLM.compute_logitsrU  c                 C   s   t | g dd}||S )N)zembed_audio.zembed_vision.zaudio_tower.zvision_tower.)skip_substrs)r   rn  )rB   rU  loaderrE   rE   rF   rn    s
   
zGemma3nForCausalLM.load_weights)re   rf   rg   packed_modules_mappingr   rj   r5   r:   rk   r  r   rt   rw  r   r   r]  rn  rl   rE   rE   rC   rF   rp  O  s<    	

,rp  )Fcollections.abcr   r:   r   1transformers.models.gemma3n.configuration_gemma3nr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr   vllm.forward_contextr	   vllm.loggerr
   %vllm.model_executor.layers.activationr   r   r   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer    vllm.v1.attention.backends.utilsr   
interfacesr   utilsr   r    r!   r"   r#   re   loggerr;   finfominr  Moduler$   rm   ru   r   r   r   r)  r*  rp  rE   rE   rE   rF   <module>   s^   y*/   ')  ;