o
    پi                     @   s  d dl mZmZmZmZ d dlZd dlm  mZ	 d dlmZ d dl
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- dd Z.G dd deZ/G dd de*Z0G dd dej1Z2G dd dej1Z3G dd dej1Z4G dd dej1Z5G d d! d!ej1Z6G d"d# d#eZ7G d$d% d%eZ8e8Z9ej:ee8d&d' dS )(    )IterableOptionalSetTupleN)nn)	AutoModelGemma3nTextConfigPretrainedConfigPreTrainedModel)$get_tensor_model_parallel_world_size)
GeluAndMul)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)ParallelLMHead)ForwardBatch)default_weight_loadermaybe_remap_kv_scale_name)Gemma3TextScaledWordEmbedding)
add_prefixmake_layersc                 C   s
   | j d S )N   )sliding_windowconfig r!   T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/gemma3n_causal.py!get_attention_sliding_window_size!      
r#   c                	       sN   e Zd Z		ddedededdf fdd	Zd
ejdejf fddZ	  Z
S )Gemma3nRMSNormư>Tdimeps
with_scalereturnNc                    s>   t  j||d |s| `| jdtj|t ddd d S d S )Nr(   weightdtypeF
persistent)super__init__r,   register_buffertorchonesget_default_dtype)selfr'   r(   r)   	__class__r!   r"   r2   &   s   
zGemma3nRMSNorm.__init__xc                    s4   |j }| d|d }t |}||}|S )N)shape
contiguousreshaper1   forward)r7   r:   original_shapex_2dr8   r!   r"   r?   5   s
   
zGemma3nRMSNorm.forward)r&   T)__name__
__module____qualname__intfloatboolr2   r4   Tensorr?   __classcell__r!   r!   r8   r"   r%   %   s    "r%   c                   @   s   e Zd ZdS )Gemma3nTextScaledWordEmbeddingN)rB   rC   rD   r!   r!   r!   r"   rJ   =   s    rJ   c                       sr   e Zd Z			ddededededee d	ed
df fddZde	j
d
e	j
fddZde	j
d
e	j
fddZ  ZS )Gemma3nTextMLP        N hidden_sizeintermediate_sizehidden_activationactivation_sparsityquant_configprefixr*   c                    s   t    t||gd d|td|d| _t||d|td|d| _|dkr*tdt | _	|| _
| jdtj| j
tjd	dd
 d S )N   Fgate_up_projbiasrR   rS   	down_projgelu_pytorch_tanhzzGemma3n uses `gelu_pytorch_tanh` as the hidden activation function. Please set `hidden_activation` to `gelu_pytorch_tanh`.target_sparsity_tensorr-   r/   )r1   r2   r   r   rU   r   rX   
ValueErrorr   act_fnrQ   r3   r4   tensorfloat32)r7   rN   rO   rP   rQ   rR   rS   r8   r!   r"   r2   B   s4   
	
zGemma3nTextMLP.__init__r:   c                 C   sb   |  |\}}|jddd\}}| jdkr| |}tj||gdd}| |}| |\}}|S )NrT   r;   r'   rL   )rU   chunkrQ   _gaussian_topkr4   catr\   rX   )r7   r:   gate_up_	gate_projup_projr!   r!   r"   r?   i   s   


zGemma3nTextMLP.forwardinputsc                 C   sd   t jjdd}|| j}||j}t j|ddd}t j	|dddd}|||  }t
|| S )Nr   r   r;   Tr'   keepdimF)r'   ri   unbiased)r4   distributionsnormalNormalicdfrZ   typer.   meanstdFrelu)r7   rg   normal_diststd_multiplierinputs_mean
inputs_stdcutoff_xr!   r!   r"   ra   z   s   zGemma3nTextMLP._gaussian_topk)rL   NrM   )rB   rC   rD   rE   strrF   r   r   r2   r4   rH   r?   ra   rI   r!   r!   r8   r"   rK   A   s*    'rK   c                       sN   e Zd ZdZ		ddedee def fddZd	e	j
d
e	j
fddZ  ZS )Gemma3nLaurelBlockz Learned Augmented Residual LayerNrM   r    rR   rS   c                    sb   t    || _t|j|jd|td|d| _t|j|jd|td|d| _	t
|j|jd| _d S )NFlinear_leftrV   linear_rightr'   r(   )r1   r2   r    r   rN   laurel_rankr   r{   r   r|   r%   rms_norm_epspost_laurel_normr7   r    rR   rS   r8   r!   r"   r2      s(   
zGemma3nLaurelBlock.__init__r:   r*   c                 C   s.   |  |\}}| |\}}| |}|| S N)r{   r|   r   )r7   r:   laurel_xrd   normed_laurel_xr!   r!   r"   r?      s   
zGemma3nLaurelBlock.forwardNrM   )rB   rC   rD   __doc__r   r   r   ry   r2   r4   rH   r?   rI   r!   r!   r8   r"   rz      s    rz   c                       s   e Zd ZdZ		ddedee def fddZd	e	j
d
e	j
fddZde	j
d
e	j
fddZde	j
de	j
d
e	j
fddZde	j
d
e	j
fddZde	j
de	j
d
ee	j
e	j
f fddZ  ZS )Gemma3nAltUpzAlternating Updates (AltUp)NrM   r    rR   rS   c                    s   t    || _ttj|jtjd| _	t
|j|jd|td|d| _t
|j|jd d|td|d| _t
|j|jd|td|d| _t|j|jd| _| jd	t|jd
 dd d S )Nr-   Fcorrection_coefsrV   rT   prediction_coefsmodality_routerr}   router_input_scaleg      r/   )r1   r2   r    r   	Parameterr4   zerosrN   r^   correct_output_scaler   altup_num_inputsr   r   r   r   r%   r   router_normr3   r]   r   r8   r!   r"   r2      sF   

zGemma3nAltUp.__init__r:   r*   c                 C   s>   |  || j| j jj }| |\}}t| 	|S r   )
r   r   tor,   r.   r   r4   tanhrF   type_as)r7   r:   router_inputsroutedrd   r!   r!   r"   compute_router_modalities   s
   z&Gemma3nAltUp.compute_router_modalitieshidden_statesc                 C   s   |  || jj }| jjdur| jjj| jj | jj | |\}}|jg |j	dd | jj
| jj
R  ddd}t|ddd|}|ddd}||7 }| |S )zPredicts the output of a layer using a trainable map.
        hidden_states: [num_altup_inputs, num_tokens, hidden_size]
        Nr;   r   rT   r   )r   r    altup_active_idxaltup_coef_clipr   r,   dataclamp_r>   r<   r   permuter4   matmulr=   r   )r7   r   
modalities	all_coefsrd   predictionsr!   r!   r"   predict   s2   


zGemma3nAltUp.predictr   	activatedc                 C   s   |  |}||| jj  }|| jjdd}| jjdur*| jjj	| jj | jj | |\}}|d 
ddd}t||}||7 }| |S )z:Corrects the predictions relative to the activated inputs.r   N      ?r   r;   )r   r    r   repeatr   r   r   r,   r   r   r   	unsqueezer4   mulr=   r   )r7   r   r   r   
innovationr   rd   	correctedr!   r!   r"   correct  s&   

zGemma3nAltUp.correctr   c                 C   s   || j |j S )zScales the provided 3D tensor.)r   r   r.   )r7   r   r!   r!   r"   scale_corrected_output%  s   z#Gemma3nAltUp.scale_corrected_outputc                 C   s>   |  |}| j||d}|| jj }| jjr| |}||fS )zPredicts, correct, and optionally scales the output of a layer using trainable maps.

        hidden_states: [num_altup_inputs, num_tokens, hidden_size]
        )r   r   )r   r   r    r   altup_correct_scaler   )r7   r   r   r   r   outputr!   r!   r"   r?   )  s   

zGemma3nAltUp.forwardr   )rB   rC   rD   r   r   r   r   ry   r2   r4   rH   r   r   r   r   r   r?   rI   r!   r!   r8   r"   r      s8    - 
r   c                       sn   e Zd ZdZ		ddedededee ded	df fd
dZ	de
jdee
je
jf ded	e
jfddZ  ZS )Gemma3nAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrM   layer_idr    max_position_embeddingsrR   rS   r*   c           
         s2  t    || _|| _t }|j| _| j| dksJ | j| | _|j| _	t
d| j	| | _| j	|kr=| j	| dks<J n	|| j	 dksFJ |j}t|d||j }|| _| j| j | _| j| j | _d| _t|| j| j| j	|j|td|d| _t| j| j ||j|td|d| _|j| dk| _|j|j }	||	k| _| jsd | _n| jr|	d	 | _n|	d | _| jrt| j| j|j|j d
did| _!nt| j| j|j|j"|j#d| _!| jr|j$nd | _$t%| j| j| j| j| js|n| jd| j$|td|d	| _&t'|j|j(d| _)t'|j|j(d| _*t'|j|j(dd| _+d S )Nr   r   head_dimr   qkv_projrV   o_projsliding_attentionrT   	rope_typedefault)
rotary_dimmax_positionbaserope_scalingrL   attn)num_kv_headsr   	logit_capsliding_window_sizerR   rS   r}   F)r'   r(   r)   ),r1   r2   r   r    r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr   rN   getattrr   q_sizekv_sizescalingr   attention_biasr   r   r   r   layer_types
is_slidingnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerkv_shared_layer_indexr   r   rope_local_base_freq
rotary_emb
rope_thetar   r   r   r   r%   r   q_normk_normv_norm)
r7   r   r    r   rR   rS   tp_sizerN   r   first_kv_shared_layer_idxr8   r!   r"   r2   <  s   

	
	



zGemma3nAttention.__init__r   	positionsforward_batchc                 K   sV  |  |\}}|j| j| j| jgdd\}}}	|d| j| jf}| |}| jr3| j	d ur3d }d }	n|d| j
| jf}| |}|	d| j
| jf}	| |	}	|dd}|d uru|dd}| |||\}}|d| j
| jf}nt|d d d | jf }
| |||
\}}|d| j| jf}| j|||	|| j d}| |\}}|S )Nr;   r_   )r   save_kv_cache)r   splitr   r   	unflattenr   r   r   r   r   r   r   r   flattenr   r4   
zeros_liker   r   )r7   r   r   r   kwargsqkvrd   qkvdummy_kattn_outputr   r!   r!   r"   r?     s<    


zGemma3nAttention.forwardr   )rB   rC   rD   r   rE   r   r   r   ry   r2   r4   rH   r   r   r?   rI   r!   r!   r8   r"   r   9  s4    vr   c                       sb   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
de	j
dede	j
f
ddZ  ZS )Gemma3nDecoderLayerNrM   r   r    rR   rS   r*   c              	      sR  t    |j| _|| _|j| | _|| _t|||j|t	d|d| _
|j| }|j| }t| j||j||t	d|d| _t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _|j| _t||t	d|d| _t||t	d|d| _t| j| jd	|t	d
|d| _t| j| jd	|t	d|d| _t| j|jd| _| j
j | _ d S )N	self_attn)r   r    r   rR   rS   mlp)rN   rO   rP   rQ   rR   rS   r+   altuprS   laurelFper_layer_input_gaterV   per_layer_projection)!r1   r2   rN   r   r   attention_typer    r   r   r   r   rO   activation_sparsity_patternrK   rP   r   r%   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr   r   rz   r   r   r   r   r   post_per_layer_input_normr   )r7   r   r    rR   rS   rO   rQ   r8   r!   r"   r2     sr   


	zGemma3nDecoderLayer.__init__r   r   per_layer_inputr   c                 K   s*  | j |}|| jj }| |}| |}	| jd|||d|}
| |
}
||
 }||	 t	t
d }| |}| |}| |}|| }| j ||}|| jj }| jjra| j |}|| jjj}| |\}}tj|dd}t||}| |\}}| |}|dd   |7  < |S )N)r   r   r          @r   )approximater   r!   )r   r   r    r   r   r   r   r   r4   sqrtr]   r   r   r   r   r   r   r   r   r,   r.   rr   gelumultiplyr   r   )r7   r   r   r   r   r   r   active_predictionactive_prediction_normedlaurel_outputr   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionrd   r!   r!   r"   r?   7  sP   



zGemma3nDecoderLayer.forwardr   )rB   rC   rD   rE   r	   r   r   ry   r2   r4   rH   r   r?   rI   r!   r!   r8   r"   r     s2    Gr   c                       s   e Zd Z		ddedee deddf fddZdej	fd	d
Z
dejfddZdejdejfddZ	ddejdeej dejfddZ		ddejdejdedejdeej dejfddZ  ZS )Gemma3nTextModelNrM   r    rR   rS   r*   c                    s~  t  j d  __ j_ j_t j jjjjd d_	t
 j jd_t j fddtd|d_ j_ j_t j j j jjjd d_tj j j d	td
|d_t
 j jd_tjjd fddtd|d_tjjd fddtd|d_jdtjd d	d jdttdd	d   d S )Nr         ?)embed_scaler+   c                    s   t |  |dS )N)r   r    rR   rS   )r   idxrS   )r    rR   r!   r"   <lambda>  s    z+Gemma3nTextModel.__init__.<locals>.<lambda>layersr   Fper_layer_model_projectionrV   r}   r   c                       t jjd |dS NFrV   r   rN   r
  rR   r7   r!   r"   r        altup_projectionsc                    r  r  r  r
  r  r!   r"   r    r  altup_unembed_projectionsper_layer_projection_scaleg      r/   per_layer_input_scaler   )r1   r2   r    rR   
vocab_sizepad_token_idpadding_idxrJ   rN   embed_tokensr%   r   normr   r   r   r  r   vocab_size_per_layer_inputembed_tokens_per_layerr   r  per_layer_projection_normr   r  r  r3   r4   r]   rsqrt	post_initr   r8   )r    rR   r7   r"   r2   u  st   





zGemma3nTextModel.__init__c                 C   s   | j S r   )r  r7   r!   r!   r"   get_input_embeddings  s   z%Gemma3nTextModel.get_input_embeddingsc                 C      t |  jS r   next
parametersr.   r"  r!   r!   r"   r.        zGemma3nTextModel.dtype	input_idsc                 C   s*   |  |}|jg |j| jj| jR  S r   )r  r>   r<   r    r   r   )r7   r)  
embeddingsr!   r!   r"   get_per_layer_inputs  s   
z%Gemma3nTextModel.get_per_layer_inputsinputs_embedsper_layer_inputsc                 C   s   |  |\}}|| j|j9 }|jg |jd d | jj| jR  }| 	|}|d u r/|S |j|jkrB|dd | jjd d f }|| | j
|j S )Nr;   .)r  r  ro   r.   r>   r<   r    r   r   r  r  )r7   r,  r-  r   rd   r!   r!   r"   project_per_layer_inputs  s&   
z)Gemma3nTextModel.project_per_layer_inputsr   r   input_embedsc                 K   s  |d u |d uA rt d|d ur| |}| |}| ||}| dkr+|d}tj|d dddd }tt	|j
j}|}	|	g}
td| jjD ].}| j|d  |	\}}||	j
}tj|d dddd }||t||  }|
| qMtj|
dd	}t| jD ]\}}|d d |d d f }|d||||d
|}qtj|d d dddd }|d g}
td| jjD ]0}| j|d  || \}}||	j
}tj|d dddd }||t||  }|
| qt|
}tj|dd	}| |}|S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rT   r;   Trh   r  r_   )r   r   r   r   r!   )r[   r  r+  r.  r'   r   r4   rp   r]   finfor.   minranger    r   r  ro   maximumappendstack	enumerater  r  r  )r7   r)  r   r   r/  r-  r   target_magnitudeepsilon_tensorhidden_states_0temp_hidden_statesi
altup_projrd   current_hidden_statenew_magnituder   	layer_idxlayerr   altup_unemb_projr!   r!   r"   r?     sl   	






zGemma3nTextModel.forwardr   r   NN)rB   rC   rD   r   r   r   ry   r2   r   	Embeddingr#  r4   r.   
LongTensorrH   r+  r.  r   r?   rI   r!   r!   r8   r"   r  t  sL    _
 r  c                       s   e Zd ZeZdgZddiZddgdgfiZeZdZg dZ	dd	d
dddZ
g dddgdZg dZi Zg ZdZ		d,dedee deddf fddZdejfddZdd Zdejfd d!Ze 		d-d"ejd#ejd$ed%ejd&eej defd'd(Zd)ee eejf  fd*d+Z!  Z"S ).Gemma3nForCausalLMlm_head.weightlm_headcolwise_repr   logitslanguage_model)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)	.qkv_projr   )rK  r   )rK  rT   ).gate_up_projr   )rL  r   ).q_proj.k_proj.v_proj
.gate_proj.up_proj)rM  rN  rO  rP  rQ  )rK  rL  )rK  z.o_projrL  z
.down_projTNrM   r    rR   rS   r*   c                    sv   t  j|d || _|| _t||td|d| _t|| _| jj	r'| jj
| _nt|j|j|td|d| _|   d S )Nr   model)r    rR   rS   rG  )rR   rS   )r1   r2   r    rR   r  r   rR  r   logits_processortie_word_embeddingsr  rG  r   r  rN   r!  r   r8   r!   r"   r2     s$   
zGemma3nForCausalLM.__init__c                 C   s   | j jS r   )rR  r  r"  r!   r!   r"   r#    s   z'Gemma3nForCausalLM.get_input_embeddingsc                 C   s
   t | jS r   )r#   r    r"  r!   r!   r"   r#     r$   z4Gemma3nForCausalLM.get_attention_sliding_window_sizec                 C   r$  r   r%  r"  r!   r!   r"   r.     r(  zGemma3nForCausalLM.dtyper)  r   r   r/  r-  c                 K   s.   | j |||||fi |}| ||| j j|S r   )rR  rS  r  )r7   r)  r   r   r/  r-  r   r   r!   r!   r"   r?     s   
	zGemma3nForCausalLM.forwardweightsc                 C   s   g d}t |  }t }|D ]l\}}|dd}|D ]-\}}}	||vr%q|||}|dr5||vr5q||vr:q|| }
|
j}||
||	  n-d|v rNq|drX||vrXqt||}|d u rbq||vrgq|| }
t|
dt}||
| |	| q|S )N))rK  rM  r   )rK  rN  r   )rK  rO  r   )rL  rP  r   )rL  rQ  r   zmodel.language_model.zmodel.z.biasrF  weight_loader)
dictnamed_parameterssetreplaceendswithrV  r   r   r   add)r7   rU  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_name
shard_nameshard_idparamrV  r!   r!   r"   load_weights  s>   

zGemma3nForCausalLM.load_weightsr   rB  )#rB   rC   rD   r   config_class_tied_weights_keys_tp_plan_pp_planbase_model_prefix#default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingpacked_modules_mappingsupported_lora_modulesembedding_modulesembedding_padding_modulessupports_lorar   r   ry   r2   r   rC  r#  r#   r4   r.   no_gradrH   r   r   r?   r   r   rf  rI   r!   r!   r8   r"   rE  Q  sl    
	$rE  T)exist_ok);typingr   r   r   r   r4   torch.nn.functionalr   
functionalrr   transformersr   r   r	   r
   sglang.srt.distributedr   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.models.gemma3_causalr   sglang.srt.utilsr   r   r#   r%   rJ   ModulerK   rz   r   r   r   r  rE  
EntryClassregisterr!   r!   r!   r"   <module>   sF    C'  7  ^ !