o
    پi6e                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlZd dlmZ d dl	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z)m*Z* dd Z+de,de-fddZ.G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd  d ej4Z5G d!d" d"eZ6G d#d$ d$eZ7e7Z8dS )%    N)IterableOptionalSetTuple)nn)ROPE_INIT_FUNCTIONSGemma3TextConfigPretrainedConfigPreTrainedModel)$get_tensor_model_parallel_world_size)
GeluAndMul)Gemma3RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)AttentionTypeRadixAttention)apply_rotary_pos_emb)ParallelLMHead)ForwardBatch)default_weight_loadermaybe_remap_kv_scale_name)
add_prefixmake_layersc                 C   s
   | j d S )N   )sliding_windowconfig r    S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/gemma3_causal.py!get_attention_sliding_window_size2      
r"   prefixreturnc              	   C   sR   |  d}|D ]}|dr&| dd }zt|W   S  ty%   Y qw qdS )z-Extract the layer index from a prefix string..zlayers.)split
startswithint
ValueError)r$   partspart	layer_strr    r    r!   extract_layer_index8   s   

r/   c                       sV   e Zd Z		ddedededee deddf fd	d
Zdej	dej	fddZ
  ZS )	Gemma3MLPN hidden_sizeintermediate_sizehidden_activationquant_configr$   r%   c                    s`   t    t||gd d|td|d| _t||d|td|d| _|dkr*tdt | _	d S )N   Fgate_up_projbiasr5   r$   	down_projgelu_pytorch_tanhzyGemma3 uses `gelu_pytorch_tanh` as the hidden activation function. Please set `hidden_activation` to `gelu_pytorch_tanh`.)
super__init__r   r   r7   r   r:   r+   r   act_fn)selfr2   r3   r4   r5   r$   	__class__r    r!   r=   F   s(   
zGemma3MLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r7   r>   r:   )r?   rB   gate_up_r    r    r!   forwarde   s   
zGemma3MLP.forwardNr1   )__name__
__module____qualname__r*   strr   r   r=   torchTensorrF   __classcell__r    r    r@   r!   r0   E   s"    r0   c                       sj   e Zd Z		ddedededee deddf fd	d
Zde	j
dee	j
e	j
f dede	j
fddZ  ZS )Gemma3AttentionNr1   layer_idr   max_position_embeddingsr5   r$   r%   c           	         s  t    || _|| _t }|j| _| j| dksJ | j| | _|j| _	t
d| j	| | _| j	|kr=| j	| dks<J n	|| j	 dksFJ |j}t|d||j }|| _| j| j | _| j| j | _|jd | _t|| j| j| j	|j|td|d| _t| j| j ||j|td|d| _|j| dk| _| jr|j| _d	d
i| _t|| _n|j| _|j| _d | _t | j| j| j| j|d| j|td|t!j"d
| _#t$|j|j%d| _&t$|j|j%d| _'d S )Nr   r   head_dimg      qkv_projr8   o_projsliding_attention	rope_typedefaultg        attn)num_kv_headsrP   	logit_capsliding_window_sizer5   r$   	attn_type)dimeps)(r<   r=   rP   r   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxrY   r2   getattrrR   q_sizekv_sizequery_pre_attn_scalarscalingr   attention_biasr   rS   r   rT   layer_types
is_slidingrope_local_base_freq
rope_thetarope_scalingr"   r   r   r   DECODER_BIDIRECTIONALrX   r   rms_norm_epsq_normk_norm)	r?   rP   r   rQ   r5   r$   tp_sizer2   rR   r@   r    r!   r=   m   st   

	

zGemma3Attention.__init__hidden_statesposition_embeddingsforward_batchc                 K   s  |  |\}}|j| j| j| jgdd\}}}	|d| j| jf}|ddd}| 	|}|d| j
| jf}|ddd}| |}|\}
}t|||
|\}}|dddd}|dddd}| j|||	|d}| dkr|jd dkr|d}|d	d}| |\}}|S )
Nr'   r]   r   r   r6      )rw      )rS   r(   rf   rg   	unflattenra   rR   	transpose	unsqueezerr   rY   rs   r   permuterX   r]   shapesqueezeflattenrT   )r?   ru   rv   rw   kwargsqkvrE   qkvcossinattn_outputoutputr    r    r!   rF      s$    


zGemma3Attention.forwardrG   )rH   rI   rJ   r*   r   r   r   rK   r=   rL   rM   r   r   rF   rN   r    r    r@   r!   rO   l   s2    ]rO   c                       s   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
de	j
de	j
dedee	jeee	je	jf  f fddZ  ZS )Gemma3DecoderLayerNr1   rP   r   r5   r$   r%   c                    s   t    |j| _t|||j|td|d| _|j| _t| j|j|j	|td|d| _
t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _| jj| _|| _d S )N	self_attn)rP   r   rQ   r5   r$   mlp)r2   r3   r4   r5   r$   r^   )r<   r=   r2   rO   rQ   r   r   r0   r3   r4   r   r   rq   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormrl   rP   )r?   rP   r   r5   r$   r@   r    r!   r=      s>   


zGemma3DecoderLayer.__init__	positionsru   position_embeddings_globalposition_embeddings_localrw   c           
      K   s   |}|  |}| jjr|}n|}| jd||||d|}| |}|| }|}| |}| |}| |}|| }|f}	|	S )N)r   ru   rv   rw   r    )r   r   rl   r   r   r   r   )
r?   r   ru   r   r   rw   r   residualrv   outputsr    r    r!   rF     s,   




zGemma3DecoderLayer.forwardrG   )rH   rI   rJ   r*   r	   r   r   rK   r=   rL   rM   r   tupleFloatTensorrF   rN   r    r    r@   r!   r      s:    'r   c                       s<   e Zd Zd	def fddZdd Ze dd Z  Z	S )
Gemma3RotaryEmbeddingNr   c                    s   t    t|dr|jd ur|jd|jdd| _nd| _| jd u r(d| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nro   rV   typerW   inv_freqF
persistent)r<   r=   hasattrro   getrV   rQ   max_seq_len_cachedoriginal_max_seq_lenr   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r?   r   devicer   r@   r    r!   r=   E  s   

zGemma3RotaryEmbedding.__init__c                 C   s   t |d }|| jkr#| j| j||d\}| _| jd|dd || _|| jk rD| j| jkrF| j	|| _| jd| jdd | j| _dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   )seq_lenr   Fr   N)
rL   rd   r   r   r   r   r   r   r   to)r?   position_idsr   r   r   r    r    r!   _dynamic_frequency_update^  s    

z/Gemma3RotaryEmbedding._dynamic_frequency_updatec           
      C   s  d| j v r| j||jd | jd d d d f  |jd dd}|d d d d d f  }|jj}t|t	r=|dkr=|nd}t
j|dd	) | |j|  dd
}t
j||fdd}| }| }	W d    n1 srw   Y  || j }|	| j }	|j|jd|	j|jdfS )Ndynamic)r   r   r'   r   mpscpuF)device_typeenabledr6   rx   )dtype)rV   r   r   r   floatexpandr   r   
isinstancerK   rL   autocastr   r}   catr   r   r   r   )
r?   rB   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   r    r    r!   rF   x  s.   
&

	
zGemma3RotaryEmbedding.forwardrC   )
rH   rI   rJ   r   r=   r   rL   no_gradrF   rN   r    r    r@   r!   r   D  s
    r   c                
       sN   e Zd ZdZ	ddedededee f fddZd	ej	f fd
dZ
  ZS )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s   t  ||| || _d S rC   )r<   r=   r   )r?   r   r   r   r   r@   r    r!   r=     s   
z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j S rC   )r<   rF   r   )r?   r   r@   r    r!   rF     s   z%Gemma3TextScaledWordEmbedding.forward)r   )rH   rI   rJ   __doc__r*   r   r   r=   rL   rM   rF   rN   r    r    r@   r!   r     s    	
r   c                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )Gemma3TextModelNr1   r   r5   r$   r%   c                    s   t  j d  | _| _ j| _ j| _t j j| j| jjd d| _	t
 j jd| _t d| _d| _t   j _ddi _t d| _t j fdd	td
|d| _t
 j jd| _|   d S )Nr   g      ?)r   r   FrV   rW   c                    s   t |  |dS )N)rP   r   r5   r$   )r   )idxr$   r   r5   r    r!   <lambda>  s    z*Gemma3TextModel.__init__.<locals>.<lambda>layersr$   )r<   r=   r   r5   pad_token_idr   
vocab_sizer   r2   embed_tokensr   rq   normr   
rotary_embgradient_checkpointingcopydeepcopyrm   rn   ro   rotary_emb_localr   num_hidden_layersr   r   	post_initr?   r   r5   r$   r@   r   r!   r=     s2   



zGemma3TextModel.__init__r   r   rw   input_embedsc              	   K   s   |d u r
|  |}n|}| dkrt|d}| ||}| ||}| jD ]}	|	d|||||d|}
|
d }q'| |}|S )Nr   s -> 1 sr   r   r   ru   rw   r   r    )r   r]   einops	rearranger   r   r   r   )r?   r   r   rw   r   r   ru   r   r   layerlayer_outputsr    r    r!   rF     s(   


zGemma3TextModel.forwardrG   rC   )rH   rI   rJ   r   r   r   rK   r=   rL   rM   r   rF   rN   r    r    r@   r!   r     s0    1r   c                       sL  e Zd ZeZdgZddiZddgdgfiZeZdZg dZ	dd	d
dddZ
g dddgdZg dZi Zg ZdZ		d.dedee deddf fddZdejfddZdd Zdejfd d!Ze 	d/d"ejd#ejd$ed%ejdef
d&d'Ze 	d/d"ejd#ejd$ed(ee e f d%ejf
d)d*Z!d+e"eeejf  fd,d-Z#  Z$S )0Gemma3ForCausalLMlm_head.weightlm_headcolwise_repru   logitslanguage_model)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)rS   r   )rS   r   )rS   r6   )r7   r   )r7   r   )q_projk_projv_proj	gate_projup_proj)r   r   r   r   r   )rS   r7   )rS   rT   r7   r:   TNr1   r   r5   r$   r%   c                    sv   t  j|d || _|| _t||td|d| _t|| _| jj	r'| jj
| _nt|j|j|td|d| _|   d S )Nr   modelr   r   )r5   r$   )r<   r=   r   r5   r   r   r   r   logits_processortie_word_embeddingsr   r   r   r   r2   r   r   r@   r    r!   r=   .  s    
zGemma3ForCausalLM.__init__c                 C   s   | j jS rC   )r   r   r?   r    r    r!   get_input_embeddingsG  s   z&Gemma3ForCausalLM.get_input_embeddingsc                 C   s
   t | jS rC   )r"   r   r   r    r    r!   r"   J  r#   z3Gemma3ForCausalLM.get_attention_sliding_window_sizec                 C   s   t |  jS rC   )next
parametersr   r   r    r    r!   r   M  s   zGemma3ForCausalLM.dtyper   r   rw   r   c                 K   s,   | j ||||fi |}| ||| j j|S rC   )r   r   r   )r?   r   r   rw   r   r   ru   r    r    r!   rF   P  s   	zGemma3ForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr9|d u r| j |}n|}| dkr!t|d}| j ||}	| j ||}
||_||	|
d|_t	||D ] }| j j
| }||jd |jd |jd |j|d}|d |_q>|| j jjkr{| j |j|_| ||j| j j|}|S d }|S )	Nr   r   r   )r   r   r   r   r   r   r   )r   r   r]   r   r   r   r   ru   model_specific_statesranger   r   r   r   r   )r?   r   r   rw   r   r   startendru   r   r   ir   layer_outputresultr    r    r!   forward_split_prefilla  sR   		z'Gemma3ForCausalLM.forward_split_prefillweightsc                 C   s   g d}t |  }t }|D ]\\}}|D ](\}}}	||vrq|||}|dr/||vr/q|| }
|
j}||
||	  n(d|v rCq|drM||vrMqt||}|d u rWq|| }
t|
dt}||
| |	| q|S )N))rS   r   r   )rS   r   r   )rS   r   r   )r7   r   r   )r7   r   r   z.biasr   weight_loader)
dictnamed_parameterssetreplaceendswithr   r   re   r   add)r?   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_name
shard_nameshard_idparamr   r    r    r!   load_weights  s4   

zGemma3ForCausalLM.load_weightsrG   rC   )%rH   rI   rJ   r   config_class_tied_weights_keys_tp_plan_pp_planbase_model_prefix#default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingpacked_modules_mappingsupported_lora_modulesembedding_modulesembedding_padding_modulessupports_lorar   r   rK   r=   r   	Embeddingr   r"   rL   r   r   rM   r   r   rF   r   r*   r   r   r
  rN   r    r    r@   r!   r     s    

$>r   )9r   typingr   r   r   r   r   rL   r   transformersr   r   r	   r
   sglang.srt.distributedr   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   r   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr   r   r"   rK   r*   r/   Moduler0   rO   r   r   r  r   r   r   
EntryClassr    r    r    r!   <module>   s<   ' QTO Y