o
    پiC                     @   sL  d dl mZmZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z% e$ Z&dd Z'G dd dej(Z)G dd dej(Z*G dd dej(Z+G dd dej(Z,G dd dej(Z-e-Z.dS )    )IterableOptionalSetTupleN)nn)PretrainedConfig)$get_tensor_model_parallel_world_size)
GeluAndMul)GemmaRMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)VocabParallelEmbedding)ForwardBatch)default_weight_loadermaybe_remap_kv_scale_name)
add_prefixis_npumake_layersc                 C   s
   | j d S )N   )sliding_window)config r   L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/gemma2.py!get_attention_sliding_window_size1      
r   c                       sZ   e Zd Z		ddededededee ded	df fd
dZdej	d	ej	fddZ
  ZS )	Gemma2MLPN hidden_sizeintermediate_size
hidden_acthidden_activationquant_configprefixreturnc                    st   t    t||gd d|td|d| _t||d|td|d| _||  kr/dks4td tdt | _	d S )N   Fgate_up_projbiasr&   r'   	down_projgelu_pytorch_tanhzGemma2 uses `gelu_pytorch_tanh` as the hidden activation function. Please set `hidden_act` and `hidden_activation` to `gelu_pytorch_tanh`.)
super__init__r   r   r*   r   r-   
ValueErrorr	   act_fn)selfr"   r#   r$   r%   r&   r'   	__class__r   r   r0   6   s0   
	zGemma2MLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r*   r2   r-   )r3   r6   gate_up_r   r   r   forwardV   s   
zGemma2MLP.forwardNr!   )__name__
__module____qualname__intstrr   r   r0   torchTensorr:   __classcell__r   r   r4   r   r    5   s&     r    c                       st   e Zd Z		ddedededededed	ed
edee deddf fddZ	de
jde
jdede
jfddZ  ZS )Gemma2AttentionNr!   layer_idr   r"   	num_headsnum_kv_headshead_dimmax_position_embeddings
rope_thetar&   r'   r(   c                    s  t    || _|| _|| _t }|| _| j| dksJ | j| | _|| _| j|kr5| j| dks4J n	|| j dks>J t	d| j| | _
|| _| j| j | _| j
| j | _|jd | _|| _t|| j| j| j|j|	td|
d| _t| j| j ||j|	td|
d| _trd| jjvrt| j| j|| jdd	| _| jj}nt| j| j|| jdtjd
| _d}|d dkot|d}t| j| j| j| j
|||rt |nd |	td|
d	| _!d S )Nr   r   g      qkv_projr+   o_projGemma2ForSequenceClassificationT)
rotary_dimmax_positionbaseis_neox_style)rN   rO   rP   rQ   dtypeg        r)   r   attn)rG   rE   	logit_capsliding_window_sizer&   r'   )"r/   r0   rE   r   r"   r   total_num_headsrF   total_num_kv_headsmaxrG   rH   q_sizekv_sizequery_pre_attn_scalarscalingrJ   r   attention_biasr   rK   r   rL   _is_npuarchitecturesr   
rotary_embattn_logit_softcappingrA   float32hasattrr   r   rS   )r3   rE   r   r"   rF   rG   rH   rI   rJ   r&   r'   tp_sizerT   use_sliding_windowr4   r   r   r0   ^   s   

	


zGemma2Attention.__init__	positionshidden_statesforward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )N)dim)rK   splitrY   rZ   r`   rS   rL   )r3   rf   rg   rh   qkvr9   qkvattn_outputoutputr   r   r   r:      s    zGemma2Attention.forwardr;   )r<   r=   r>   r?   r   floatr   r   r@   r0   rA   rB   r   r:   rC   r   r   r4   r   rD   ]   sF    	
]rD   c                       sp   e Zd Z		ddededee deddf
 fdd	Zd
e	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )Gemma2DecoderLayerNr!   rE   r   r&   r'   r(   c                    s   t    || _|j| _t||| j|j|j|j|j|j	|t
d|d
| _|j| _t| j|j|j|j|t
d|d| _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )N	self_attn)
rE   r   r"   rF   rG   rH   rI   rJ   r&   r'   mlp)r"   r#   r$   r%   r&   r'   eps)r/   r0   rE   r"   rD   num_attention_headsnum_key_value_headsrH   rI   rJ   r   rt   r    r#   r$   r%   ru   r
   rms_norm_epsinput_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernorm)r3   rE   r   r&   r'   r4   r   r   r0      sD   
zGemma2DecoderLayer.__init__rf   rg   rh   residualc                 C   sn   |d u r|}|  |}n|  ||\}}| j|||d}| |}| ||\}}| |}| |}||fS )N)rf   rg   rh   )r{   rt   r|   r}   ru   r~   )r3   rf   rg   rh   r   r   r   r   r:      s    


zGemma2DecoderLayer.forwardr;   )r<   r=   r>   r?   r   r   r   r@   r0   rA   rB   r   r   r:   rC   r   r   r4   r   rs      s2    *rs   c                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )Gemma2ModelNr!   r   r&   r'   r(   c                    sv   t     | _t j j| _t j fddt	d|d| _
t j jd| _| jjd }| dt| d S )Nc                    s   t |  dS )N)rE   r   r&   )rs   )idxr'   r   r&   r   r   <lambda>  s
    z&Gemma2Model.__init__.<locals>.<lambda>layersr'   rv         ?
normalizer)r/   r0   r   r   
vocab_sizer"   embed_tokensr   num_hidden_layersr   r   r
   rz   normregister_bufferrA   tensor)r3   r   r&   r'   r   r4   r   r   r0     s   
	zGemma2Model.__init__	input_idsrf   rh   input_embedsc                 C   s   |d u r
|  |}n|}tj| jjd |jd}||9 }d }tt| jD ]}| j| }	|	||||\}}q%| 	||\}}
|S )Nr   rR   )
r   rA   r   r   r"   rR   rangelenr   r   )r3   r   rf   rh   r   rg   r   r   ilayerr9   r   r   r   r:   /  s$   

zGemma2Model.forwardr;   r7   )r<   r=   r>   r   r   r   r@   r0   rA   rB   r   r:   rC   r   r   r4   r   r     s0    $r   c                       s  e Zd Zg dZddddddZg dd	d
gdZg dZi Zg ZdZ			d$de
dee deddf fddZe 	d%dejdejdedejdejf
ddZe 	d%dejdejdedeeef dejf
ddZdd  Zd!eeeejf  fd"d#Z  ZS )&Gemma2ForCausalLM)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)rK   r   )rK   r   )rK   r)   )r*   r   )r*   r   )q_projk_projv_proj	gate_projup_proj)r   r   r   r   r   )rK   r*   )rK   rL   r*   r-   TNr!   r   r&   r'   r(   c                    s:   t    || _|| _t||td|d| _t|| _d S )Nmodelr   )	r/   r0   r   r&   r   r   r   r   logits_processor)r3   r   r&   r'   r4   r   r   r0   x  s   
zGemma2ForCausalLM.__init__r   rf   rh   r   c                 C   s$   |  ||||}| ||| j j|S r7   )r   r   r   )r3   r   rf   rh   r   rg   r   r   r   r:     s   zGemma2ForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr+|d u r| j ||_n||_tj| j jjd tjd}| j|9  _t||D ]}	| j j	|	 }
|
||j||j
\|_|_
q0|| j jjkrf| j |j|j
\|_}| ||j| j j|}|S d }|S )Nr   r   r   )r   r   rg   rA   r   r   r"   float16r   r   r   r   r   r   )r3   r   rf   rh   r   r   startendr   r   r   r9   resultr   r   r   forward_split_prefill  s<   	
	z'Gemma2ForCausalLM.forward_split_prefillc                 C   s
   t | jS r7   )r   r   )r3   r   r   r   r     r   z3Gemma2ForCausalLM.get_attention_sliding_window_sizeweightsc                 C   s   g d}t |  }t }|D ]\\}}|D ](\}}}	||vrq|||}|dr/||vr/q|| }
|
j}||
||	  n(d|v rCq|drM||vrMqt||}|d u rWq|| }
t|
dt}||
| |	| qd S )N))rK   r   rm   )rK   r   rn   )rK   r   ro   )r*   r   r   )r*   r   r   z.biaszlm_head.weightweight_loader)
dictnamed_parameterssetreplaceendswithr   r   getattrr   add)r3   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_name
shard_nameshard_idparamr   r   r   r   load_weights  s4   

zGemma2ForCausalLM.load_weightsr;   r7   )r<   r=   r>   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingpacked_modules_mappingsupported_lora_modulesembedding_modulesembedding_padding_modulessupports_lorar   r   r   r@   r0   rA   no_gradrB   r   r:   r   r?   r   r   r   r   rC   r   r   r4   r   r   L  sp    

2$r   )/typingr   r   r   r   rA   r   transformersr   sglang.srt.distributedr   sglang.srt.layers.activationr	   sglang.srt.layers.layernormr
   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr   r   r   r^   r   Moduler    rD   rs   r   r   
EntryClassr   r   r   r   <module>   s2   (lF= (