o
    i@                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ee3Z4G dd dej5Z6G dd dej5Z7G dd dej5Z8e	G dd dej5Z9G dd  d ej5e*e+Z:dS )!    )Iterable)isliceN)nn)Gemma2Config)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)init_logger)
GeluAndMul)	Attention)GemmaRMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sZ   e Zd Z		ddedededededB ded	df fd
dZdejd	ejfddZ	  Z
S )	Gemma2MLPN hidden_sizeintermediate_size
hidden_acthidden_activationquant_configprefixreturnc                    sx   t    t||gd d|| dd| _t||d|| dd| _||  kr/dks4td tdtdd	| _d S )
N   Fz.gate_up_projbiasr(   r)   z
.down_projgelu_pytorch_tanhzGemma2 uses `gelu_pytorch_tanh` as the hidden activation function. Please set `hidden_act` and `hidden_activation` to `gelu_pytorch_tanh`.tanh)approximate)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr$   r%   r&   r'   r(   r)   	__class__ W/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.pyr2   >   s0   
	zGemma2MLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r3   r6   r4   )r7   r<   gate_up_r:   r:   r;   forward^   s   
zGemma2MLP.forward)Nr#   )__name__
__module____qualname__intstrr   r2   torchTensorr@   __classcell__r:   r:   r8   r;   r"   =   s&     r"   c                       s|   e Zd Z				ddedededededed	edB d
edB dedB deddf fddZ	de
jde
jde
jfddZ  ZS )Gemma2AttentionNr#   configr$   	num_headsnum_kv_headshead_dimmax_position_embeddingscache_configr(   attn_logits_soft_capr)   r*   c                    sh  t    || _|| _t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|| _
| j| j
 | _| j	| j
 | _|jd | _t|| j
| j| j|j||
 dd| _t| j| j
 ||j||
 dd| _t| j
||jdd| _t|
}|j| d	k}|r|jnd }t| j| j
| j| j	|||	||
 d
d	| _d S )Nr   r   g      z	.qkv_projr,   z.o_projT)max_positionrope_parametersis_neox_stylesliding_attentionz.attn)rL   rO   r(   logits_soft_capper_layer_sliding_windowr)   )r1   r2   rJ   r$   r
   total_num_headsrK   total_num_kv_headsmaxrL   rM   q_sizekv_sizequery_pre_attn_scalarscalingr   attention_biasqkv_projr   o_projr   rR   
rotary_embr   layer_typessliding_windowr   attn)r7   rJ   r$   rK   rL   rM   rN   rO   r(   rP   r)   tp_size	layer_idx
is_slidingrc   r8   r:   r;   r2   f   sh   

	
zGemma2Attention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)r_   splitrZ   r[   ra   rd   r`   )
r7   rh   ri   qkvr?   qkvattn_outputoutputr:   r:   r;   r@      s    zGemma2Attention.forward)NNNr#   )rA   rB   rC   r   rD   r   r   floatrE   r2   rF   rG   r@   rH   r:   r:   r8   r;   rI   e   sF    		
JrI   c                       sr   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )Gemma2DecoderLayerNr#   rJ   rO   r(   r)   r*   c                    s   t    |j| _t|| j|j|j|j|j|||j| dd
| _	|j| _t
| j|j|j|j|| dd| _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )Nz
.self_attn)
rJ   r$   rK   rL   rM   rN   rO   r(   rP   r)   z.mlp)r$   r%   r&   r'   r(   r)   eps)r1   r2   r$   rI   num_attention_headsnum_key_value_headsrM   rN   attn_logit_softcapping	self_attnr"   r%   r&   r'   mlpr   rms_norm_epsinput_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernorm)r7   rJ   rO   r(   r)   r8   r:   r;   r2      sB   
zGemma2DecoderLayer.__init__rh   ri   residualc                 C   sl   |d u r|}|  |}n|  ||\}}| j||d}| |}| ||\}}| |}| |}||fS )N)rh   ri   )r}   rz   r~   r   r{   r   )r7   rh   ri   r   r:   r:   r;   r@      s   


zGemma2DecoderLayer.forward)NNr#   )rA   rB   rC   r   r   r   rE   r2   rF   rG   tupler@   rH   r:   r:   r8   r;   rt      s0    )rt   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )Gemma2Modelr#   r)   vllm_configr)   c                   s   t    |jj|j |j| _| _tjj	| _
tj fdd| dd\| _| _| _tj	jd| _| jj	d }| jdt|dd	 td
dgj	| _d S )Nc                    s   t  | dS )Nr   )rt   r   rO   rJ   r(   r:   r;   <lambda>  s    z&Gemma2Model.__init__.<locals>.<lambda>z.layersr   ru   g      ?
normalizerF)
persistentri   r   )r1   r2   model_config	hf_configrO   r(   rJ   r   
vocab_sizer$   embed_tokensr    num_hidden_layersstart_layer	end_layerlayersr   r|   normregister_bufferrF   tensorr   make_empty_intermediate_tensors)r7   r   r)   r   r8   r   r;   r2     s*   


zGemma2Model.__init__	input_idsr*   c                 C   s
   |  |S r=   )r   r7   r   r:   r:   r;   embed_input_ids!  s   
zGemma2Model.embed_input_idsNrh   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}|| j9 }d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q/t  jsEt	||dS | 
||\}}|S )Nri   r   )ri   r   )r	   is_first_rankr   r   r   r   r   r   is_last_rankr   r   )	r7   r   rh   r   r   ri   r   layerr?   r:   r:   r;   r@   $  s*   


zGemma2Model.forwardweightsc                 C   s6  g d}t |  }t }|D ]\}}| jd ur9| j| }r9|| }t|dt}	|d }|	|| || q|D ].\}
}}||vrEq;|||
}|	drU||vrUq;t
|| r[q;|| }|j}	|	|||  n)|	drt||vrtqt||}|d u r~qt
|| rq|| }t|dt}	|	|| || q|S )N))r_   q_projrn   )r_   k_projro   )r_   v_projrp   )r3   	gate_projr   )r3   up_projr   weight_loaderr   z.bias)dictnamed_parameterssetr(   get_cache_scalegetattrr   addreplaceendswithr   r   r   )r7   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_name
shard_nameshard_idr:   r:   r;   load_weightsC  sJ   






zGemma2Model.load_weightsr=   )rA   rB   rC   r   rE   r2   rF   rG   r   r   r@   r   r   r   r   rH   r:   r:   r8   r;   r      s     
,r   c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			ddejdB dejde
dB dejdB deje
B f
ddZdejdejdB fddZdeeeejf  dee fddZ  ZS )Gemma2ForCausalLM)r   r   r   r   r   )r_   r3   r#   r   r   r)   c                   sb   |j j}|j}t   || _|jsJ || _t|t|dd| _	t
|j|jd| _| j	j| _d S )Nmodel)r   r)   )soft_cap)r   r   r(   r1   r2   rJ   tie_word_embeddingsr   r!   r   r   r   final_logit_softcappinglogits_processorr   )r7   r   r)   rJ   r(   r8   r:   r;   r2     s   


zGemma2ForCausalLM.__init__r   r*   c                 C   s   | j |S r=   )r   r   r   r:   r:   r;   r     s   z!Gemma2ForCausalLM.embed_input_idsNrh   r   r   c                 C   s   |  ||||}|S r=   )r   )r7   r   rh   r   r   ri   r:   r:   r;   r@     s   zGemma2ForCausalLM.forwardri   c                 C   s   |  | jj|}|S r=   )r   r   r   )r7   ri   logitsr:   r:   r;   compute_logits  s   z Gemma2ForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   rJ   r   r   )r7   r   loaderr:   r:   r;   r     s
   
zGemma2ForCausalLM.load_weights)NN)rA   rB   rC   packed_modules_mappingr   rE   r2   rF   rG   r   r   r@   r   r   r   r   r   rH   r:   r:   r8   r;   r   x  s6    

,r   );collections.abcr   	itertoolsr   rF   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r    r!   rA   loggerModuler"   rI   rt   r   r   r:   r:   r:   r;   <module>   s8    	(XCw