o
    
۾i<                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ee6Z7ede8dB de8dB de
j9fddZ:G dd  d e
j9Z;G d!d" d"e
j9Z<G d#d$ d$e
j9Z=eG d%d& d&e
j9Z>G d'd( d(e
j9e.e/Z?dS ))z?Inference-only Gemma model compatible with HuggingFace weights.    )Iterable)cache)islice)AnyN)nn)GemmaConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)init_logger)
GeluAndMul)	Attention)GemmaRMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)VocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix
hidden_acthidden_activationreturnc                 C   s\   |d u r| d urt d| |  tddS |dkrtddS |dkr&tddS td|  d)	Na~  Gemma's activation function was incorrectly set to exact GeLU in the config JSON file when it was initially released. Changing the activation function to approximate GeLU (`gelu_pytorch_tanh`). If you want to use the legacy `%s`, edit the config JSON to set `hidden_activation=%s` instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.tanh)approximategelu_pytorch_tanhgelunonezActivation function z# is not supported for Gemma models.)loggerwarningr   
ValueError)r"   r#    r-   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/gemma.py_get_gemma_act_fn<   s   



r/   c                       sX   e Zd Z				ddedededB dedB dedB ded	df fd
dZdd Z  ZS )GemmaMLPN hidden_sizeintermediate_sizer"   r#   quant_configprefixr$   c                    sT   t    t||gd d|| dd| _t||d|| dd| _t||| _d S )N   Fz.gate_up_projbiasr4   r5   z
.down_proj)super__init__r   gate_up_projr   	down_projr/   act_fn)selfr2   r3   r"   r#   r4   r5   	__class__r-   r.   r:   [   s    
	zGemmaMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r;   r=   r<   )r>   xgate_up_r-   r-   r.   forwardu   s   
zGemmaMLP.forward)NNNr1   )	__name__
__module____qualname__intstrr   r:   rE   __classcell__r-   r-   r?   r.   r0   Z   s*    r0   c                       s|   e Zd Z				ddededededeeef d	ed
edB dedB deddf fddZ	de
jde
jde
jfddZ  ZS )GemmaAttention    Nr1   r2   	num_headsnum_kv_headshead_dimrope_parametersmax_position_embeddingscache_configr4   r5   r$   c
              	      s4  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _|| _	| j| j	 | _
| j| j	 | _| j	d | _t|| j	| j| jd||	 dd| _t| j| j	 |d||	 dd| _t| j	||dd	| _t| j| j	| j| j|||	 d
d| _d S )Nr   r   g      Fz	.qkv_projr7   z.o_projT)max_positionrQ   is_neox_stylez.attn)rO   rS   r4   r5   )r9   r:   r2   r   total_num_headsrN   total_num_kv_headsmaxrO   rP   q_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr   attn)r>   r2   rN   rO   rP   rQ   rR   rS   r4   r5   tp_sizer?   r-   r.   r:   }   s\   

	
zGemmaAttention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)r\   splitrY   rZ   r^   r_   r]   )
r>   ra   rb   qkvrD   qkvattn_outputoutputr-   r-   r.   rE      s    zGemmaAttention.forward)rM   NNr1   )rF   rG   rH   rI   dictrJ   r   r	   r   r:   torchTensorrE   rK   r-   r-   r?   r.   rL   |   sB    
	
BrL   c                       sr   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )GemmaDecoderLayerNr1   configrS   r4   r5   r$   c                    s   t    |j| _t| j|j|j|j|j|j||| dd	| _	t
| j|j|jt|dd || dd| _t|j|jd| _t|j|jd| _d S )Nz
.self_attn)	r2   rN   rO   rP   rR   rQ   rS   r4   r5   r#   z.mlp)r2   r3   r"   r#   r4   r5   eps)r9   r:   r2   rL   num_attention_headsnum_key_value_headsrP   rR   rQ   	self_attnr0   r3   r"   getattrmlpr   rms_norm_epsinput_layernormpost_attention_layernorm)r>   rp   rS   r4   r5   r?   r-   r.   r:      s2   

zGemmaDecoderLayer.__init__ra   rb   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)ra   rb   )ry   ru   rz   rw   )r>   ra   rb   r{   r-   r-   r.   rE      s   
zGemmaDecoderLayer.forward)NNr1   )rF   rG   rH   r   r	   r   rJ   r:   rm   rn   tuplerE   rK   r-   r-   r?   r.   ro      s0    !ro   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )
GemmaModelr1   r5   vllm_configr5   c                   s   t    |jj|j |j| _tjj	| _
tj fdd| dd\| _| _| _tj	jd| _| jj	d }| jdt|dd	 td
dgj	| _d S )Nc                    s   t  | dS )Nr~   )ro   r~   rS   rp   r4   r-   r.   <lambda>  s    z%GemmaModel.__init__.<locals>.<lambda>z.layersr~   rq   g      ?
normalizerF)
persistentrb   r{   )r9   r:   model_config	hf_configrS   r4   rp   r   
vocab_sizer2   embed_tokensr    num_hidden_layersstart_layer	end_layerlayersr   rx   normregister_bufferrm   tensorr   make_empty_intermediate_tensors)r>   r   r5   r   r?   r   r.   r:     s(   


zGemmaModel.__init__	input_idsr$   c                 C   s
   |  |S rA   )r   r>   r   r-   r-   r.   embed_input_ids'  s   
zGemmaModel.embed_input_idsNra   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}|| j9 }d }n|d }|d }t| j| j| jD ]
}||||\}}q)t  js?t	||dS | 
||\}}|S )Nrb   r{   )rb   r{   )r   is_first_rankr   r   r   r   r   r   is_last_rankr   r   )	r>   r   ra   r   r   rb   r{   layerrD   r-   r-   r.   rE   *  s(   


zGemmaModel.forwardweightsc                 C   s   g d}t |  }t }|D ]Y\}}|D ].\}}}	||vrq|||}|dr/||vr/qt|| r5q|| }
|
j}||
||	  n|drN||vrNqt|| rTq|| }
t|
dt}||
| |	| q|S )N))r\   q_projrg   )r\   k_projrh   )r\   v_projri   )r;   	gate_projr   )r;   up_projr   z.biasweight_loader)
rl   named_parameterssetreplaceendswithr   r   rv   r   add)r>   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_name
shard_nameshard_idparamr   r-   r-   r.   load_weightsH  s2   


zGemmaModel.load_weightsrA   )rF   rG   rH   r
   rJ   r:   rm   rn   r   r   rE   r   r|   r   r   rK   r-   r-   r?   r.   r}     s      
,r}   c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			ddejdB dejde
dB dejdB deje
B f
ddZdejdejdB fddZdeeeejf  dee fddZ  ZS )GemmaForCausalLM)r   r   r   r   r   )r\   r;   r1   r~   r   r5   c                   s\   t    |jj}|j}|| _|jsJ || _t|t|dd| _	t
|j| _| j	j| _d S )Nmodel)r   r5   )r9   r:   r   r   r4   rp   tie_word_embeddingsr}   r!   r   r   r   logits_processorr   )r>   r   r5   rp   r4   r?   r-   r.   r:   |  s   


zGemmaForCausalLM.__init__r   r$   c                 C   s   | j |S rA   )r   r   r   r-   r-   r.   r     s   z GemmaForCausalLM.embed_input_idsNra   r   r   c                 C   s   |  ||||}|S rA   )r   )r>   r   ra   r   r   rb   r-   r-   r.   rE     s   zGemmaForCausalLM.forwardrb   c                 C   s   |  | jj|}|S rA   )r   r   r   )r>   rb   logitsr-   r-   r.   compute_logits  s   zGemmaForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   rp   r   r   )r>   r   loaderr-   r-   r.   r     s
   
zGemmaForCausalLM.load_weights)NN)rF   rG   rH   packed_modules_mappingr
   rJ   r:   rm   rn   r   r   rE   r   r   r|   r   r   rK   r-   r-   r?   r.   r   o  s6    

,r   )@__doc__collections.abcr   	functoolsr   	itertoolsr   typingr   rm   r   transformersr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r   r    r!   rF   r*   rJ   Moduler/   r0   rL   ro   r}   r   r-   r-   r-   r.   <module>   sN   "P9i