"""Inference-only Qwen3 model compatible with HuggingFace weights."""

from collections.abc import Iterable
from typing import Any

import torch
from torch import nn
from transformers import Qwen3Config

from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (get_pp_group,
                              get_tensor_model_parallel_world_size)
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.v1.attention.backend import AttentionType

from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
from .qwen2 import Qwen2Model
from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                    maybe_prefix)

logger = init_logger(__name__)


class Qwen3Attention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_parameters: dict,
        max_position: int = 4096 * 32,
        head_dim: int | None = None,
        rms_norm_eps: float = 1e-6,
        qkv_bias: bool = False,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        dual_chunk_attention_config: dict[str, Any] | None = None,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = head_dim or hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.dual_chunk_attention_config = dual_chunk_attention_config

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=qkv_bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            max_position=max_position,
            rope_parameters=rope_parameters,
            dual_chunk_attention_config=dual_chunk_attention_config,
        )
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            attn_type=attn_type,
            **{
                "layer_idx": extract_layer_index(prefix),
                "dual_chunk_attention_config": dual_chunk_attention_config,
            } if dual_chunk_attention_config else {},
        )
        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        # Qwen3 applies RMSNorm to q and k per attention head (qk-norm)
        # before the rotary embedding is applied.
        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
                           self.head_dim)
        q_by_head = self.q_norm(q_by_head)
        q = q_by_head.view(q.shape)
        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
                           self.head_dim)
        k_by_head = self.k_norm(k_by_head)
        k = k_by_head.view(k.shape)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.o_proj(attn_output)
        return output
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )Qwen3DecoderLayerNr"   configr+   r,   r-   r0   c                    s   t    |j| _t|dd t|dd }t|ddrtj}ntj}t| j|j	|j
|j|jt|ddt|dd |||j| d	||d
| _t| j|j|j|| dd| _t|j|jd| _t|j|jd| _d S )Ni@B )default_thetar/   	is_causalTattention_biasFr(   z
.self_attn)r#   r$   r'   r%   r)   r*   r(   r+   r,   r&   r-   r.   r/   z.mlp)r#   intermediate_size
hidden_actr,   r-   r3   )r5   r6   r#   r   getattrr   r]   ENCODER_ONLYr!   num_attention_headsmax_position_embeddingsnum_key_value_headsr)   r&   	self_attnQwen3MLPrk   rl   mlpr   input_layernormpost_attention_layernorm)rC   rg   r+   r,   r-   r/   r.   rE   rG   rH   r6      sF   


zQwen3DecoderLayer.__init__rI   rJ   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rI   rJ   )ru   rr   rv   rt   )rC   rI   rJ   rw   rG   rG   rH   rY      s   
zQwen3DecoderLayer.forward)NNr"   )rZ   r[   r\   r   r   r   rb   r6   rc   rd   tuplerY   re   rG   rG   rE   rH   rf      s0    2rf   	attentionrK   )	input_idsrI   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s,   e Zd Zdddedef fddZ  ZS )
ALL_DECODER_LAYER_TYPES = {
    "attention": Qwen3DecoderLayer,
}


@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        # positions may be of shape (3, seq_len) when M-RoPE is enabled,
        # otherwise (seq_len, ), so the dynamic dim is the last one.
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    })
class Qwen3Model(Qwen2Model):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config,
                         prefix=prefix,
                         decoder_layer_type=Qwen3DecoderLayer)


class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = Qwen3Model(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))

        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(config.vocab_size,
                                              config.hidden_size,
                                              quant_config=quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "lm_head"))
        else:
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)

        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
        self.model.aux_hidden_state_layers = layers

    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
        num_layers = len(self.model.layers)
        return (2, num_layers // 2, num_layers - 3)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states

    def compute_logits(
            self, hidden_states: torch.Tensor) -> torch.Tensor | None:
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def load_weights(
            self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)
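
# Illustrative usage (not part of this module): Qwen3ForCausalLM is resolved
# through vLLM's model registry rather than constructed directly, so the usual
# entry point is the engine API. Assuming a standard vLLM install and a Qwen3
# checkpoint name such as "Qwen/Qwen3-8B":
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="Qwen/Qwen3-8B")
#     out = llm.generate(["Give me a haiku about GPUs."],
#                        SamplingParams(temperature=0.7, max_tokens=64))
#     print(out[0].outputs[0].text)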