"""Inference-only Qwen3 model compatible with HuggingFace weights."""

from collections.abc import Iterable
from typing import Any

import torch
from torch import nn
from transformers import Qwen3Config

from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
    get_pp_group,
    get_tensor_model_parallel_world_size,
)
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.encoder_only_attention import (
    Attention,
    EncoderOnlyAttention,
)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    QKVParallelLinear,
    RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.v1.attention.backend import AttentionType

from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
from .qwen2 import Qwen2Model
from .utils import (
    AutoWeightsLoader,
    PPMissingLayer,
    extract_layer_index,
    maybe_prefix,
)

logger = init_logger(__name__)


class Qwen3Attention(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_parameters: dict,
        max_position: int = 4096 * 32,
        head_dim: int | None = None,
        rms_norm_eps: float = 1e-6,
        qkv_bias: bool = False,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        dual_chunk_attention_config: dict[str, Any] | None = None,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = head_dim or hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.dual_chunk_attention_config = dual_chunk_attention_config

        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=qkv_bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        self.rotary_emb = get_rope(
            self.head_dim,
            max_position=max_position,
            rope_parameters=rope_parameters,
            dual_chunk_attention_config=dual_chunk_attention_config,
        )
        # Encoder-only (bidirectional) variants use a dedicated attention
        # class; everything else goes through the standard decoder attention.
        attn_cls = (
            EncoderOnlyAttention
            if attn_type == AttentionType.ENCODER_ONLY
            else Attention
        )
        self.attn = attn_cls(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            attn_type=attn_type,
            **(
                {
                    "layer_idx": extract_layer_index(prefix),
                    "dual_chunk_attention_config": dual_chunk_attention_config,
                }
                if dual_chunk_attention_config
                else {}
            ),
        )
        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
zQwen3Attention.__init__	positionshidden_statesc                 C   s   |  |\}}|j| j| j| jgdd\}}}|jg |jd d |jd | j | jR  }| |}||j}|jg |jd d |jd | j | jR  }	| |	}	|	|j}| 	|||\}}| 
|||}
| |
\}}|S )N)dim)r>   splitr;   r<   viewshaper)   rC   rD   r@   rB   r?   )rE   rL   rM   qkv_qkv	q_by_head	k_by_headattn_outputoutputrJ   rJ   rK   forward   s    0
0
zQwen3Attention.forward)__name__
__module____qualname__r   DECODERintdictfloatboolr   r   strr   r7   torchTensorr\   __classcell__rJ   rJ   rH   rK   r"   ;   s\    	
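

# Worked shape example for the qk-norm reshape above (illustrative numbers,
# not taken from any particular checkpoint): with 8 query heads on this TP
# rank and head_dim = 128, a batch of 5 tokens gives q of shape (5, 1024).
# The first view produces (5, 8, 128), RMSNorm normalizes over the trailing
# head_dim axis independently for every head, and the final view restores
# (5, 1024) so RoPE and the attention backend see the packed layout they
# expect.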
Ur"   c                       sr   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )Qwen3DecoderLayerNr#   configr,   r-   r.   r1   c                    s   t    |j| _t|dd t|dd }t|ddrtj}ntj}t| j|j	|j
|j|jt|ddt|dd |||j| d	||d
| _t| j|j|j|| dd| _t|j|jd| _t|j|jd| _d S )Ni@B )default_thetar0   	is_causalTattention_biasFr)   z
.self_attn)r$   r%   r(   r&   r*   r+   r)   r,   r-   r'   r.   r/   r0   z.mlp)r$   intermediate_size
hidden_actr-   r.   r4   )r6   r7   r$   r   getattrr   r`   rA   r"   num_attention_headsmax_position_embeddingsnum_key_value_headsr*   r'   	self_attnQwen3MLPrn   ro   mlpr   input_layernormpost_attention_layernorm)rE   rj   r,   r-   r.   r0   r/   rH   rJ   rK   r7      sF   


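
    # `forward` below follows vLLM's fused residual convention: calling
    # RMSNorm with (x, residual) returns the normalized tensor together with
    # the updated residual stream (x + residual), so the explicit adds of a
    # vanilla pre-norm transformer block are folded into the norm kernels.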
zQwen3DecoderLayer.__init__rL   rM   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rL   rM   )rw   rt   rx   rv   )rE   rL   rM   ry   rJ   rJ   rK   r\      s   
zQwen3DecoderLayer.forward)NNr#   )r]   r^   r_   r   r   r   re   r7   rf   rg   tupler\   rh   rJ   rJ   rH   rK   ri      s0    2ri   	attentionrN   )	input_idsrL   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s,   e Zd Zdddedef fddZ  ZS )
@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    }
)
class Qwen3Model(Qwen2Model):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(
            vllm_config=vllm_config,
            prefix=prefix,
            decoder_layer_type=Qwen3DecoderLayer,
        )
edef fddZdee	df ddfddZ
dee	df fddZdejdejfddZ		d$dejdB dejdedB dejdB dejeB f
ddZdejdejdB fdd Zd!eeeejf  dee fd"d#Z  ZS )%Qwen3ForCausalLM)q_projk_projv_proj	gate_projup_proj)r>   gate_up_projinput_embeddingsoutput_embeddings)embed_tokenslm_headr#   r   r   r.   c                   s   t    |jj}|j}|| _|| _t|t|dd| _t	 j
r8|jr)| jj| _nt|j|j|t|dd| _nt | _t|j| _| jj| _d S )Nmodel)r   r.   r   )r-   r.   )r6   r7   model_config	hf_configr-   rj   r   r!   r   r	   is_last_ranktie_word_embeddingsr   r   r   
vocab_sizer$   r   r   logits_processormake_empty_intermediate_tensors)rE   r   r.   rj   r-   rH   rJ   rK   r7     s*   


zQwen3ForCausalLM.__init__layers.r1   Nc                 C   s   || j _d S N)r   aux_hidden_state_layers)rE   r   rJ   rJ   rK   set_aux_hidden_state_layers6     z,Qwen3ForCausalLM.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )N      )lenr   r   )rE   
num_layersrJ   rJ   rK   "get_eagle3_aux_hidden_state_layers9  s   z3Qwen3ForCausalLM.get_eagle3_aux_hidden_state_layersr|   c                 C   s   | j |S r   )r   embed_input_ids)rE   r|   rJ   rJ   rK   r   =  r   z Qwen3ForCausalLM.embed_input_idsrL   r}   r~   c                 C   s   |  ||||}|S r   )r   )rE   r|   rL   r}   r~   rM   rJ   rJ   rK   r\   @  s   zQwen3ForCausalLM.forwardrM   c                 C   s   |  | j|}|S r   )r   r   )rE   rM   logitsrJ   rJ   rK   compute_logitsL  s   zQwen3ForCausalLM.compute_logitsweightsc                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   rj   r   load_weights)rE   r   loaderrJ   rJ   rK   r   S  s
   
zQwen3ForCausalLM.load_weights)NN)r]   r^   r_   packed_modules_mappingembedding_modulesr   re   r7   rz   ra   r   r   rf   rg   r   r   r\   r   r   setr   rh   rJ   rJ   rH   rK   r     s@    

,r   )>__doc__collections.abcr   typingr   rf   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   vllm.loggerr   ;vllm.model_executor.layers.attention.encoder_only_attentionr   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   vllm.sequencer   vllm.transformers_utils.configr   vllm.v1.attention.backendr   
interfacesr   r   r   qwen2r   ru   r   utilsr   r   r    r!   r]   loggerModuler"   ri   ALL_DECODER_LAYER_TYPESr   r   rJ   rJ   rJ   rK   <module>   sL   jK

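

# Minimal usage sketch (assumed entry point, not part of this module): this
# file is not imported directly. vLLM's model registry maps the HF
# architecture string "Qwen3ForCausalLM" to the class above, so something
# like the following drives the forward() / compute_logits() path defined
# here, with load_weights() invoked once at engine startup (the checkpoint
# name is only an example):
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="Qwen/Qwen3-8B")
#     out = llm.generate(["Hello"], SamplingParams(max_tokens=16))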