o
    -i W                  
   @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= G dd dej>Z?G dd dej>Z@G dd  d ej>ZA		d.d!ejBd"ejBd#e,dB d$ejBdB fd%d&ZCedd'ddd(eCd)G d*d+ d+ej>ZDG d,d- d-ej>e4e5e3ZEdS )/z?Inference-only Qwen2 model compatible with HuggingFace weights.    )Iterable)islice)AnyN)nn)Qwen2Config)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)EncoderOnlyAttention)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)is_interleavedset_default_rope_theta)AttentionType   )SupportsEagle3SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sH   e Zd Z		ddededededB deddf fd	d
Zdd Z  ZS )Qwen2MLPN hidden_sizeintermediate_size
hidden_actquant_configprefixreturnc                    sh   t    t||gd d|| dd| _t||d|| dd| _|dkr.td| dt | _d S )	N   Fz.gate_up_projbiasr.   r/   z
.down_projsiluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr+   r,   r-   r.   r/   	__class__ ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/qwen2.pyr6   N   s(   

zQwen2MLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r7   r:   r8   )r;   xgate_up_r>   r>   r?   forwardk   s   
zQwen2MLP.forward)Nr*   )	__name__
__module____qualname__intstrr   r6   rD   __classcell__r>   r>   r<   r?   r)   M   s"    r)   c                       s   e Zd Zddddejdddfdededed	eeef d
ede	dB de
dB dededeeef dB dededdf fddZdejdejdejfddZ  ZS )Qwen2Attentioni   Nr*   Fgư>r+   	num_headsnum_kv_headsrope_parametersmax_positioncache_configr.   r/   	attn_typedual_chunk_attention_configqk_normrms_norm_epsr0   c              	      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|
| _|| _t|| j	| j| jd|| dd| _t| j| j	 |d|| dd| _| jrt| j	|d	| _t| j	|d	| _t| j	|||
d
| _|	tjkrtnt}|| j| j	| jf| j|||	| dd|
rt||
dni | _d S )Nr   r   g      Tz	.qkv_projr2   Fz.o_projeps)rO   rN   rR   z.attn)rM   rP   r.   rQ   r/   )	layer_idxrR   )r5   r6   r+   r   total_num_headsrL   total_num_kv_headsmaxrM   head_dimq_sizekv_sizescalingrR   rS   r   qkv_projr   o_projr   q_normk_normr   
rotary_embr   ENCODER_ONLYr   r   r$   attn)r;   r+   rL   rM   rN   rO   rP   r.   r/   rQ   rR   rS   rT   tp_sizeattn_clsr<   r>   r?   r6   s   s   

	
	
zQwen2Attention.__init__	positionshidden_statesc                 C   s   |  |\}}|j| j| j| jgdd\}}}| jrI|jd }||| j| j}||| j	| j}| 
|}| |}||| j}||| j}| |||\}}| |||}	| |	\}
}|
S )N)dimr   )r_   splitr\   r]   rS   shapeviewrL   r[   rM   ra   rb   rc   re   r`   )r;   rh   ri   qkvrC   qkvtotal_tokensattn_outputoutputr>   r>   r?   rD      s    


zQwen2Attention.forward)rE   rF   rG   r   DECODERrH   dictrI   r   r	   r   boolfloatr6   torchTensorrD   rJ   r>   r>   r<   r?   rK   r   sV    
	
XrK   c                       sr   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )Qwen2DecoderLayerNr*   configrP   r.   r/   r0   c                    s   t    |j| _t|dd t|dd }t|ddrtj}ntj}t|dd}t| j|j	|j
|j|||j| d||||jd	| _t| j|j|j|| d
d| _t|j|jd| _t|j|jd| _d S )Ni@B )default_thetarR   	is_causalTrS   Fz
.self_attn)r+   rL   rO   rM   rP   r.   rN   r/   rQ   rR   rS   rT   z.mlp)r+   r,   r-   r.   r/   rU   )r5   r6   r+   r   getattrr   rv   rd   rK   num_attention_headsmax_position_embeddingsnum_key_value_headsrN   rT   	self_attnr)   r,   r-   mlpr   input_layernormpost_attention_layernorm)r;   r}   rP   r.   r/   rR   rQ   rS   r<   r>   r?   r6      sF   
zQwen2DecoderLayer.__init__rh   ri   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rh   ri   )r   r   r   r   )r;   rh   ri   r   r>   r>   r?   rD     s   
zQwen2DecoderLayer.forward)NNr*   )rE   rF   rG   r   r	   r   rI   r6   rz   r{   tuplerD   rJ   r>   r>   r<   r?   r|      s0    4r|   	input_idsrh   intermediate_tensorsinputs_embedsc                 C   s   t |  d | d k |dur$t |  d |d  d k |dur7t |  d | d k |durR|durTt | d |d  d k dS dS dS )zShape invariants for Qwen2Model Model, those are translated to
    runtime assertions for unbacked dynamic shapes and are compiled away for
    backedr   rj   Nri   r   )rz   _checksizer   rh   r   r   r>   r>   r?   qwen_2_model_invariants5  s   r   rj   r   )dynamic_arg_dimsshape_invariantsc                       s   e Zd Zdeddededeej f fddZ	de
jd	e
jfd
dZ		dde
jde
jdedB de
jdB d	e
jeB f
ddZdeeee
jf  d	ee fddZ  ZS )
Qwen2Modelr*   )r/   decoder_layer_typevllm_configr/   r   c                   s  t    |jj |j |jt|jjr'j	j
ks'J dj	j
| _| _j| _t js<jrKt jrKtjj| dd| _nt | _tj
 fdd| dd\| _| _| _tdd	gj| _t jr~tjjd
| _nt | _tt df  | _!d S )NzSliding window for some but all layers is not supported. This model uses sliding window but `max_window_layers` = {} is less than `num_hidden_layers` = {}. Please open an issue to discuss this feature.z.embed_tokensr.   r/   c                    s    | dS )N)r}   rP   r.   r/   r>   r/   rP   r}   r   r.   r>   r?   <lambda>  s    z%Qwen2Model.__init__.<locals>.<lambda>z.layersr   ri   r   rU   .)"r5   r6   model_config	hf_configget_text_configrP   r.   r   hf_text_configmax_window_layersnum_hidden_layersformatr}   
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr   r+   embed_tokensr#   r'   start_layer	end_layerlayersr&   make_empty_intermediate_tensorsr   rT   normr   rH   aux_hidden_state_layers)r;   r   r/   r   r<   r   r?   r6   a  sN   



zQwen2Model.__init__r   r0   c                 C   s
   |  |S r@   )r   r;   r   r>   r>   r?   embed_input_ids  s   
zQwen2Model.embed_input_idsNrh   r   r   c                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }g }tt| j| j| jD ]\}}	|| jv r>|	||  |	|||\}}q.t  j
sRt||dS | ||\}}
t|dkrd||fS |S )Nri   r   )ri   r   r   )r   r   r   	enumerater   r   r   r   r   appendr   r   r   len)r;   r   rh   r   r   ri   r   aux_hidden_statesidxlayerrC   r>   r>   r?   rD     s.   

zQwen2Model.forwardweightsc                 C   s  g d}t | jdd}t }|D ]\}}d|v rq| jd urH| j| }rH|| }t|dt}	| dkr9|n|d }|	|| || q|D ]J\}
}}||vrTqJ|	||
}|
drd||vrdqJt|| rjqJ|
dryt||}|d u ryqJ|| }t|dt}	|	tkr|	|| n|	|||  n.|
dr||vrqt||}|d u rqt|| rq||vrq|| }t|dt}	|	|| || q|S )	N))r_   q_projrp   )r_   k_projrq   )r_   v_projrr   )r7   	gate_projr   )r7   up_projr   F)remove_duplicatezrotary_emb.inv_freqweight_loaderr   z.biasscale)rw   named_parameterssetr.   get_cache_scaler   r   rk   addreplaceendswithr%   r   )r;   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_idr>   r>   r?   load_weights  s`   








zQwen2Model.load_weightsNN)rE   rF   rG   r|   r
   rI   typer   Moduler6   rz   r{   r   r   rD   r   r   r   r   rJ   r>   r>   r<   r?   r   U  s2    >
,&r   c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ	de
edf ddfddZde
edf fddZ		d!dejdejdedB dejdB dejeB f
ddZdejdejdB fddZdee
eejf  dee fdd Z  ZS )"Qwen2ForCausalLM)r   r   r   r   r   )r_   r7   r*   r   r   r/   c                   s   t    |jj }|j}|| _|| _t|t|dd| _	t
 jr:|jr+| j	j| _nt|j|j|t|dd| _nt | _t|j| _| j	j| _d S )Nmodel)r   r/   lm_headr   )r5   r6   r   r   r   r.   r}   r   r(   r   r   r   r   r   r   r   r   r+   r#   r   logits_processorr   )r;   r   r/   r}   r.   r<   r>   r?   r6     s*   


zQwen2ForCausalLM.__init__r   r0   c                 C   s   | j |S r@   )r   r   r   r>   r>   r?   r   6     z Qwen2ForCausalLM.embed_input_idsr   .Nc                 C   s   || j _d S r@   )r   r   )r;   r   r>   r>   r?   set_aux_hidden_state_layers9  r   z,Qwen2ForCausalLM.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )Nr1      )r   r   r   )r;   
num_layersr>   r>   r?   "get_eagle3_aux_hidden_state_layers<  s   z3Qwen2ForCausalLM.get_eagle3_aux_hidden_state_layersrh   r   r   c                 C   s   |  ||||}|S r@   )r   )r;   r   rh   r   r   ri   r>   r>   r?   rD   @  s   zQwen2ForCausalLM.forwardri   c                 C   s   |  | j|}|S r@   )r   r   )r;   ri   logitsr>   r>   r?   compute_logitsL  s   zQwen2ForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r"   r}   r   r   )r;   r   loaderr>   r>   r?   r   S  s
   
zQwen2ForCausalLM.load_weightsr   )rE   rF   rG   packed_modules_mappingr
   rI   r6   rz   r{   r   r   rH   r   r   r   rD   r   r   r   r   rJ   r>   r>   r<   r?   r   
  s:    

,r   r   )F__doc__collections.abcr   	itertoolsr   typingr   rz   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   %vllm.model_executor.layers.activationr   ;vllm.model_executor.layers.attention.encoder_only_attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.transformers_utils.configr   r   vllm.v1.attention.backendr   
interfacesr   r    r!   utilsr"   r#   r$   r%   r&   r'   r(   r   r)   rK   r|   r{   r   r   r   r>   r>   r>   r?   <module>   sd   $%wO
  +