o
    i3                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 G dd de	j3Z4G dd de	j3Z5G dd de	j3Z6eG dd de	j3Z7G d d! d!e	j3Z8G d"d# d#e8e-e,Z9dS )$z>Inference-only QWen model compatible with HuggingFace weights.    N)Iterable)islice)Any)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)	Attention)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)is_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sX   e Zd ZdZ			ddededededB d	ef
 fd
dZdej	dej	fddZ
  ZS )QWenMLPzMLP for the language component of the Qwen model, which contains a
    MergedColumnParallelLinear merging 2 outputs via silu activation.siluN hidden_sizeintermediate_size
hidden_actquant_configprefixc                    sh   t    t||gd d|| dd| _t||d|| dd| _|dkr.td| dt | _d S )	N   Fz.gate_up_projbiasr&   r'   .c_projr!   zUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   c_proj
ValueErrorr   act_fn)selfr#   r$   r%   r&   r'   	__class__ U/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/qwen.pyr-   5   s(   

zQWenMLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r.   r1   r/   )r2   r7   gate_up_r5   r5   r6   forwardR   s   
zQWenMLP.forward)r!   Nr"   )__name__
__module____qualname____doc__intstrr   r-   torchTensorr<   __classcell__r5   r5   r3   r6   r    1   s"    r    c                       st   e Zd Z				ddedededeeef dB dedB dedB d	ef fd
dZ	de
jde
jde
jfddZ  ZS )QWenAttentionNr"   r#   	num_headsmax_position_embeddingsrope_parameterscache_configr&   r'   c           	         s   t    || _t }|| _| j| dksJ | j| | _|| j | _t|| j| jd|| dd| _t	| j| j |d|| dd| _
| jd | _t| j||d| _t| j| j| j||| d	d
| _d S )Nr   Tz.c_attnr)   Fr+   g      )max_positionrI   .attn)rJ   r&   r'   )r,   r-   r#   r   total_num_headsrG   head_dimr   c_attnr   r/   scalingr   
rotary_embr   attn)	r2   r#   rG   rH   rI   rJ   r&   r'    tensor_model_parallel_world_sizer3   r5   r6   r-   Z   sH   


zQWenAttention.__init__	positionshidden_statesr8   c           
      C   sT   |  |\}}|jddd\}}}| |||\}}| |||}| |\}	}|	S )N   )chunksdim)rO   chunkrQ   rR   r/   )
r2   rT   rU   qkvr;   qkvattn_outputoutputr5   r5   r6   r<      s   zQWenAttention.forward)NNNr"   )r=   r>   r?   rA   dictrB   r   r   r   r-   rC   rD   r<   rE   r5   r5   r3   r6   rF   Y   s6    0rF   c                
       sn   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	dB de
ej	ej	f fddZ  ZS )	QWenBlockNr"   configrJ   r&   r'   c              	      sx   t    t|j|jd| _t|j|j|j|j	||| dd| _
t|j|jd| _t|j|jd || dd| _d S )NepsrL   )rI   rJ   r&   r'   r(   z.mlpr&   r'   )r,   r-   r   r#   layer_norm_epsilonln_1rF   num_attention_headsrH   rI   rR   ln_2r    r$   mlp)r2   rc   rJ   r&   r'   r3   r5   r6   r-      s$   

zQWenBlock.__init__rT   rU   residualr8   c                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rT   rU   )rh   rR   rj   rk   )r2   rT   rU   rl   r5   r5   r6   r<      s   
zQWenBlock.forward)NNr"   )r=   r>   r?   r   r   r   rB   r-   rC   rD   tupler<   rE   r5   r5   r3   r6   rb      s,    rb   c                       s|   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
  ZS )	QWenModelr"   r'   vllm_configr'   c                   s   t    |jj|j |j| _j| _tjj	| _
tj fdd| dd\| _| _| _tj	jd| _tddgj	| _d S )Nc                    s   t  | dS )Nro   )rb   ro   rJ   rc   r&   r5   r6   <lambda>   s    z$QWenModel.__init__.<locals>.<lambda>z.hro   rd   rU   rl   )r,   r-   model_config	hf_configrJ   r&   rc   
vocab_sizer   r#   wter   num_hidden_layersstart_layer	end_layerhr   rg   ln_fr   make_empty_intermediate_tensors)r2   rp   r'   r3   rq   r6   r-      s&   


zQWenModel.__init__	input_idsr8   c                 C   s
   |  |S r9   )rv   r2   r}   r5   r5   r6   embed_input_ids   s   
zQWenModel.embed_input_idsNrT   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )NrU   rl   )rU   rl   )
r
   is_first_rankr   r   rz   rx   ry   is_last_rankr   r{   )	r2   r}   rT   r   r   rU   rl   layerr;   r5   r5   r6   r<      s(   

zQWenModel.forwardr9   )r=   r>   r?   r	   rB   r-   rC   rD   r   r   r<   rE   r5   r5   r3   r6   rn      s    rn   c                	       s   e Zd Zdeddededee ddf fdd	Zd
ej	dej	fddZ
dej	dej	dB fddZdeeeej	f  dee fddZ  ZS )QWenBaseModelr"   )r'   transformer_typerp   r'   r   r8   Nc                   s   t    |jj}|j}|jj}|| _|| _|| _||t|dd| _t	|j
|j|t|dd| _| jjr<| jjj| j_t|j
| _| jj| _d S )Ntransformerrp   r'   lm_headrf   )r,   r-   rs   rt   r&   multimodal_configrc   r   r   r   ru   r#   r   tie_word_embeddingsrv   weightr   logits_processorr|   )r2   rp   r'   r   rc   r&   r   r3   r5   r6   r-   
  s*   

zQWenBaseModel.__init__r}   c                 C   s   | j |S r9   )r   rv   r~   r5   r5   r6   r   (  s   zQWenBaseModel.embed_input_idsrU   c                 C   s   |  | j|}|S r9   )r   r   )r2   rU   logitsr5   r5   r6   compute_logits+  s   zQWenBaseModel.compute_logitsweightsc                 C   s   ddg}t |  }t }|D ]^\}}d|v rq|D ].\}}}	||vr$q|||}|dr4||vr4qt|| r:q|| }
|
j}||
||	  n|drS||vrSqt|| rYq|| }
t|
dt}||
| |	| q|S )N)r.   w2r   )r.   w1r   zrotary_emb.inv_freqz.biasweight_loader)
ra   named_parameterssetreplaceendswithr   r   getattrr   add)r2   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   r5   r5   r6   load_weights2  s:   


zQWenBaseModel.load_weights)r=   r>   r?   rn   r	   rB   typer-   rC   rD   r   r   r   rm   r   r   rE   r5   r5   r3   r6   r   	  s&    
,r   c                       sx   e Zd ZdgddgdZdddedef fd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
  ZS )QWenLMHeadModelrO   r   r   )rO   r.   r"   ro   rp   r'   c                   sF   |j j}t|drddgi}tdt| dt j||d d S )NvisualarchitecturesQwenVLForConditionalGenerationzThe configuration of this model indicates that it supports vision inputs, but you instantiated the text-only version of this model. Please use the vision model by setting `--hf-overrides 'z'`r   )rs   rt   hasattrRuntimeErrorjsondumpsr,   r-   )r2   rp   r'   rc   hf_overridesr3   r5   r6   r-   b  s   

zQWenLMHeadModel.__init__Nr}   rT   r   r   r8   c                 C   s   |  ||||}|S r9   )r   )r2   r}   rT   r   r   rU   r5   r5   r6   r<   o  s   zQWenLMHeadModel.forward)NN)r=   r>   r?   packed_modules_mappingr	   rB   r-   rC   rD   r   r<   rE   r5   r5   r3   r6   r   Y  s(    r   ):r@   r   collections.abcr   	itertoolsr   typingr   rC   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   Moduler    rF   rb   rn   r   r   r5   r5   r5   r6   <module>   s<   (>5<P