o
    پi?T                     @   s  d Z ddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 e9e:Z;G dd deZ<G dd deZ=G dd dej>Z?G dd dej>Z@G dd  d ej>ZAG d!d" d"ej>ZBG d#d$ d$ej>ZCG d%d& d&ej>ZDee8ZEG d'd( d(ej>ZFeFZGdS ))zBInference-only Qwen2-VL model compatible with HuggingFace weights.    N)	lru_cachepartial)IterableListOptionalTupleType	TypedDict)	rearrange)Qwen2VLConfig)Qwen2VLVisionConfig)	QuickGELU)VisionAttention)ColumnParallelLinearRowParallelLinear)LogitsProcessor)PoolerPoolingType)QuantizationConfig)ParallelLMHead)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)
Qwen2Model)WeightsMapper"compute_cu_seqlens_from_grid_numpy)
add_prefixis_npu)get_processorc                   @   $   e Zd ZU ejed< 	 ejed< dS )Qwen2VLImageInputspixel_valuesimage_grid_thwN__name__
__module____qualname__torchTensor__annotations__ r-   r-   N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen2_vl.pyr#   =   s
   
 

r#   c                   @   r"   )Qwen2VLVideoInputspixel_values_videosvideo_grid_thwNr&   r-   r-   r-   r.   r/   J   s
   
 

r/   c                       s\   e Zd Zdeddfdededeej dee	 de
f
 fdd	Zd
ejdejfddZ  ZS )Qwen2VisionMLPN in_featureshidden_features	act_layerquant_configprefixc                    sF   t    t|||td|d| _| | _t|||td|d| _d S )Nfc1r7   r8   fc2)super__init__r   r   r9   actr   r;   )selfr4   r5   r6   r7   r8   	__class__r-   r.   r=   ]   s   
zQwen2VisionMLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r9   r>   r;   )r?   rB   
x_parallel_r-   r-   r.   forwardt   s   
zQwen2VisionMLP.forward)r'   r(   r)   r   intr   nnModuler   r   strr=   r*   r+   rG   __classcell__r-   r-   r@   r.   r2   [   s"    r2   c                       sz   e Zd Zedddfdedededeej deej de	e
 d	ed
df fddZdejdejdejd
ejfddZ  ZS )Qwen2VisionBlockNr3   dim	num_heads	mlp_ratior6   
norm_layerr7   r8   rC   c           	   
      s|   t    |d u rttjdd}||| _||| _t|| }t|||dd|t	d|d| _
t||||t	d|d| _d S )Nư>epsTattn)	embed_dimrO   projection_sizeuse_qkv_parallelflatten_batchr7   r8   mlp)r6   r7   r8   )r<   r=   r   rI   	LayerNormnorm1norm2rH   r   r   rU   r2   rZ   )	r?   rN   rO   rP   r6   rQ   r7   r8   mlp_hidden_dimr@   r-   r.   r=   }   s,   



	zQwen2VisionBlock.__init__rB   
cu_seqlensposition_embeddingsc                 C   sN   |  |}t|d}| j|||d}t|d}|| }|| | | }|S )Nzs b ... -> b s ...r_   r`   zb s ... -> s b ...)r\   r
   rU   rZ   r]   )r?   rB   r_   r`   hidden_statesrU   r-   r-   r.   rG      s   


zQwen2VisionBlock.forward)r'   r(   r)   r   rH   floatr   rI   rJ   r   r   rK   r=   r*   r+   rG   rL   r-   r-   r@   r.   rM   {   s>    	"rM   c                       sR   e Zd Z				ddedededed	d
f
 fddZdejd	ejfddZ  ZS )Qwen2VisionPatchEmbed           
patch_sizetemporal_patch_sizein_chansrV   rC   Nc                    s@   t    || _|| _|| _|||g}tj||||dd| _d S )NF)kernel_sizestridebias)r<   r=   ri   rj   rV   rI   Conv3dproj)r?   ri   rj   rk   rV   rl   r@   r-   r.   r=      s   


zQwen2VisionPatchEmbed.__init__rB   c                 C   s:   |j \}}||d| j| j| j}| ||| j}|S N)shapeviewrj   ri   rp   rV   )r?   rB   LCr-   r-   r.   rG      s   
zQwen2VisionPatchEmbed.forward)re   rf   rg   rh   )	r'   r(   r)   rH   r=   r*   r+   rG   rL   r-   r-   r@   r.   rd      s"    rd   c                       sd   e Zd Z				ddededeej dedee d	e	d
df fddZ
dejd
ejfddZ  ZS )Qwen2VisionPatchMergerNrf   r3   d_modelcontext_dimrQ   spatial_merge_sizer7   r8   rC   c                    s   t    ||d  | _|d u rttjdd}||| _tt| j| jd|t	d|dt
 t| j|d|t	d|dg| _d S )Nrf   rR   rS   Tzmlp.0)rn   r7   r8   zmlp.2)r<   r=   hidden_sizer   rI   r[   ln_q
ModuleListr   r   GELUr   rZ   )r?   rx   ry   rQ   rz   r7   r8   r@   r-   r.   r=      s.   
	

zQwen2VisionPatchMerger.__init__rB   c                 C   sH   |  |}|d| j}| j\}}}||\}}||}||\}}|S rq   )r|   rt   r{   rZ   )r?   rB   mlp_fc1mlp_actmlp_fc2rE   rF   outr-   r-   r.   rG      s   
zQwen2VisionPatchMerger.forward)Nrf   Nr3   )r'   r(   r)   rH   r   rI   rJ   r   r   rK   r=   r*   r+   rG   rL   r-   r-   r@   r.   rw      s*    "rw   c                       sR   e Zd Zddededdf fddZdeddfd	d
ZdedejfddZ	  Z
S )Qwen2VisionRotaryEmbedding     @rN   thetarC   Nc                    sV   t    || _|| _d|tjd|dtjd|   }| jd|dd d| _d | _	d S )N      ?r   rf   )dtypeinv_freqF)
persistent)
r<   r=   rN   r   r*   arangerc   register_buffer_seq_len_cached_freqs_cached)r?   rN   r   r   r@   r-   r.   r=      s   
 
z#Qwen2VisionRotaryEmbedding.__init__seqlenc              	   C   sz   || j kr;|d9 }|| _ d| jtjd| jdtj| jjd| j   | _tj|| jj| jjd}t	|| j}|| _
d S d S )Nrf   r   r   )r   devicer   r   )r   r   r*   r   rN   rc   r   r   r   outerr   )r?   r   seqfreqsr-   r-   r.   update_freqs_cache  s$   
	
z-Qwen2VisionRotaryEmbedding.update_freqs_cachec                 C   s   |  | | jd | S rD   )r   r   )r?   r   r-   r-   r.   rG     s   
z"Qwen2VisionRotaryEmbedding.forward)r   )r'   r(   r)   rH   rc   r=   r   r*   r+   rG   rL   r-   r-   r@   r.   r      s    	r   c                       s   e Zd Z			ddededee deddf
 fd	d
Ze	de
jfddZe	de
jfddZde
jde
jfddZde
jde
jde
jfddZ  ZS )Qwen2VisionTransformerrR   Nr3   vision_confignorm_epsr7   r8   rC   c                    s   t    |j}|j}|j}|j}|j}	|j |j}
|j	|j
|| _t||| d| _ttj|d  }t|d | _t fddt|
D | _t|	 tdd| _d S )N)ri   rj   rk   rV   rS   rf   c                    s,   g | ]}t  td | dqS )zblocks.)rN   rO   rP   rQ   r7   r8   )rM   r   ).0irV   rP   rQ   rO   r8   r7   r-   r.   
<listcomp>@  s    	z3Qwen2VisionTransformer.__init__.<locals>.<listcomp>merger)rx   ry   rQ   r7   r8   )r<   r=   ri   rj   rz   rk   r{   rV   depthrO   rP   rd   patch_embedr   rI   r[   r   rotary_pos_embr}   rangeblocksrw   r   r   )r?   r   r   r7   r8   ri   rj   rz   rk   r{   r   head_dimr@   r   r.   r=      s@   
	zQwen2VisionTransformer.__init__c                 C   s   | j jjjS rD   )r   rp   weightr   r?   r-   r-   r.   r   T     zQwen2VisionTransformer.dtypec                 C   s   | j d jjjjS )Nr   )r   rZ   r;   r   r   r   r-   r-   r.   r   X  s   zQwen2VisionTransformer.devicegrid_thwc                 C   s  g }t |dD ]c}||  \}}}t|dd|}t|d|d}||| j | j|| j | j	dddd
 }||| j | j|| j | j	dddd
 }|tj||gdd|d q	tj|dd}|d d dd f  }	| |	}
|
| 
d}|S )Nr      rr   rf   rg   rN   )r   sizetolistr*   r   	unsqueezeexpandreshaperz   permuteflattenappendstackrepeatcatmaxr   )r?   r   pos_idsr   thwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   r-   r-   r.   rot_pos_emb\  s:   "

z"Qwen2VisionTransformer.rot_pos_embrB   c                 C   s   |j | j| jd}| |}| |}tj||fdd}| | f}t	|}t
 r0| d}|d}| jD ]	}||||d}q8| |}|S )Nr   rr   r   cpur   ra   )tor   r   r   r   r*   r   cossinr   r    r   r   r   )r?   rB   r   r   embr`   r_   blkr-   r-   r.   rG   }  s   





zQwen2VisionTransformer.forward)rR   Nr3   )r'   r(   r)   r   rc   r   r   rK   r=   propertyr*   r   r   r+   r   rG   rL   r-   r-   r@   r.   r     s6    4!r   c                	       s  e Zd Zg dZddddddZedd	id
ddd
ddZ		d1dedee	 de
ddf fddZdee defddZdee dejfddZdee dejfddZd edejfd!d"Zd#d$ Zd%e
defd&d'Z		(d2dejd)ejd*ed+efd,d-Zd.eee
ejf  fd/d0Z  ZS )3Qwen2VLForConditionalGeneration)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)qkv_projr   )r   r   )r   rf   )gate_up_projr   )r   r   )q_projk_projv_proj	gate_projup_projzattn.qkvzattn.qkv_projzlanguage_model.model.zvisual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zlm_head.zmodel.)orig_to_new_substrorig_to_new_prefixNr3   configr7   r8   rC   c                    s   t    || _t|jt|dd|td|d| _t||td|d| _	|j
r-| j	j| _nt|j|j|td|d| _d	| jjv | _t|| _ttjd
d| _d S )Nrms_norm_epsrR   visual)r   r7   r8   model)r8   lm_headr:   mrope_sectionT)pooling_type	normalize)r<   r=   r   r   r   getattrr   r   r   r   tie_word_embeddingsembed_tokensr   r   
vocab_sizer{   rope_scalingis_mrope_enabledr   logits_processorr   r   LASTpooler)r?   r   r7   r8   r@   r-   r.   r=     s,   

	
z(Qwen2VLForConditionalGeneration.__init__	input_ids	mm_inputsc                 C   s   t  }|||S rD   )r   pad_input_tokens)r?   r   r   patternr-   r-   r.   pad_input_ids  s   z-Qwen2VLForConditionalGeneration.pad_input_idsitemsc                 C   |   t jdd |D dd| jj}t jdd |D dd}| dks)J | | dks5J | | j||d}|S )Nc                 S      g | ]}|j qS r-   featurer   itemr-   r-   r.   r         zEQwen2VLForConditionalGeneration.get_image_feature.<locals>.<listcomp>r   r   c                 S   r   r-   )r%   r   r-   r-   r.   r     r   rf   r   r*   r   typer   r   concatrN   )r?   r   r$   r%   image_embedsr-   r-   r.   get_image_feature     z1Qwen2VLForConditionalGeneration.get_image_featurec                 C   r   )Nc                 S   r   r-   r   r   r-   r-   r.   r     r   zEQwen2VLForConditionalGeneration.get_video_feature.<locals>.<listcomp>r   r   c                 S   r   r-   )r1   r   r-   r-   r.   r     r   rf   r   r   )r?   r   r$   r1   video_embedsr-   r-   r.   get_video_feature  r   z1Qwen2VLForConditionalGeneration.get_video_featurevideo_inputc                 C   s(   |d  | jj}| j||d d}|S )Nr0   r1   r   )r   r   r   )r?   r   r0   r   r-   r-   r.   _process_video_input  s
   z4Qwen2VLForConditionalGeneration._process_video_inputc                 C   s   | j jS rD   )r   r   r   r-   r-   r.   get_input_embeddings  s   z4Qwen2VLForConditionalGeneration.get_input_embeddingsmodule_namec                 C   s   | d S )Nr   )
startswith)r?   r   r-   r-   r.   should_apply_lora  r   z1Qwen2VLForConditionalGeneration.should_apply_loraF	positionsforward_batchget_embeddingc                 C   s   | j r|j}|j s'| r'| j r'|jdkr|ddks'J d|  t||| j| |d}|r9| 	||S | 
||| j|S )a  Run forward pass for Qwen2-VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
                (Use input_metadata.mrope_positions to replace it)
        rf   r   rg   zMmultimodal section rotary embedding requires (3, seq_len) positions, but got )r   r   language_modelmultimodal_modelr   )r   mrope_positionsforward_mode	is_decodecontains_image_inputsndimr   r   r   r   r   r   )r?   r   r   r   input_embedsr   rb   r-   r-   r.   rG     s0   
z'Qwen2VLForConditionalGeneration.forwardweightsc              	   C   s  g d}t | jdd}|D ]r\}}d|v rq| jjr d|v r q|D ](\}}}||vr,q"|||}|dr<||vr<q"|| }	|	j}
|
|	||  n5d|v rU|dd	}z|dra||vraW q|| }	W n tyt   t|	   w t
|	d
t}
|
|	| qd S )N))r   r   q)r   r   k)r   r   v)r   r   r   )r   r   r   F)remove_duplicatezrotary_emb.inv_freqzlm_head.weightz.biasr   z	attn.qkv.zattn.qkv_proj.weight_loader)dictnamed_parametersr   r   replaceendswithr  KeyErrorprintkeysr   r   )r?   r  stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idparamr  r-   r-   r.   load_weights<  s>   
z,Qwen2VLForConditionalGeneration.load_weights)Nr3   )NF) r'   r(   r)   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr   hf_to_sglang_mapperr   r   r   rK   r=   r   rH   r   r   r   r*   r+   r   r   r/   r   r   boolr   r   rG   r   r   r  rL   r-   r-   r@   r.   r     s\    
$	
$0r   )H__doc__logging	functoolsr   r   typingr   r   r   r   r   r	   r*   torch.nnrI   einopsr
   transformersr   3transformers.models.qwen2_vl.configuration_qwen2_vlr   sglang.srt.layers.activationr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.qwen2r   sglang.srt.models.utilsr   r   sglang.srt.utilsr   r    &sglang.srt.utils.hf_transformers_utilsr!   	getLoggerr'   loggerr#   r/   rJ   r2   rM   rd   rw   r   r   cached_get_processorr   
EntryClassr-   r-   r-   r.   <module>   sJ    
 7/#| N