o
    
۾i!                     @   s  d dl mZmZmZmZmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZCmDZD d dlEmFZF d dlGmHZHmIZI d dlJmKZK d dlLmMZMmNZNmOZO d dlPmQZQ d dlRmSZSmTZT d dlUmVZV dd lWmXZX eYeZZ[G d!d" d"ej\Z]G d#d$ d$ej\Z^G d%d& d&ej\Z_G d'd( d(ej\Z`G d)d* d*ej\ZaG d+d, d,ej\ZbG d-d. d.ej\ZcG d/d0 d0ej\ZdG d1d2 d2e?ZeG d3d4 d4eSZfG d5d6 d6eSZgG d7d8 d8eSZhG d9d: d:eSZiG d;d< d<e>ZjG d=d> d>e=ZkeFjlejeeekd?G d@dA dAej\e8e6e9e7ZmdBdC ZndDdE ZoedFdG						dWdHepdB dIeqereq B dB dJeqereq B dB dKepdB dLeqdB dMedN dOesfdPdQZtejufdRdSdKepdLeqdHepdIeqereq B dJeqereq B dTejvdOdSfdUdVZwdS )X    )CallableIterableIteratorMappingSequence)	lru_cachepartial)	AnnotatedLiteralOptionalN)	rearrange)v2)logging)
VllmConfig)parallel_state)utils)_ACTIVATION_REGISTRY)MMEncoderAttention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)
GPTQConfig)GPTQMarlinConfig)ApplyRotaryEmb)default_weight_loader)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)MultiModelKeys)Qwen2_5_VLDummyInputsBuilderQwen2_5_VLMultiModalProcessorQwen2_5_VLProcessingInfo)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixMULTIMODAL_REGISTRY)MultiModalFeatureSpecMultiModalKwargsItems)MultiModalDataItems)PromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )get_vit_attn_backendc                       sh   e Zd Z		ddededededB deddf fd	d
Zdejdejdejdejdejf
ddZ	  Z
S )OpenPanguVisionAttentionN 	embed_dim	num_headsprojection_sizequant_configprefixreturnc              	      s   t    t||| _t | _t | _	t|| j| _
t|| j||d|| dd| _t|||| dd| _t| j
| j| jd | dd| _tdd	| _d S )
NTz.qkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsbiasr>   r?   z.proj)
input_sizeoutput_sizer>   r?   g      .attn)r<   rB   scaler?   )enforce_enable)super__init__
dist_utilsdividehidden_size_per_attention_headr   $get_tensor_model_parallel_world_sizetp_sizeget_tensor_model_parallel_ranktp_rank!num_attention_heads_per_partitionr   qkvr   projr   attnr   apply_rotary_emb)selfr;   r<   r=   r>   r?   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/openpangu_vl.pyrL   [   s>   


	z!OpenPanguVisionAttention.__init__x
cu_seqlenscossinc                    s   |  \}} |\}}|d ur|| }|jddd\}}	}
 fdd||	|
fD \}}	}
tj||	gdd} |||}tj|ddd\}}	|dd  |d d   } j||	|
||d	}t|d
dd	 } 
|\}}|d urx|| }|S )N   r7   dimc                 3   s&    | ]}t |d  jdd V  qdS )zs (b n d) -> b s n dr7   )dbN)r   rO   
contiguous).0r^   rY   r\   r]   	<genexpr>   s    

z3OpenPanguVisionAttention.forward.<locals>.<genexpr>r      )querykeyvaluer_   
max_seqlenzb s h d -> s (b h d))rf   )sizerU   chunktorchcatrX   maxrW   r   rg   rV   )rY   r^   r_   r`   ra   
seq_length_rE   qkv	qk_concat
qk_rotatedrp   context_layeroutputr\   ri   r]   forward   s8   
z OpenPanguVisionAttention.forward)Nr:   )__name__
__module____qualname__intr   strrL   rs   Tensorr   __classcell__r\   r\   rZ   r]   r9   Z   s6    )r9   c                       sf   e Zd Zdejdddfdedededeej	gej	f de
dB d	ef fd
dZdej	fddZ  ZS )OpenPanguVisionMLPFNr:   in_featureshidden_featuresrE   act_fnr>   r?   c           	         s   t    |j| _| jdkr3t }|| dkr"|| d | | }t||gd ||| dd| _nt||||| dd| _t	||||| d	d| _
|| _d S )
Nsilur   r7   rk   z.gate_up_proj)rF   output_sizesrE   r>   r?   z.up_proj)rE   r>   r?   z
.down_proj)rK   rL   
hidden_actr   rP   r   gate_up_projr   up_projr   	down_projr   )	rY   r   r   rE   r   vision_configr>   r?   rQ   rZ   r\   r]   rL      s8   




zOpenPanguVisionMLP.__init__r^   c                 C   sD   | j dkr| |\}}n| |\}}| |}| |\}}|S )Nr   )r   r   r   r   r   )rY   r^   rw   r\   r\   r]   r      s   

zOpenPanguVisionMLP.forward)r   r   r   Fr   r   boolr   rs   r   r   r   rL   r   r   r\   r\   rZ   r]   r      s(    )r   c                       s   e Zd Zejddddfdedededeejgejf deege	j
f dB dedB d	ed
df fddZdejdejdejdejd
ejf
ddZ  ZS )OpenPanguVisionBlockNr:   rd   r<   mlp_hidden_dimr   
norm_layerr>   r?   r@   c	           	   	      sp   t    |d u rttjdd}||| _||| _t||||| dd| _t	|||d||| dd| _
d S )Nư>epsrH   )r;   r<   r=   r>   r?   Tz.mlp)r   rE   r   r>   r?   )rK   rL   r   nn	LayerNormnorm1norm2r9   rW   r   mlp)	rY   rd   r<   r   r   r   r   r>   r?   rZ   r\   r]   rL      s*   


zOpenPanguVisionBlock.__init__hidden_statesr_   r`   ra   c                 C   s4   || j | ||||d }|| | | }|S )Nr_   r`   ra   )rW   r   r   r   )rY   r   r_   r`   ra   r\   r\   r]   r     s
   zOpenPanguVisionBlock.forward)r   r   r   r   r   r   r   rs   r   r   Moduler   r   rL   r   r   r\   r\   rZ   r]   r      sD    	
!r   c                       sR   e Zd Zddededdf fddZdeddfd	d
ZdedejfddZ	  Z
S )OpenPanguVisionRotaryEmbedding     @rd   thetar@   Nc                    s<   t    d|tjd|dtjd|   | _d| _d | _d S )N      ?r   rk   dtype)rK   rL   rs   arangefloatinv_freq_seq_len_cached_freqs_cached)rY   rd   r   rZ   r\   r]   rL     s   

z'OpenPanguVisionRotaryEmbedding.__init__seqlenc                 C   sL   || j kr$|d9 }|| _ tj|| jj| jjd}t|| j}|| _d S d S )Nrk   devicer   )r   rs   r   r   r   r   outerr   )rY   r   seqfreqsr\   r\   r]   update_freqs_cache  s   

z1OpenPanguVisionRotaryEmbedding.update_freqs_cachec                 C   s(   |  | | jd ur| jd | S | jS N)r   r   )rY   r   r\   r\   r]   r   #  s   

z&OpenPanguVisionRotaryEmbedding.forward)r   )r   r   r   r   r   rL   r   rs   r   r   r   r\   r\   rZ   r]   r     s    
r   c                       sR   e Zd Z				ddedededed	d
f
 fddZdejd	ejfddZ  ZS )OpenPanguVisionPatchEmbed   rk   rb     
patch_sizetemporal_patch_sizein_channelsrA   r@   Nc                    sX   t    || _|| _|| _| j| j | | j | _|||f}tj||||dd| _d S )NF)kernel_sizestriderE   )	rK   rL   r   r   rA   rF   r   Conv3drV   )rY   r   r   r   rA   r   rZ   r\   r]   rL   -  s   

z"OpenPanguVisionPatchEmbed.__init__r^   c                 C   sr   |j d | jkr&tj|d| j| j |d| j| j gddd| j}|| jjj	
| jddd}|S )Nrl   rc   r   r7   )shaperF   rs   rt   reshaper   matmulrV   weightdataviewrA   	transposerY   r^   r\   r\   r]   r   E  s   
"z!OpenPanguVisionPatchEmbed.forward)r   rk   rb   r   )	r   r   r   r   rL   rs   r   r   r   r\   r\   rZ   r]   r   ,  s"    r   c                       sn   e Zd Z				ddededeegejf dB dededB d	ed
df fddZ	de
jd
e
jfddZ  ZS )OpenPanguVisionPatchMergerNrk   r:   d_modelcontext_dimr   spatial_merge_sizer>   r?   r@   c                    s   t    |d u rttjdd}||d  | _||| _tt| j| jd|| dddt	 t
| j|d|| ddd| _d S )	Nr   r   rk   Tz.mlp.0F)rE   r>   r?   return_biasz.mlp.2)rK   rL   r   r   r   rA   ln_q
Sequentialr   GELUr   r   )rY   r   r   r   r   r>   r?   rZ   r\   r]   rL   S  s0   
	

z#OpenPanguVisionPatchMerger.__init__r^   c                 C   s   |  | |d| jS )Nrl   )r   r   r   rA   r   r\   r\   r]   r   u  s   z"OpenPanguVisionPatchMerger.forward)Nrk   Nr:   )r   r   r   r   r   r   r   r   r   rL   rs   r   r   r   r\   r\   rZ   r]   r   R  s*    "r   c                	       s   e Zd Z				ddededB deddf fd	d
Zedej	fddZ	edej
fddZ
dd ZdejdejfddZdd ZdejdejdejfddZdee fddZ  ZS )OpenPanguVisionTransformerr   Nr:   Fnorm_epsr>   r?   r@   c           	         s  t    j_j_j_j_j_j_jd _t	t
|d |_j_j_jj }t|t d_jtjhvrStdj dt|d _tjjjjd_t fddtjD _t ! _"t # _$t%&jj_'t(d	d
dg_)fddj)D _*j*d d d
 _*dd tt+j*D _)j*_,t
j|d_-t fddtt+j)D _.t/||_0d S )Nrk   r   )rB   r   zPangu-VL does not support z backend now.)r   r   r   rA   c                    s:   g | ]}t jjjtj   d | dqS )z.blocks.)rd   r<   r   r   r   r   r>   r?   )r   rA   r<   intermediate_sizer   r   )rh   	layer_idxr   r?   r>   rY   r   r\   r]   
<listcomp>  s    z7OpenPanguVisionTransformer.__init__.<locals>.<listcomp>mm_unit_vision_select_layerrl   c                    s   g | ]} j | qS r\   )depthrh   i)r   r\   r]   r     s    c                 S   s   g | ]}d |d  qS )rl   r7   r\   r   r\   r\   r]   r     s    c                    s0   g | ]}t jj j d | dqS )z.merger.)r   r   r   r   r>   r?   )r   out_hidden_sizerA   r   r   r   r\   r]   r     s    	)1rK   rL   rA   r<   window_sizer   r   fullatt_block_indexesspatial_merge_unitr   r   interleavedr   r   r8   rs   get_default_dtypeattn_backendr6   
FLASH_ATTNRuntimeErrorr   rotary_pos_embr   r   r   patch_embedr   
ModuleListranger   blocksr   rP   rQ   rR   rS   rM   rN   rO   getattrselect_layerselect_indexlentake_indicesfinal_layernormmergerProjectionSinglevision_projection)	rY   r   r   rA   r   r>   r?   r   head_dimrZ   r   r]   rL   z  sl   




	z#OpenPanguVisionTransformer.__init__c                 C      | j jjjS r   )r   rV   r   r   ri   r\   r\   r]   r        z OpenPanguVisionTransformer.dtypec                 C   r   r   )r   rV   r   r   ri   r\   r\   r]   r     r   z!OpenPanguVisionTransformer.devicec                 C   s   |  }| }||fS r   )r`   ra   )rY   r   r`   ra   r\   r\   r]   cal_cos_sin  s   z&OpenPanguVisionTransformer.cal_cos_singrid_thwc                 C   s  g }|D ]]\}}}t |dd|}t |d|d}||| j | j|| j | jdddd }||| j | j|| j | jdddd }|t j	||gdd
|d qt j|dd}|d d dd f  }| |}	|	| d}
|
S )Nr7   rl   r   rk   rb   rc   )rs   r   	unsqueezeexpandr   r   permuteflattenappendstackrepeatrt   ru   r   )rY   r   pos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr   r\   r\   r]   rot_pos_emb  s8   "

z&OpenPanguVisionTransformer.rot_pos_embc                 C   sh  g }dg}d}| j | j | j }|D ]\}}}|| j }	|| j }
t||	 |
 ||	|
}||	|  }||
|  }|	| | }|
| | }t|d|d|fdd}||||||}|ddddd||| ||}|dk	ddgd}|d}||dk }|
||  |d| j |d  }||  |||	 |
  7 }qtj|dd	}||fS )
Nr   constantir7   rb   rk      rl   rc   )r   r   r   rs   r   r   r   padr   sumr   cumsumr   extendtolistitemrt   )rY   r   window_indexcu_window_seqlenswindow_index_idvit_merger_window_sizegrid_tgrid_hgrid_w
llm_grid_h
llm_grid_windexpad_hpad_wnum_windows_hnum_windows_windex_paddedseqlens	index_newcu_seqlens_tmpr\   r\   r]   get_window_index  sN   


z+OpenPanguVisionTransformer.get_window_indexr^   c                 C   s  t |d d df |d d df  |d d df t j|j}t j|dt jd}t|ddd}| |}| 	|}| 
|\}}t j||jt j rQ|jnt jd}t |}| \}}||| j | jd}||d d d d f }||d}||| j | jd}||d d d d f }||d}| ||j\}	}
g }t| jD ]%\}}|| jv r|}n|}||||	|
d	}|| jv r| |}|| qg }t| jD ]\}}|| j| ||  qt|}t |}||d d f }| |}|S )
Nr7   rk   r   )rd   r   )r7   r   r  r   rl   r   )rs   repeat_interleavetoint32r   r	  r   r  r   r  r  tensorjit
is_tracingr   unique_consecutiverq   r   r   r   	enumerater   r   r   r   r   r   r   r  argsortr   )rY   r^   r   r_   r   r  r  seq_lenrw   r`   ra   intermediates	layer_numblkcu_seqlens_nowln_hsimage_embeddings_listidxslreverse_indicesr\   r\   r]   r   .  sV   2








z"OpenPanguVisionTransformer.forwardc                    s   dt dtjdtjf fdd}g d} jdkr|dd	g t jd
d}t }|D ]C\}} jdkr:|||}|D ]\}}	}
|	|vrFq<||	|}|| }|j	}||||
  n|| }t
|dt}||| || q,|S )Nnamer   r@   c                    s   d| v sd| v rd| d}}nd| v rd| d}}n|S |  j }|dkr,|S dg|jd  }|||d  d d < tj||ddd	S )
N	gate_projr   r   r   r7   rl   rk   r  )modero   )rq   rQ   ndimr   r  )r3  r   rd   rq   pad_lenr  ri   r\   r]   _padding_weighti  s   z@OpenPanguVisionTransformer.load_weights.<locals>._padding_weight))	attn.qkv.zattn.q.rx   )r9  zattn.k.ry   )r9  zattn.v.rz   r   )r   r4  r   )r   r   r7   F)remove_duplicateweight_loader)r   rs   r   r   r
  dictnamed_parameterssetreplacer;  r   r   add)rY   weightsr8  stacked_params_mappingparams_dictloaded_paramsr3  loaded_weight
param_nameweight_nameshard_idparamr;  r\   ri   r]   load_weightsh  s4   



z'OpenPanguVisionTransformer.load_weights)r   Nr:   F)r   r   r   r   r   r   rL   propertyrs   r   r   r   r   r  r  r   r>  rJ  r   r\   r\   rZ   r]   r   y  s:    	Y!-
:r   c                       s.   e Zd Zdedef fddZdd Z  ZS )r   i_hidden_sizet_hidden_sizec                    s(   t    tj| _tj||dd| _d S )NT)rE   )rK   rL   r   r   actr   Linearfc1)rY   rL  rM  rZ   r\   r]   rL     s   
zProjectionSingle.__init__c                 C   s   |  |}| |S r   )rN  rP  )rY   r   r^   r\   r\   r]   r     s   

zProjectionSingle.forward)r   r   r   r   rL   r   r   r\   r\   rZ   r]   r     s    r   c                   @   s^   e Zd Zdd ZddddddedB dedB deeef dB deee B dB d	e	f
d
dZ
dS )OpenPanguVLProcessingInfoc                 C   s
   | j jjS r   )ctxmodel_config	hf_configri   r\   r\   r]   get_hf_config  s   
z'OpenPanguVLProcessingInfo.get_hf_configN)
min_pixels
max_pixelsrq   fpsrV  rW  rq   rX  kwargsc                K   s.   |d ur||d< | j jdd|ddi|S )NrX  use_fastTr\   )rR  get_hf_processorpop)rY   rV  rW  rq   rX  rY  r\   r\   r]   r[    s   	

z*OpenPanguVLProcessingInfo.get_hf_processor)r   r   r   rU  r   r<  r   r   listobjectr[  r\   r\   r\   r]   rQ    s"    rQ  c                   @   J   e Zd ZU ed ed< eejeddf ed< eejeddf ed< dS )	OpenPanguVLImagePixelInputspixel_valuestypenpcpsnirb   image_grid_thwN	r   r   r   r
   __annotations__r	   rs   r   r5   r\   r\   r\   r]   r`       
 r`  c                   @   r_  )	OpenPanguVLImageEmbeddingInputsimage_embedsrb  nfhsre  rb   rf  Nrg  r\   r\   r\   r]   rj    ri  rj  c                   @   r_  )	OpenPanguVLVideoPixelInputspixel_values_videosrb  rc  ctpsnvrb   video_grid_thwNrg  r\   r\   r\   r]   rn    ri  rn  c                   @   r_  )	OpenPanguVLVideoEmbeddingInputsvideo_embedsrb  rl  rm  rq  rb   rr  Nrg  r\   r\   r\   r]   rs    ri  rs  c                	   @   s2   e Zd Zdedeeef dedee	 fddZ
dS )OpenPanguVLMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr@   c                    s   | j jd	i |}| j jd	i |}| j  }| }|j}|j}	|j}
|j}|| ||	 ||
 || d|j	d dt
dtffdd  fdddD S )
N)imagevideork   item_idxmodalityc                    s   | |  }|| d j }t|tjstd|dkr) gt|   }|S |\}}}||   }gg|  g }	|	|  }
|
dd }tj	|dS )N	_grid_thwz"Expected 'grid_thw' to be a Tensorry  r7   rl   )embed_token_id)
r   
isinstancers   r   	TypeErrorr   prodr  r2   select_token_id)r{  r|  out_itemr   image_token_id_totalr  r  r  video_seq_length_per_timevideo_token_id_per_timevideo_token_id_totalvideo_token_id_middle)image_token_idmerge_lengthrx  video_token_idvision_end_token_idvision_start_token_idr\   r]    get_replacement_openpangu_vision	  s.   
z\OpenPanguVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_openpangu_visionc              	      s(   g | ]}t || gt |d dqS ))r|  )r|  targetreplacement)r0   r   )rh   r|  )r  placeholderr\   r]   r   #  s    zFOpenPanguVLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>r\   )infor[  get_image_processorget_tokenizer	get_vocabimage_tokenvideo_tokenvision_start_tokenvision_end_token
merge_sizer   r   )rY   rv  rw  rx  hf_processorimage_processor	tokenizervocabr  r  r  r  r\   )r  r  r  rx  r  r  r  r  r]   _get_prompt_updates  s(   

 z2OpenPanguVLMultiModalProcessor._get_prompt_updatesN)r   r   r   r/   r   r   anyr.   r   r1   r  r\   r\   r\   r]   ru    s    
ru  c                   @   s   e Zd ZdS )OpenPanguVLDummyInputsBuilderN)r   r   r   r\   r\   r\   r]   r  /  s    r  )r  dummy_inputsc                       s   e Zd ZeddddddZg dddgd	Zd
ddedef fddZdd Z	de
fddZdededejfddZdefddZdefddZdedefdd Zdeded!B fd"d#Z	!dCd$ejdejfd%d&Zdeejd'f fd(d)Zdejfd*d+Z	!	!dDd$ejd,ejd-ed!B d.ejd!B dedejeB fd/d0Z	!dCd1ejdejd!B fd2d3Zd4eeeejf  dee fd5d6Zde fd7d8Z!e"d9ed:e#ded!B fd;d<Z$d=e%e& de'eee#e#e#e#f  fd>d?Z(d@e%e# d=e%e& deeje#f fdAdBZ)  Z*S )E#OpenPanguVLForConditionalGenerationzlanguage_model.model.visual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zlm_head.zmodel.)orig_to_new_prefix)q_projk_projv_projr4  r   )qkv_projr   r:   )r?   vllm_configr?   c             
      s   t    |jj}|| _|| _|j}| |ddh" t|j	|j	j
|jt|j	dd| |t|dd| _W d    n1 s?w   Y  | | t|tddd	gd
| _W d    n1 s`w   Y  | jj| _| |j	 d S )Nry  rz  rms_norm_epsr   visual)r   r   rA   r   r>   r?   	openpangulanguage_modelPanguEmbeddedForCausalLM)r  r?   architectures)rK   rL   rS  rT  configr  r>   _mark_tower_modelr   r   r   rA   r   _maybe_ignore_quant_configr*   r  _mark_language_modelr)   r  make_empty_intermediate_tensors_parse_preprocess_params)rY   r  r?   r  r>   rZ   r\   r]   rL   H  s2   



z,OpenPanguVLForConditionalGeneration.__init__c                 C   sf   |j | _|j| _ddlm} || jjj	 j
}|j| _|j| _|j| _t|j| _t|j| _d S )Nr   r+   )r   channelr   vllm.multimodalr,   create_processorr  rS  r  r[  r  
do_rescalerescale_factordo_normalizetuple
image_mean	image_std)rY   r   r,   r  r\   r\   r]   r  e  s   z<OpenPanguVLForConditionalGeneration._parse_preprocess_paramsr>   c                 C   s   t |ttfr	d S |S r   )r  r   r   )rY   r>   r\   r\   r]   r  u  s   z>OpenPanguVLForConditionalGeneration._maybe_ignore_quant_configmm_inputr3  r@   c                 C   s   t |tjtfstd| dt| t |tjr<|jdkr!|S |jdkr5t| d|j d|j dtt|S t|S )NzIncorrect type of z. Got type: rk   rb   z. should be 2D or batched 3D tensor. Got ndim: z (shape=))	r  rs   r   r]  
ValueErrorrb  r6  r   concat)rY   r  r3  r\   r\   r]   _validate_and_reshape_mm_tensorz  s   


zCOpenPanguVLForConditionalGeneration._validate_and_reshape_mm_tensorrY  c                 K   s   | dd }| dd }| dd }|d u r|d u rd S |d urD| |d}| |d}t|tjtfs=tdt| td||dS |d urj| |d}| |d}t|tjsctd	t| t	d||d
S d S )Nra  rk  rf  zimage pixel valueszimage grid_thwz0Incorrect type of image pixel values. Got type: )rb  ra  rf  zimage embedsz.Incorrect type of image embeddings. Got type: )rb  rk  rf  )
r\  r  r  rs   r   r]  r  rb  r`  rj  )rY   rY  ra  rk  rf  r\   r\   r]   _parse_and_validate_image_input  sT   zCOpenPanguVLForConditionalGeneration._parse_and_validate_image_inputc                 K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur3| |d}| |d}td||dS |d urY| |d}| |d}t|tjsRtdt| td||d	S d S )
Nro  rt  rr  zvideo pixel valueszvideo grid_thw)rb  ro  rr  zvideo embedsz.Incorrect type of video embeddings. Got type: )rb  rt  rr  )	r\  r  rn  r  rs   r   r  rb  rs  )rY   rY  ro  rt  rr  r\   r\   r]   _parse_and_validate_video_input  sH   zCOpenPanguVLForConditionalGeneration._parse_and_validate_video_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)ra  rk  ry  )ro  rt  rz  r\   )r  r  )rY   rY  mm_input_by_modality	input_keyr\   r\   r]   %_parse_and_validate_multimodal_inputs  s   

zIOpenPanguVLForConditionalGeneration._parse_and_validate_multimodal_inputsNc                 K   sv   | j di |}|sd S d}|D ](}|| }|dkr'| |}|s#|n|| }|dkr8| |}|s4|n|| }q|S )Nr\   ry  rz  )r  _process_image_input_process_video_input)rY   rY  r  multimodal_embeddingsr|  multimodal_inputvision_embeddingsvideo_embeddingsr\   r\   r]   embed_multimodal  s(   

z4OpenPanguVLForConditionalGeneration.embed_multimodal	input_idsc                 C   s4   | j |}|d ur| |||| jj| jjg}|S r   )r  embed_input_idsr  r  r  )rY   r  r  inputs_embedsr\   r\   r]   get_input_embeddings  s   z8OpenPanguVLForConditionalGeneration.get_input_embeddings.c                 C   s   |d }|j dkrtd|j  |d dkr!|d | jj}n6|d | jj}|d| j| j| j}t|| j	| j
| j| j| j}|d| j| j | j }| j||d}| jj}|d| | }|| S )	Nrf  rk   #grid_thw.ndim must be 2, but it is rb  rk  ra  rl   r   )r6  r  rb  r  r   r   r  r   rescale_and_normalizer  r  r  r  r  r   r  splitr  )rY   image_inputr   rk  ra  r  sizesr\   r\   r]   r    s0   
z8OpenPanguVLForConditionalGeneration._process_image_inputc                 C   s   |d }|j dkrtd|j  |d dkr!|d | jj}n|d | jj}| j||d}| jj}|d| | }|| S )	Nrr  rk   r  rb  rt  ro  r  rl   )	r6  r  rb  r  r   r   r  r  r  )rY   video_inputr   rt  ro  r  r  r\   r\   r]   r  >  s   
z8OpenPanguVLForConditionalGeneration._process_video_input	positionsintermediate_tensorsr  c                 K   s$   |d urd }| j j||||d}|S )N)r  r  r  r  )r  model)rY   r  r  r  r  rY  r   r\   r\   r]   r   Q  s   z+OpenPanguVLForConditionalGeneration.forwardr   c                 C   s   | j |S r   )r  compute_logits)rY   r   sampling_metadatar\   r\   r]   r  d  s   z2OpenPanguVLForConditionalGeneration.compute_logitsrA  c                 C   s   t | }|j|| jdS )N)mapper)r'   rJ  hf_to_vllm_mapper)rY   rA  loaderr\   r\   r]   rJ  k  s   z0OpenPanguVLForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  zvisual.merger.r  )r  	connectortower_model)r#   from_string_fieldri   r\   r\   r]   get_mm_mappingo  s
   z2OpenPanguVLForConditionalGeneration.get_mm_mappingr|  r   c                 C   s$   | drdS | drdS td)Nry  z[unused18][unused19][unused20]rz  z[unused18][unused32][unused20]z)Only image or video modality is supported)
startswithr  )clsr|  r   r\   r\   r]   get_placeholder_stry  s
   

z7OpenPanguVLForConditionalGeneration.get_placeholder_strmm_featuresc           	      c   s    | j jj}t|dd dD ]R}|jj}|j}|dkr>|jd j \}}}|dks1J d| ||d|| || fV  q|dkrZ|jd	 j \}}}||||| || fV  qt	d
| d S )Nc                 S   s   | j jS r   )mm_positionoffset)fr\   r\   r]   <lambda>  s    zFOpenPanguVLForConditionalGeneration.iter_mm_grid_thw.<locals>.<lambda>)rn   ry  rf  r7   zImage must have 1 frame, got rz  rr  zUnsupported modality: )
r  r   r   sortedr  r  r|  r   r  r  )	rY   r  r   
mm_featurer  r|  r   r   r   r\   r\   r]   iter_mm_grid_thw  s0   
z4OpenPanguVLForConditionalGeneration.iter_mm_grid_thwinput_tokensc                 C   s  g }d}|  |D ]\}}}}}	|| }
t|dkr"|d  d nd}|t|
dddd|  |dkrtjddtj	d}t||	}|
| }t|ddd|	
 }t|	dd|d
 }ttj|dtj	d||g}|||  t|d D ]#}|| }|||  ||| d  ||| d  |d7 }q||| |	  |d d  }q	t|ddd||	 	 
 }t|ddd|d|	
 }t|	ddd||d
 }|t|||g|
 |  ||| |	  }q	|t|k r6t|dkr|d  d nd}t|| }
|t|
dddd|  tj|dd	dd}| d t|  }||fS )
Nr   rl   r7   rb   rz  )rb   r7   r   rk   rc   )r  r   ru   r   rs   r   r   r   fulllongr   r   	full_liker   rt   r   r  )rY   r  r  llm_pos_ids_liststr|  r  
llm_grid_tr  r  text_lenst_idxeot_bot_pos
offset_poscurrent_posr  r  	frame_posrw   t_indexh_indexw_indexllm_positionsmrope_position_deltar\   r\   r]   get_mrope_input_positions  s    






"z=OpenPanguVLForConditionalGeneration.get_mrope_input_positionsr   )NN)+r   r   r   r(   r  packed_modules_mappingr   r   rL   r  r   r  r^  rs   r   r  r  r  r<  r  r   r  r  r  r  r  r3   r   r  r   r>  rJ  r#   r  classmethodr   r  r]  r-   r   r  r  r   r\   r\   rZ   r]   r  3  s    	
/)


$

r  c                 C   s   | | S r   r\   )ry  rI   r\   r\   r]   rescale  s   r  c                 C   s   t j| ||S r   )r   
functional	normalize)ry  meanstdr\   r\   r]   r
    s   r
  
   )maxsizer  r  r  r  r  r   ztorch.devicer@   c                 C   sB   |r| rt j||dd|  }t j||dd|  }d}|||fS )N)r   r   F)rs   r#  r  r  r  r  r  r   r\   r\   r]   !_fuse_mean_std_and_rescale_factor  s
   	
r  imagesztorch.Tensorr   c                 C   sT   t |||||| jd\}}}|rt| jtjd||} n|r#t| |} | |} | S )z'
    Rescale and normalize images.
    r  r   )r  r   r
  r!  rs   float32r  )r  r  r  r  r  r  r   r\   r\   r]   r    s   	

r  )NNNNNN)xcollections.abcr   r   r   r   r   	functoolsr   r   typingr	   r
   r   rs   torch.nnr   torch.nn.functionalr	  r   einopsr   torchvision.transformsr   transformers.utilsr   vllm.configr   vllm.distributedr   r   rM   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   'vllm.model_executor.layers.quantizationr   ,vllm.model_executor.layers.quantization.gptqr   3vllm.model_executor.layers.quantization.gptq_marlinr   2vllm.model_executor.layers.rotary_embedding.commonr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   r   r    r!   r"   )vllm.model_executor.models.module_mappingr#   %vllm.model_executor.models.qwen2_5_vlr$   r%   r&    vllm.model_executor.models.utilsr'   r(   r)   r*   r  r,   vllm.multimodal.inputsr-   r.   vllm.multimodal.parser/   vllm.multimodal.processingr0   r1   r2   vllm.sequencer3   vllm.utils.tensor_schemar4   r5   #vllm.v1.attention.backends.registryr6   visionr8   
get_loggerr   loggerr   r9   r   r   r   r   r   r   r   rQ  r`  rj  rn  rs  ru  r  register_processorr  r  r
  r   r   r]  r  r  bfloat16r   r  r\   r\   r\   r]   <module>   s   
R40&'  #@
   D

