o
    
۾i6                  	   @   s  U d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZmZmZmZ ddlZddlZddlmZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ dd	lmZm Z  dd
l!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z, ddl*m-Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZMmNZN ddlOmPZPmQZQmRZR ddlSmTZTmUZUmVZVmWZWmXZXmYZY ddlZm[Z[ ddl\m]Z]m^Z^ dd l_m`Z` d!d"lambZb d#d$lcmdZdmeZemfZfmgZgmhZh d#d%limjZj d#d&l-mkZkmlZlmmZmmnZn d#d'lompZpmqZqmrZr e0esZtd(ZuG d)d* d*e]ZvG d+d, d,e]ZwevewB Zxeeyd-< G d.d/ d/e]ZzG d0d1 d1e]Z{eze{B Z|eeyd2< G d3d4 d4ej}Z~d5ed6efd7d8ZG d9d: d:ej}ZG d;d< d<ej}ZG d=d> d>ej}ZG d?d@ d@ej}ZG dAdB dBej}ZG dCdD dDej}ZG dEdF dFeVZG dGdH dHeTe ZG dIdJ dJeUe ZeHjeeedKG dLdM dMej}egeeehefZeHjeeedKG dNdO dOeZdS )PzeInference-only GLM-4.1V & GLM-4.6V-Flash, AutoGLM-Phone-9B model
compatible with HuggingFace weights.    N)CallableIterableIteratorMappingSequence)partial)	AnnotatedAnyLiteral	TypeAlias)	rearrange)BatchFeatureGlm4vProcessor)Glm4vVisionConfig)Glm4vImageProcessorsmart_resize)Glm4vVideoProcessor)VideoMetadata)
VllmConfig)BaseDummyOptionsVideoDummyOptions)$get_tensor_model_parallel_world_sizeparallel_state)utils)init_logger)MMEncoderAttention)Conv2dLayerConv3dLayer)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)get_rope)ApplyRotaryEmb)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems	VideoItem)	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape)AttentionBackendEnum   )
SiluAndMul   )MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)_create_qwen2vl_field_factory)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vit_attn_backendis_vit_use_data_parallel!run_dp_sharded_mrope_vision_modeliX  c                   @   R   e Zd ZU dZdZed ed< eej	e
ddf ed< eej	e
ddf ed< d	S )
Glm4vImagePixelInputsz
    Dimensions:
        - np: Number of patches
        - cpp: Number of channels * patch_size * patch_size
        - ni: Number of images
        - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
    pixel_valuestypenpcppni   image_grid_thwN__name__
__module____qualname____doc__rN   r
   __annotations__r   torchTensorr9    r\   r\   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/glm4_1v.pyrL   z   
   
 rL   c                   @   rK   )
Glm4vImageEmbeddingInputsz
    Dimensions:
        - f: Number of image features (varies based on image resolution)
        - h: Hidden size (must match language model backbone)
        - n: Number of images
        - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
    image_embedsrN   fhnrR   rS   NrT   r\   r\   r\   r]   r_      r^   r_   Glm4vImageInputsc                   @   rK   )
Glm4vVideoPixelInputsa  
    Dimensions:
        - np: Number of patches
        - ctpp: Number of channels * temporal_patch_size *
            patch_size * patch_size
        - f: Number of frames
        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    pixel_values_videosrN   rO   ctppra   rR   video_grid_thwNrT   r\   r\   r\   r]   re      s
   
 
re   c                   @   rK   )
Glm4vVideoEmbeddingInputsa  
    Dimensions:
        - p: Number of video patches across all frames
        - h: Hidden size (must match language model backbone)
        - f: Number of frames
        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    video_embedsrN   prb   ra   rR   rh   NrT   r\   r\   r\   r]   ri      s
   
 	ri   Glm4vVideoInputsc                       sN   e Zd Z			ddededededB def
 fd	d
Zdej	fddZ
  ZS )Glm4vVisionMLPFN in_featureshidden_featuresbiasquant_configprefixc                    sZ   t    t }t||gd ||| d|d| _t||||| d|d| _t | _d S )Nr;   .gate_up_proj
input_sizeoutput_sizesrq   rr   rs   
disable_tp
.down_projrq   rr   rs   rx   )	super__init__rI   r    gate_up_projr"   	down_projr<   act_fn)selfro   rp   rq   rr   rs   use_data_parallel	__class__r\   r]   r|      s&   
zGlm4vVisionMLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r}   r   r~   )r   r   _r\   r\   r]   forward   s   
zGlm4vVisionMLP.forward)FNrn   )rU   rV   rW   intboolr#   strr|   rZ   r[   r   __classcell__r\   r\   r   r]   rm      s     rm   hidden_sizetp_sizec                    sp   ddl m} fddtD }|j|t jd  fdd|D }dd t| D }tj	|dd	}|S )
zEAll-gather the input tensor interleavely across model parallel group.r   Nc                    s   g | ]}t  qS r\   )rZ   
zeros_like).0r   )local_tensorr\   r]   
<listcomp>   s    z)all_gather_interleave.<locals>.<listcomp>)groupc                    s   g | ]}t |  d qS ))rZ   split)r   tensor)r   r   r\   r]   r      s    c                 S   s   g | ]	}|D ]}|qqS r\   r\   )r   pairr   r\   r\   r]   r      s
    r   dim)
torch.distributeddistributedrange
all_gatherr   get_tp_groupdevice_groupziprZ   cat)r   r   r   distgathered_tensorsgathered_tensors_splitordered_tensorsresult_tensorr\   )r   r   r   r]   all_gather_interleave   s   r   c                       s   e Zd Z		ddededededB deddf fd	d
Zdejde	ejdf fddZ
	ddejdejdejdejdejdB dejfddZ  ZS )Glm4vVisionAttentionNrn   	embed_dim	num_headsprojection_sizerr   rs   returnc              
      s   t    t }|rdnt | _|rdnt | _t	||| _
t	|| j| _t|| j
||d||r7| dn| d|d| _t|||| dd|d| _t| j| j
| j
d	 | d
d| _tdd| _d S )Nr=   r   Fz	.qkv_projz.qkv)r   	head_sizetotal_num_headstotal_num_kv_headsrq   rr   rs   rx   .proj)rv   output_sizerr   rs   rq   rx   g      .attn)r   r   scalers   T)enforce_enable)r{   r|   rI   r   r   r   get_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr!   qkvr"   projr   attnr%   apply_rotary_emb)r   r   r   r   rr   rs   r   r   r\   r]   r|      sJ   
	zGlm4vVisionAttention.__init__r   .c                    sX   |j \}}}|jddd\}}}||| j| jf  fdd|||fD \}}}|||fS )NrR   r;   r   c                 3   s    | ]}|j   V  qd S r   )viewr   r   	new_shaper\   r]   	<genexpr>>  s    z1Glm4vVisionAttention.split_qkv.<locals>.<genexpr>)shapechunkr   r   )r   r   seq_lenbsr   qkvr\   r   r]   	split_qkv0  s   
zGlm4vVisionAttention.split_qkvr   
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                 C   s   |  |\}}| |\}}}	dd |||	fD \}}}	|d ur>|d ur>tj||gdd}
| |
||}tj|ddd\}}| j|||	||d}t|d }| 	|\}}|S )Nc                 s   s    | ]
}t |d  V  qdS )zs b ... -> b s ...N)r   
contiguousr   r\   r\   r]   r   O  s    z/Glm4vVisionAttention.forward.<locals>.<genexpr>r   r   r;   )querykeyvaluer   r   zb s h d -> s b (h d))
r   r   rZ   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   	qk_concat
qk_rotatedcontext_layeroutputr\   r\   r]   r   A  s*   	zGlm4vVisionAttention.forward)Nrn   r   )rU   rV   rW   r   r#   r   r|   rZ   r[   tupler   r   r   r\   r\   r   r]   r      s>    5r   c                       s   e Zd Z			ddedededeegejf dB dedB ded	df fd
dZ		dde
jde
jde
jde
jdedB d	e
jfddZ  ZS )Glm4vVisionBlockNrn   r   r   mlp_hidden_dim
norm_layerrr   rs   r   c                    sl   t    |d u rttjdd}||| _||| _t||||| dd| _t	||d|| dd| _
d S )Nư>epsr   )r   r   r   rr   rs   Fz.mlp)rq   rr   rs   )r{   r|   r   nn	LayerNormnorm1norm2r   r   rm   mlp)r   r   r   r   r   rr   rs   r   r\   r]   r|   h  s&   
	

zGlm4vVisionBlock.__init__r   r   r   r   r   c           	      C   s>   | j | |||||d}| j||d\}}|| | }|S )Nr   r   r   r   )residual)r   r   r   r   )	r   r   r   r   r   r   x_attnx_fused_normr   r\   r\   r]   r     s   zGlm4vVisionBlock.forward)NNrn   r   )rU   rV   rW   r   r   r   Moduler#   r   r|   rZ   r[   r   r   r\   r\   r   r]   r   g  sB    #r   c                       sR   e Zd Z				ddedededed	d
f
 fddZdejd	ejfddZ  ZS )Glm4vVisionPatchEmbed   r=   rR      
patch_sizetemporal_patch_sizein_channelsr   r   Nc                    s>   t    || _|| _|| _|||f}t||||dd| _d S )NT)kernel_sizestriderq   )r{   r|   r   r   r   r   r   )r   r   r   r   r   r   r   r\   r]   r|     s   

zGlm4vVisionPatchEmbed.__init__r   c                 C   s:   |j \}}||d| j| j| j}| ||| j}|S )Nr   )r   r   r   r   r   r   )r   r   LCr\   r\   r]   r     s   
zGlm4vVisionPatchEmbed.forward)r   r=   rR   r   )	rU   rV   rW   r   r|   rZ   r[   r   r   r\   r\   r   r]   r     s"    r   c                       sR   e Zd Z			ddedededB deded	df fd
dZdej	fddZ
  ZS )Glm4vPatchMergerNFrn   d_modelcontext_dimrr   rq   rs   r   c              	      s   t    t }|| _t| j| j|d|| d|d| _t| j| _t	| j|gd ||| d|d| _
t|| j||| d|d| _t | _t | _d S )	NTr   )rq   gather_outputrr   rs   rx   r;   rt   ru   ry   rz   )r{   r|   rI   r   r   r   r   r   post_projection_normr    r}   r"   r~   r<   r   GELUextra_activation_func)r   r   r   rr   rq   rs   r   r   r\   r]   r|     s>   
	zGlm4vPatchMerger.__init__r   c                 C   sH   |  |\}}| | |}| |\}}| |}| |\}}|S r   )r   r   r   r}   r   r~   )r   r   r   gate_upr\   r\   r]   r     s   
zGlm4vPatchMerger.forward)NFrn   )rU   rV   rW   r   r#   r   r   r|   rZ   r[   r   r   r\   r\   r   r]   r     s$    (r   c                       s2   e Zd Zdef fddZdejfddZ  ZS )Glm4vVisionEmbeddingsconfigc                    sv   t    || _|j| _|j| _|j| _| j| j d | _| j| _t	
| j| j| _| jdt| jddd d S )Nr;   position_ids)r=   r   F)
persistent)r{   r|   r   r   r   
image_sizer   num_patchesnum_positionsr   	Embeddingposition_embeddingregister_bufferrZ   arangeexpand)r   r   r   r\   r]   r|     s   

zGlm4vVisionEmbeddings.__init__r   c                    s  | j j}|jd }|jd }|j}	||	||	}}|dkr,tjd||	|jd}
ntt	r:tj
|	tjdt tjsItj
 |	tjd |jd }t|d }||||ddddj|	tjd}t jd krg }g }ttD ]%}| jd  }| |df |  | |df |  q|t|j|	tjd}t|j|	tjd}n.t fddttD j|	tjd}t fddttD j|	tjd}|j|	tjd}|j|	tjd}|d | d d }|d | d d }tj||fd	d
dd}tj||dddd}|dd	dd}||j|j}
||
 }|S )Nr=   r   devicedtype      ?r;   c                    "   g | ]} |d f  | qS )r=   repeatr   iimage_shapeslengthsr\   r]   r   2     " z1Glm4vVisionEmbeddings.forward.<locals>.<listcomp>c                    r
  )r;   r  r  r  r\   r]   r   5  r  r   r   bicubicFborder)modealign_cornerspadding_mode)r  weightr   r  torZ   emptyr  
isinstancelistr   longr[   r   r   permute	unsqueezefloat32lenr   appendr  r   stackFgrid_samplesqueeze)r   
embeddingsr  r  h_coordsw_coordspos_embed_weightr   	total_seqr  adapted_pos_embedorig_size_sq	orig_sizepos_embed_2dtarget_h_listtarget_w_listr  	shape_idxtarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32r\   r  r]   r     s   






	


zGlm4vVisionEmbeddings.forward)	rU   rV   rW   r   r|   rZ   r[   r   r   r\   r\   r   r]   r     s
    r   c                       s   e Zd Z			ddedededB deddf
 fd	d
Zede	j
fddZ
ede	jfddZde	jdee	je	je	jf fddZde	jde	jdB fddZde	jde	jeee  B de	jfddZdeeee	jf  dee fddZ  ZS )Glm4vVisionTransformerr   Nrn   vision_confignorm_epsrr   rs   r   c           
         s.  t    j}j}j}j}j_j_j_j_j	_	t
|||jd_tt|d jj }	t|	ddddid_t fdd	t|D _tj	jd
 dd_t_tjjd_tjj	jjd_tjjd_t|	t  d_!d S )N)r   r   r   r   r   i    Tpartial_rotary_factorr	  )r   max_positionis_neox_stylerope_parametersc                    s0   g | ]}t jjj  d | dqS )z.blocks.)r   r   r   r   rr   rs   )r   r   r   out_hidden_size)r   	layer_idxr   rs   rr   r   r;  r\   r]   r   |  s    	z3Glm4vVisionTransformer.__init__.<locals>.<listcomp>Fz.merger)r   r   rr   rq   rs   )r   out_channelsr   r   )r   r  )"r{   r|   r   r   r   depthr   r   spatial_merge_sizerA  r   patch_embedr   r   r$   rotary_pos_embr   
ModuleListr   blocksr   intermediate_sizemergerr   r'  rms_norm_epspost_conv_layernormr   
downsamplepost_layernormrH   rZ   get_default_dtypeattn_backend)
r   r;  r<  rr   rs   r   r   r   rE  head_dimr   rC  r]   r|   X  sj   
	
zGlm4vVisionTransformer.__init__c                 C      | j jjjS r   )rG  r   r  r  r   r\   r\   r]   r       zGlm4vVisionTransformer.dtypec                 C   rT  r   )rG  r   r  r  rU  r\   r\   r]   r    rV  zGlm4vVisionTransformer.devicegrid_thwc                 C   s   g }|D ]]\}}}t |dd|}t |d|d}||| j | j|| j | jdddd }||| j | j|| j | jdddd }|t j	||gdd
|d qt j|dd}|d d dd f  }| j|\}	}
|	| d}|
| d}|||fS )Nr=   r   r   r;   rR   r   )rZ   r  r  r  reshaperF  r  flattenr"  r#  r  r   maxrH  get_cos_sin)r   rW  pos_idstrb   whpos_idswpos_idsmax_grid_sizecossincos_combinedsin_combinedr\   r\   r]   rot_pos_emb  s:   "

z"Glm4vVisionTransformer.rot_pos_embr   c                 C   s<   d }| j tjks| j tjkr|dd  |d d   }|S )Nr=   r   )rR  r:   
FLASH_ATTNROCM_AITER_FArZ  )r   r   r   r\   r\   r]   compute_attn_mask_seqlen  s
   z/Glm4vVisionTransformer.compute_attn_mask_seqlenr   c           
   	   C   s  t |trtj|tjd}|j| j| jd}| |}| 	|}| 
|\}}}t|d d df |d d df  |d d df jdtjd}t|d|g}|j| jdd}| |}|dd  |d d	   }| ||||d d df |d d df }|d}| jD ]}	|	|||||d
}q| |}|d	| j| j|jd	 }|dddd}| |d	| j}| |}|S )Nr  r  r=   r;   r   )r   r  T)non_blockingr   r   rR   )r  r  rZ   r   int32r  r  r  rG  rN  rf  repeat_interleavecumsumr   	new_zerosri  tolistr'  r  rJ  rP  r   rF  r   r  rO  rA  rL  )
r   r   rW  r   r   image_type_idsr   r   seqlensblkr\   r\   r]   r     sF   



,
"


	
zGlm4vVisionTransformer.forwardweightsc                 C   s   g d}t | jdd}t }|D ]9\}}|D ]\}}}	||vr!q|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))	attn.qkv.zattn.q.r   )ru  zattn.k.r   )ru  zattn.v.r   )r}   	gate_projr   )r}   up_projr=   F)remove_duplicateweight_loader)dictnamed_parameterssetreplacery  getattrr&   add)r   rt  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamry  r\   r\   r]   load_weights  s"   
z#Glm4vVisionTransformer.load_weights)r   Nrn   )rU   rV   rW   r   floatr#   r   r|   propertyrZ   r  r  r[   r   rf  ri  r  r   r   r   r|  r  r   r\   r\   r   r]   r:  W  sJ    K
&

,4r:  c                   @   sl  e Zd ZdeeedB f fddZdedefddZ	dede
fdd	Zd
d Zdddddedededededeeef fddZdefddZdededefddZdefddZdedededefddZdedefd d!Zd"ed#eeef defd$d%Zd&eeef d'edee fd(d)Zd&eeef d'edee fd*d+Zd,ejd&eeef d-ejdefd.d/Z dS )0Glm4vProcessingInfor   Nc                 C   s
   d ddS )Nr=   imagevideor\   rU  r\   r\   r]   get_supported_mm_limits0  s   
z+Glm4vProcessingInfo.get_supported_mm_limitskwargsc                 K      | j di |jS Nr\   )get_hf_processorimage_processorr   r  r\   r\   r]   get_image_processor3     z'Glm4vProcessingInfo.get_image_processorc                 K   r  r  )r  video_processorr  r\   r\   r]   get_video_processor6  r  z'Glm4vProcessingInfo.get_video_processorc                 C   s   t d|  dS )NT)video_needs_metadataexpected_hidden_size)r0   _get_expected_hidden_sizerU  r\   r\   r]   get_data_parser9  s   z#Glm4vProcessingInfo.get_data_parser   T )
num_frames	do_resizemax_image_pixelsimage_widthimage_heightr  r  r  c                C   s   |   }|j}|j}|j}	|j}
|r,t||
kr|n|
||||	 |d\}}t||d}nt||d}|||
  }t||
 d}|j| }|j	| }|| | }||	d  }||fS )N)r  heightwidthfactor
max_pixels)r  r  r=   r;   )
get_hf_configr;  r   rF  r   r   r.   rZ  r  r  )r   r  r  r  r  r  	hf_configr;  r   
merge_sizer   resized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wr   num_vision_tokensr\   r\   r]   _get_vision_info?  s0   	
	

z$Glm4vProcessingInfo._get_vision_infoc                 C   s   | j ddd\}}|S )Ni r  r  r  )r   max_image_sizer   r\   r\   r]   !get_image_size_with_most_featuresh  s   
z5Glm4vProcessingInfo.get_image_size_with_most_featuresc                C   s   | j ||dd\}}|S )Ni   )r  r  r  r  )r   r  r  r   num_image_tokensr\   r\   r]   get_num_image_tokensn  s   
z(Glm4vProcessingInfo.get_num_image_tokensc                 C   s   |   \}}| j||dS )Nr  )r  r  )r   target_widthtarget_heightr\   r\   r]   get_max_image_tokens{  s
   z(Glm4vProcessingInfo.get_max_image_tokensc                C   s   | j |||dd\}}|S )Nr  )r  r  r  r  r  )r   r  r  r  r   num_video_tokensr\   r\   r]   get_num_video_tokens  s   
z(Glm4vProcessingInfo.get_num_video_tokens
max_tokensc                 C   sF   |   \}}d}	 |d }| j|||d}||ks|dkr 	 |S |}q	)Nr   Tr=   )r  r  r  )r  r  )r   r  r  r  r  next_num_framesnext_max_tokensr\   r\   r]   _get_max_video_frames  s   z)Glm4vProcessingInfo._get_max_video_framesr   	mm_countsc                 C   sP   | dd}| dd}|  | }| || }t|t|d t}t|dS )Nr  r   r  r=   )getr  r  minrZ  _MAX_FRAMES_PER_VIDEO)r   r   r  
max_images
max_videosmax_image_tokensmax_total_framesmax_frames_per_videor\   r\   r]   !get_num_frames_with_most_features  s   
z5Glm4vProcessingInfo.get_num_frames_with_most_featuresmetadatatotal_framesc                    sx  |   |dj|d|}|d  |dt  d }|d }|s,|d }nC|jkrItt|j } fddt|D }n&tjj }||kr\t	t|}nt
jd	||d
d}	 fdd|	D }t g }
}|D ]}||
vr|
| || qwt|d@ r||d  |}fdd|D }|d d d }g }td	t|D ]	}|||  q|S )Nfpstotal_num_framesr=   durationdo_sample_framesframes_indicesc              	      s*   g | ]}t  tt| j qS r\   )r  r   mathceilr  r  max_frame_idx	video_fpsr  r\   r]   r     s    zCGlm4vProcessingInfo._get_video_second_idx_glm4v.<locals>.<listcomp>r   T)endpointc              	      s$   g | ]}t  tt| qS r\   )r  r   r  r  )r   r]  )r  r  r\   r]   r     s    r   c                       g | ]}t |  qS r\   r   r   idxr  r\   r]   r         r;   )r  r  r  roundmax_durationr   r  floorr   r  rO   linspacer|  r  r"  r!  )r   r  r  meta_framesr  r  frame_indicesrc   num_samplestarget_secondsseenuniqr  full_second_idxstimestamps_listselected_timestampsr\   r  r]   _get_video_second_idx_glm4v  sJ   



z/Glm4vProcessingInfo._get_video_second_idx_glm4vc                    sx  |   }|d |d|}|d }|dt| d }|dd}|s+|d }nddd	d
}	d}
d}t||}|dkrC|	d }n|dkrL|	d }n|	d }t|dd}t|| | }t||
}d   fddt|D }t|}||k rtjd|d |td	 }n&g }d}d||  }t|D ]}|| |kr||7 }|
| ||kr nqt||k rt|dkrdt|d d}}n	|d |d }}tj|||td	 }nt||krtjd|d |td	 }t g }}|D ]}||vr|| |
| qt|d@ r|
|d  |}fdd|D }|d d d }g }tt|D ]
}|
||  q/|S )Nr  r  r=   r  r  Tr  rR   r	  )   ,  `	  i  r  r  r  r   c                    s   g | ]}|  qS r\   r\   r  )duration_per_framer\   r]   r     s    zDGlm4vProcessingInfo._get_video_second_idx_glm46v.<locals>.<listcomp>r   rj  g        r   c                    r  r\   r  r  r  r\   r]   r   .  r  r;   )r  r  r  r  r~  r   r   rO   r  rp  r"  r!  rZ  r|  r  )r   r  r  r  r  r  r  r  r  DYNAMIC_FPS_THRESMAX_FRAME_COUNT_DYNAMICMAX_DURATIONeffective_duration
target_fpsr   	extract_t
timestamps
max_secondcurrent_secondinv_fpsframe_indexstartendr  r  r  r  r  r  r\   )r  r  r]   _get_video_second_idx_glm46v  s   








z0Glm4vProcessingInfo._get_video_second_idx_glm46vvideo_arrayrW  c                    s  |   }|  |j}|  }|j}|j}|j}	|j}
|jd }t	|t
js(J t	|tr5| |t|n| |t|}t	|trDdnd  fdd|D }|\}}}t|| | }g }||	 |D ]}|| ||jg|  || || qf||
 |S )Nr;   z{}z{:.1f} secondsc                    s    g | ]}j  |d dqS )F)add_special_tokens)encodeformatr  timestamp_format	tokenizerr\   r]   r   P  s    zDGlm4vProcessingInfo._construct_video_placeholder.<locals>.<listcomp>)r  get_tokenizerr  r  image_start_token_idimage_end_token_idvideo_start_token_idvideo_end_token_idr  r  rZ   r[   r   r  r!  r  r   r"  extendvideo_token_id)r   r  r  rW  hf_processorr  r  boi_token_ideoi_token_idbov_token_ideov_token_idmerge_lengthr  frames_idx_tokenTHWnum_tokens_per_frameplaceholder	frame_idxr\   r  r]   _construct_video_placeholder5  s<   





z0Glm4vProcessingInfo._construct_video_placeholder)!rU   rV   rW   r   r   r   r  objectr   r  r   r  r  r   r   r.   r  r  r  r  r  r  r  rz  r	   r  r  r  rO   ndarrayrZ   r[   r  r\   r\   r\   r]   r  /  s    

)





3

M
r  c                   @   s   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z	dd
dedededede
dB dee fddZdS )Glm4vDummyInputsBuilderr  r   c           
      C   sf   | dd}| dd}| j }| j }| j }|j}|j|j|jg}|	|}	|| |	|  S )Nr  r   r  )
r  infor  r  r	  image_tokenr  r  r  decode)
r   r  
num_images
num_videosr  r  r  r"  video_token_idsvideo_tokenr\   r\   r]   get_dummy_textc  s   



z&Glm4vDummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s   | dd}| dd}| j \}}| j||}|r!| dnd }	|r*| dnd }
| j||||	d| j|||||
ddS )Nr  r   r  )r  r  r$  	overrides)r  r  r  r%  r*  r  )r  r!  r  r  _get_dummy_images_get_dummy_videos)r   r   r  r)  r$  r%  r  r  target_num_framesimage_overridesvideo_overridesr\   r\   r]   get_dummy_mm_datau  s,   z)Glm4vDummyInputsBuilder.get_dummy_mm_data)r*  r  r  r  r%  r*  c                C   s   |rD|j r|j |krtd|j | t||j }|jr.|j|kr(td|j| t||j}|jrD|j|kr>td|j| t||j}t|d}tj|||dfdtj	d}g }t
|D ]}d|d |d	d
 t
|D ddd}	| |	f}
||
 q\|S )Nz]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignoredr;   rR      rj  g       @c                 S   s   g | ]}|qS r\   r\   r  r\   r\   r]   r     s    z=Glm4vDummyInputsBuilder._get_dummy_videos.<locals>.<listcomp>opencvF)r  r  r  r  video_backendr  )r  loggerwarningr  r  r  rZ  rO   fulluint8r   copyr"  )r   r  r  r  r%  r*  r  video_itemsr  video_metadata
video_itemr\   r\   r]   r,    sN   	



z)Glm4vDummyInputsBuilder._get_dummy_videosr   )rU   rV   rW   r   r   r   r(  r   r)   r0  r   r  r-   r,  r\   r\   r\   r]   r   b  s4    

(r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Glm4vMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr   c                    sr  t |}| jjdi |}d|v rt|d trt|d dkrg }g }|dg D ]h}|\}	 t di |}
 dd|
d< t  }|	gg|d< dgtdi  fdd D gg|d< t	 j
d||
|d	}|d
}|j|||jk< |j|d }|d|d}||d  ||d  q*t t|t|d}nt  }t	 j
||||d	}t |fi |}t|S )Nvideosr   r  Tc                    s   i | ]}|vr| | qS r\   r\   )r   r   r  unuse_metadatar\   r]   
<dictcomp>  s
    z?Glm4vMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>r:  +<|begin_of_video|><|video|><|end_of_video|>)r=  r>  r?  r@  	input_idsr=   rh   rf   )rf   rh   r\   )rz  r!  r  r  r  r!  popr  r   r{   _call_hf_processorr  image_token_idr  batch_decoder}  r"  rZ   r   r   )r   r=  r>  r?  r@  	processorvideo_grid_thw_lstpixel_values_videos_lstitemr  video_mm_kwargsvideo_mm_datavideo_outputsrF  video_placeholderprocessed_outputscombined_outputsr   rB  r]   rH    st   
z+Glm4vMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t | j jj|S r   )rC   r!  r  r;  rF  )r   rU  rV  r\   r\   r]   _get_mm_fields_config"  s
   z.Glm4vMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sz   j jdi | j jdi |}|jd dtf fdd}dtf fdd}td j|dtd	d
|dgS )Nr;   item_idxc                    sB   d |  }|d j }t|tjsJ t|  } jg| S )Nr  rS   )datar  rZ   r[   r   prodrI  )rZ  out_itemrW  
num_tokens)r  r  rY  r\   r]   get_image_replacement_glm4v6  s
   
zQGlm4vMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_glm4vc                    sV   d |  }|d j }t|tjsJ d |  \}}j|||}tj| jdS )Nr  rh   )embed_token_id)	r[  r  rZ   r[   r!  r  r6   select_token_idr  )rZ  r]  rW  r  r  r  )r  rX  rY  r   r\   r]   get_video_replacement_glm4v>  s   
zQGlm4vMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_glm4vr  )modalitytargetreplacementr  rE  r\   )r!  r  r  r  r   r4   r"  )r   rX  rV  rY  r  r_  rb  r\   )r  r  rX  rY  r   r]   _get_prompt_updates+  s    
z,Glm4vMultiModalProcessor._get_prompt_updates)rU   rV   rW   r   r   r  r   rH  r+   rW  r/   r	   r,   r   r5   rf  r   r\   r\   r   r]   r<    s8    


S


	
r<  )r!  dummy_inputsc                       s  e Zd Zg ddgdZedddddZd	Zed
ede	dedB fddZ
dddedef fddZdededB fddZdededB fddZdedeejdf fddZdedeejdf fd d!Zdedefd"d#ZdededB fd$d%Zd&ee deee	e	e	e	f  fd'd(Zd)ee	 d&ee deeje	f fd*d+Z		d@d,ejdB d-ejd.e dB d/ejdB dedeje B fd0d1Z!d2ejdejdB fd3d4Z"d5e#eeejf  de$e fd6d7Z%de&fd8d9Z'd:e	de	fd;d<Z(d=e	de	fd>d?Z)  Z*S )AGlm4vForConditionalGenerationq_projk_projv_projr}   qkv_projr}   zlanguage_model.lm_head.zlanguage_model.model.visual.)zlm_head.zmodel.language_model.zmodel.visual.)orig_to_new_prefixTrc  r  r   Nc                 C   s$   | drdS | drdS td)Nr  z+<|begin_of_image|><|image|><|end_of_image|>r  rE  z)Only image or video modality is supported)
startswith
ValueError)clsrc  r  r\   r\   r]   get_placeholder_strv  s
   

z1Glm4vForConditionalGeneration.get_placeholder_strrn   )rs   vllm_configrs   c                   s  t    |jj}|j}|jj}|| _|| _|jdk| _| 	|ddh t
|jt|dd|t|dd| _W d    n1 s@w   Y  |jdv rNd	g}n|jd
krWdg}nd }| | t||jt|d|d| _W d    n1 svw   Y  | jj| _d S )Nr[  r  r  rM  gh㈵>visual)r<  rr   rs   )glm4vglm_ocrGlm4ForCausalLM	glm4v_moeGlm4MoeForCausalLMlanguage_model)ru  r  rs   architectures)r{   r|   model_configr  rr   multimodal_configr   mm_encoder_tp_moder   _mark_tower_modelr:  r;  r~  rG   rv  
model_type_mark_language_modelrF   text_configr|  make_empty_intermediate_tensors)r   ru  rs   r   rr   r  r}  r   r\   r]   r|     s<   





	z&Glm4vForConditionalGeneration.__init__r  c                 K   h   | dd }| dd }| dd }|d u r|d u rd S |d ur'td||dS |d ur2td||dS d S )NrM   r`   rS   )rN   rM   rS   )rN   r`   rS   )rG  rL   r_   )r   r  rM   r`   rS   r\   r\   r]   _parse_and_validate_image_input  $   z=Glm4vForConditionalGeneration._parse_and_validate_image_inputc                 K   r  )Nrf   rj   rh   )rN   rf   rh   )rN   rj   rh   )rG  re   ri   )r   r  rf   rj   rh   r\   r\   r]   _parse_and_validate_video_input  r  z=Glm4vForConditionalGeneration._parse_and_validate_video_inputimage_input.c                 C      |d }|j dksJ |d dkr|d | jj}n|d | jj}| jr2t| j|| ddS | j||d}| jj}|d	| |  }|	|S )
NrS   r;   rN   r`   rM   rope_3d	rope_typerW  r   
ndimrN   rv  r  r   rJ   rp  rF  r\  r   )r   r  rW  r`   rM   r  sizesr\   r\   r]   _process_image_input  s   
z2Glm4vForConditionalGeneration._process_image_inputvideo_inputc                 C   r  )
Nrh   r;   rN   rj   rf   r  r  r  r   r  )r   r  rW  rj   rf   r  r  r\   r\   r]   _process_video_input  s$   
z2Glm4vForConditionalGeneration._process_video_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)rM   r`   r  )rf   rj   r  r\   )r  r  )r   r  mm_input_by_modality	input_keyr\   r\   r]   %_parse_and_validate_multimodal_inputs  s   

zCGlm4vForConditionalGeneration._parse_and_validate_multimodal_inputsc                 K   sn   | j di |}|sd S d}|D ]$}|| }|dkr%| |}|t|7 }|dkr4| |}|t|7 }q|S )Nr\   r  r  )r  r  r   r  )r   r  r  multimodal_embeddingsrc  multimodal_inputimage_embeddingsvideo_embeddingsr\   r\   r]   embed_multimodal  s   

z.Glm4vForConditionalGeneration.embed_multimodalmm_featuresc           	      c   s    | j }|jj}t|dd dD ]P}|jj}|jdkr=|jd j \}}}|dks1J d| |||| || fV  q|jdkrY|jd	 j \}}}|||| || fV  qt	d
|j d S )Nc                 S   s   | j jS r   )mm_positionoffset)ra   r\   r\   r]   <lambda>4  s    z@Glm4vForConditionalGeneration.iter_mm_grid_thw.<locals>.<lambda>)r   r  rS   r=   zImage must have 1 frame, got r  rh   zUnsupported modality: )
r   r;  rF  sortedr  r  rc  r[  rp  rr  )	r   r  r  rF  
mm_featurer  r]  rb   r^  r\   r\   r]   iter_mm_grid_thw/  s$   

z.Glm4vForConditionalGeneration.iter_mm_grid_thwinput_tokensc                 C   s6  g }d}|  |D ]G\}}}}|| }	t|dkr!|d  d nd}
|tt|	d|	f|
  t|||fdd}|||	 |
  ||| |  }q	|t|k r}t|| }	t|dkrk|d  d nd}
|tt|	d|	f|
  tj	|dddd}| d t| 
 }t||fS )Nr   r   r=   rR   )axis)r  r!  rZ  r"  rO   broadcast_tor  indicesrX  concatenaterN  rZ   
from_numpy)r   r  r  llm_pos_ids_liststr  
llm_grid_t
llm_grid_h
llm_grid_wtext_lenst_idxgrid_indicesllm_positionsmrope_position_deltar\   r\   r]   get_mrope_input_positionsE  s6     z7Glm4vForConditionalGeneration.get_mrope_input_positionsrF  	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )a  Run forward pass for GLM-4V.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for GLM-4V
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
            intermediate_tensors: Optional intermediate tensors for pipeline
                parallelism.
            inputs_embeds: Optional pre-computed input embeddings.
            **kwargs: Additional keyword arguments.
        N)rF  r  r  r  )r|  model)r   rF  r  r  r  r  hidden_statesr\   r\   r]   r   h  s   z%Glm4vForConditionalGeneration.forwardr  c                 C   s   | j |S r   )r|  compute_logits)r   r  r\   r\   r]   r    s   z,Glm4vForConditionalGeneration.compute_logitsrt  c                 C   s   t | }|j|| jdS )N)mapper)rD   r  hf_to_vllm_mapper)r   rt  loaderr\   r\   r]   r    s   z*Glm4vForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        zlanguage_model.modelzvisual.merger.ro  )r|  	connectortower_model)r'   from_string_fieldrU  r\   r\   r]   get_mm_mapping  s
   z,Glm4vForConditionalGeneration.get_mm_mappingr  c                 C   s   | j jj}||d  S Nr;   r   r;  rF  )r   r  r  r\   r\   r]   get_num_mm_encoder_tokens     
z7Glm4vForConditionalGeneration.get_num_mm_encoder_tokensr  c                 C   s   | j jj}||d  S r  r  )r   r  r  r\   r\   r]   get_num_mm_connector_tokens  r  z9Glm4vForConditionalGeneration.get_num_mm_connector_tokens)NN)+rU   rV   rW   packed_modules_mappingrE   r  supports_encoder_tp_dataclassmethodr   r   rt  r   r|   r  rd   r  rl   r  r   rZ   r[   r  r  rz  r  r>   r  r  r*   r   r  r  r7   r   r  r   r|  r  r'   r  r  r  r   r\   r\   r   r]   rh  Z  s    	
%





'
"
$

rh  c                   @   s   e Zd Zg dddgdZdS ) Glm4vMoeForConditionalGenerationri  rv  rw  rm  N)rU   rV   rW   r  r\   r\   r\   r]   r    s    
r  )rX   r  collections.abcr   r   r   r   r   	functoolsr   typingr   r	   r
   r   numpyrO   rZ   torch.nnr   torch.nn.functional
functionalr$  einopsr   transformersr   r   -transformers.models.glm4v.configuration_glm4vr   0transformers.models.glm4v.image_processing_glm4vr   r   0transformers.models.glm4v.video_processing_glm4vr   transformers.video_utilsr   vllm.configr   vllm.config.multimodalr   r   vllm.distributedr   r   r   r   vllm.loggerr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r    r!   r"   'vllm.model_executor.layers.quantizationr#   +vllm.model_executor.layers.rotary_embeddingr$   2vllm.model_executor.layers.rotary_embedding.commonr%   -vllm.model_executor.model_loader.weight_utilsr&   )vllm.model_executor.models.module_mappingr'   vllm.multimodalr(   vllm.multimodal.inputsr)   r*   r+   r,   r-   vllm.multimodal.parser.   r/   r0   vllm.multimodal.processingr1   r2   r3   r4   r5   r6   vllm.sequencer7   vllm.utils.tensor_schemar8   r9   #vllm.v1.attention.backends.registryr:   layers.activationr<   
interfacesr>   r?   r@   rA   rB   qwen2_vlrC   rD   rE   rF   rG   visionrH   rI   rJ   rU   r4  r  rL   r_   rd   rY   re   ri   rl   r   rm   r   r   r   r   r   r   r   r:  r  r   r<  register_processorrh  r  r\   r\   r\   r]   <module>   s    $m32n Y  5l 
  P