o
    
۾iv                    @   s  d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZ d dlZd dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d d	lmZmZmZm Z  d d
l!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZCmDZDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d dlLmMZMmNZN d dlOmPZP d dlQmRZR d dlSmTZT d dlUmVZVmWZWmXZXmYZY d dlZm[Z[m\Z\m]Z]m^Z^ d dl_m`Z`maZambZbmcZcmdZd d d lemfZf d d!lgmhZh d d"limjZj d d#lkmlZlmmZm d$d%lnmoZompZpmqZqmrZrmsZs d$d&ltmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z| e7e}Z~d'Zd(Zd)ZG d*d+ d+elZG d,d- d-elZeG d.d/ d/ZeG d0d1 d1ZeG d2d3 d3ZG d4d5 d5ejZG d6d7 d7ejZG d8d9 d9ejZG d:d; d;ejZG d<d= d=ejZG d>d? d?ejZG d@dA dAejZG dBdC dCejesZG dDdE dEejZG dFdG dGejZG dHdI dIejZG dJdK dKeZe)G dLdM dMejesZdNedOedPedQedRedSeeef fdTdUZdVedSeeeef  fdWdXZdYedZedPed[efd\d]Zd^e"dSe\fd_d`Zdae"dB dSe"dB fdbdcZddejdeedfedgedhediedSeejejf fdjdkZdlejdeedmednedSeejejf f
dodpZG dqdr drZefdseeB dteeB dueeB dSee fdvdwZdsedxedyedzed{ee dSedB fd|d}Zd~d ZG dd deaZG dd defe ZG dd de`e ZeTjeeedG dd dejeqerepesZdeeeejf  dSeeeejf  fddZdS )    N)IterableMappingSequence)	dataclassfields)cached_propertypartial)islice)	AnnotatedAny)ImageOps)Image)BatchFeaturePretrainedConfigProcessorMixin
TensorType)
ImageInput)	TextInput)
VideoInputVideoMetadata)support_torch_compile)CacheConfig
VllmConfig)BaseDummyOptionsVideoDummyOptions)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather)init_logger)
MulAndSilu
SiluAndMul
get_act_fn)	AttentionMMEncoderAttention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems	VideoItem)ImageProcessorItems	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)BaseDummyInputsBuilder)IntermediateTensors
round_down)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuant)AutoWeightsLoaderWeightsMapper_merge_multimodal_embeddingsextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixz	<|image|>z	<|video|>   c                   @      e Zd ZU dZeejedddf ed< eejeddf ed< 	 eejed	f ed
< eej	edf ed< eejed	f ed< dS )Molmo2ImageInputsa`  
    Dimensions:
        - nc: The total number of crops (dynamic)
        - np: The total number of patches per crop
        - cps: Number of channels * patch_size * patch_size
        - npp: Number of pooled patches (dynamic)
        - pp: pooling_size * pooling_size
        - ni: Number of images
        - nt: Number of image tokens (dynamic)
    ncnpcpspixel_valuesnpppptoken_poolingninum_pooled_patchesntimage_tokensnum_image_tokensN
__name__
__module____qualname____doc__r
   torchTensorrE   __annotations__
BoolTensor rl   rl   U/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/molmo2.pyrV   l      
 rV   c                   @   rU   )Molmo2VideoInputsab  
    Dimensions:
        - nc: The total number of frames (dynamic)
        - np: The total number of patches per frame
        - cps: Number of channels * patch_size * patch_size
        - npp: Number of pooled patches (dynamic)
        - pp: pooling_size * pooling_size
        - nv: Number of videos
        - nt: Number of video tokens (dynamic)
    rW   rX   rY   pixel_values_videosr[   r\   r]   nvr_   r`   video_tokensnum_video_tokensNrc   rl   rl   rl   rm   ro      rn   ro   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZeed< dZeed< dZeeef ed< dZeed< dZeed< dd Zedd ZdS )	VitConfigzConfig for a vision transformer  hidden_sizei  intermediate_size   num_hidden_layers   num_attention_headsnum_key_value_headsH   head_dimgelu_pytorch_tanh
hidden_actư>layer_norm_eps)z  r   image_default_input_size   image_patch_sizeiA  image_num_posc                 C   s   t | j| _d S N)tupler   selfrl   rl   rm   __post_init__   s   zVitConfig.__post_init__c                 C   s   | j \}}|| j || j fS r   )r   r   )r   hwrl   rl   rm   image_num_patch   s   
zVitConfig.image_num_patchN)rd   re   rf   rg   rv   intrj   rw   ry   r{   r|   r~   r   strr   floatr   r   r   r   r   propertyr   rl   rl   rl   rm   rt      s    
 rt   c                   @   s   e Zd ZU dZdZeeef ed< dZe	ed< dZ
eed< dZeed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dS )AdapterConfigzConfig for a vit-llm adapter)i
vit_layersFpooling_attention_maskru   rv   rz   r{   r|   r}   r~   silur    J  rw      text_hidden_sizeN)rd   re   rf   rg   r   r   r   rj   r   boolrv   r{   r|   r~   r   r   rw   r   rl   rl   rl   rm   r      s   
 r   c                   @   s
  e Zd ZU dZdZeed< 	 dZeed< 	 dZeed< 	 dZ	eed	< 	 d
Z
eed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 dZeed< 	 d Zeed!f d B ed"< d S )#
TextConfigz*Configuration for a text model transformerr   rv      r{      r|      r~   i R 
vocab_sizeadditional_vocab_sizeTqkv_bias0   ry   r   rw   r   r   i   max_position_embeddingsg    .A
rope_thetaFuse_qk_normolmoqk_norm_typer   r   
norm_afterN.rope_scaling_layers)rd   re   rf   rg   rv   r   rj   r{   r|   r~   r   r   r   r   ry   rw   r   r   r   r   r   r   r   r   r   r   r   rl   rl   rl   rm   r      sH   
 r   c                       sZ   e Zd ZdZ		ddededededB ded	df fd
dZdej	d	ej	fddZ
  ZS )ViTMLPzMLP used in Vision Transformer.N dim
hidden_dimr   quant_configprefixreturnc                    sL   t    t||d|| dd| _t|| _t||d|| dd| _d S )NTz.w1biasr   r   z.w2)super__init__r'   w1r#   actr*   w2)r   r   r   r   r   r   	__class__rl   rm   r   #  s    

zViTMLP.__init__xc                 C   *   |  |\}}| |}| |\}}|S r   )r   r   r   r   r   _rl   rl   rm   forward=     
zViTMLP.forwardNr   rd   re   rf   rg   r   r   r,   r   rh   ri   r   __classcell__rl   rl   r   rm   r      s$    r   c                       sd   e Zd ZdZ			ddedededed	ed
edB deddf fddZde	j
de	j
fddZ  ZS )ViTMultiHeadDotProductAttentionz0Multi-head attention used in Vision Transformer.TNr   rv   	num_headsr|   r~   use_biasr   r   r   c           	   	      sJ  t    || _|| _t }| j| j dksJ | j| dks!J | j| | _|| _| j| j| j ks5J || _| j|krG| j| dksFJ n	|| j dksPJ td| j| | _	| j| j | _
| j	| j | _t| j| j| j| j||| dd| _t| j| j | j||| dd| _| jd | _t| j| j| j| j	| dd| _d S )	Nr   rF   z.merged_qkvr   z.wo      .attnnum_kv_headsr   )r   r   rv   total_num_headsr   r   r~   total_num_kv_headsmaxr   q_sizekv_sizer)   
merged_qkvr*   woscaler%   attn)	r   rv   r   r|   r~   r   r   r   tp_sizer   rl   rm   r   G  sP   


	
z(ViTMultiHeadDotProductAttention.__init__inputsc                 C   sN   |  |\}}|j| j| j| jgdd\}}}| |||}| |\}}|S Nr   )r   splitr   r   r   r   )r   r   qkvr   xqxkxvoutputrl   rl   rm   r     s
    z'ViTMultiHeadDotProductAttention.forward)TNr   )rd   re   rf   rg   r   r   r,   r   r   rh   ri   r   r   rl   rl   r   rm   r   D  s.    	<r   c                	       sR   e Zd ZdZ		ddededB deddf fdd	Zd
ej	dej	fddZ
  ZS )Molmo2VisionBlockz4Residual attention block used in Vision Transformer.Nr   configr   r   r   c                    sz   t    t|j|j|j|j|| dd| _t|j|j	|j
|| dd| _tj|j|jd| _tj|j|jd| _d S )Nz
.attention)rv   r   r|   r~   r   r   z.feed_forward)r   r   r   r   r   eps)r   r   r   rv   r{   r|   r~   	attentionr   rw   r   feed_forwardnn	LayerNormr   attention_normffn_normr   r   r   r   r   rl   rm   r     s0   
zMolmo2VisionBlock.__init__r   c                 C   s,   ||  | | }|| | | }|S r   )r   r   r   r   )r   r   rl   rl   rm   r     s   zMolmo2VisionBlock.forwardr   )rd   re   rf   rg   rt   r,   r   r   rh   ri   r   r   rl   rl   r   rm   r     s    r   c                	       sV   e Zd ZdZ		ddededB deddf fdd	Zd
ej	de
ej	 fddZ  ZS )Molmo2VisionBlockCollectionzCCollection of residual attention blocks used in Vision Transformer.Nr   r   r   r   r   c                    s2   t    t fddt jD | _d S )Nc                    s$   g | ]}t   d | dqS )z.resblocks.r   )r   ).0	layer_idxr   r   r   rl   rm   
<listcomp>  s    z8Molmo2VisionBlockCollection.__init__.<locals>.<listcomp>)r   r   r   
ModuleListrangery   	resblocksr   r   r   rm   r     s   

z$Molmo2VisionBlockCollection.__init__r   c                 C   s&   g }| j D ]}||}|| q|S r   )r   append)r   r   hidden_statesrrl   rl   rm   r     s
   
z#Molmo2VisionBlockCollection.forwardr   )rd   re   rf   rg   rt   r,   r   r   rh   ri   listr   r   rl   rl   r   rm   r     s    "r   c                	       s|   e Zd ZdZ		ddededB deddf fdd	Zd
ej	de
dej	fddZ	dd
ej	de
dB deej	 fddZ  ZS )Molmo2VisionTransformerz+Vision Transformer used in Vision Backbone.Nr   r   r   r   r   c                    sz   t    |jd }d| _|j| _tt	|j
|j| | _|j}tj|| d |jdd| _t||| dd| _d S )Nr   r      T)r   z.transformerr   )r   r   rv   num_prefix_tokensr   	patch_numr   	Parameterrh   randnr   positional_embeddingr   Linearpatch_embeddingr   transformer)r   r   r   r   r   r   r   rl   rm   r     s$   


z Molmo2VisionTransformer.__init__r   r   c                 C   s   | j }|tt|jd tt|jd |jd f}|\}}|jd |ks0|jd |krR|ddddd}tj	|||fdddd}|dddd
d}|d	|jd	 }||d d d d d f |j }|S )
Nr   rF   r      bicubicFT)sizemodealign_corners	antialiasr   )r   reshaper   mathsqrtshape	unsqueezepermuteFinterpolatesqueezetodtype)r   r   r   pos_embpatch_num_0patch_num_1rl   rl   rm   add_pos_emb  s*   "z#Molmo2VisionTransformer.add_pos_embc                 C   s2   |du r| j }| |}| ||}| |}|S )z>
        : param x: (batch_size, num_patch, n_pixels)
        N)r   r   r  r  )r   r   r   r   rl   rl   rm   r     s   

zMolmo2VisionTransformer.forwardr   r   )rd   re   rf   rg   rt   r,   r   r   rh   ri   r   r  r   r   r   rl   rl   r   rm   r     s,     r   c                       s   e Zd ZdZ				ddededed	ed
ededededB deddf fddZ	dde	j
de	j
de	j
de	j
dB de	j
f
ddZ	dde	j
de	j
de	j
dB de	j
fddZ  ZS )ImagePoolingAttentionz+Multi-head attention used for image poolingTFNr   	input_dimrv   r   r|   r~   r   use_pytorch_sdpar   r   r   c
                    s~  t    || _|| _|| _t }
| j| j dksJ | j|
 dks$J | j|
 | _|| _| j| j| j ks8J || _| j|
krJ| j|
 dksIJ n	|
| j dksSJ t	d| j|
 | _
| j
| j | _t| j| j| j |||	 dd| _t| j| j| j gd |||	 dd| _t| j| j | j|||	 dd| _| jd | _|| _|rd | _d S t| j| j| j| j
|	 d	d
| _d S )Nr   rF   z.q_projr   r  z
.merged_kvz.o_projr   r   r   )r   r   r  rv   r   r   r   r~   r   r   r   r   r'   q_projr(   	merged_kvr*   o_projr   r  r   r%   )r   r  rv   r   r|   r~   r   r  r   r   r   r   rl   rm   r   "  s`   




zImagePoolingAttention.__init__querykeyvalue	attn_maskc           
      C   s   |  \}}}| d}|||| j| j}|||| j| j}|||| j| j}dd |||fD \}}}tj||||d| j| jkddd}	|	||dS )NrF   c                 s   s    | ]	}| d dV  qdS )rF   r  N)	transpose)r   r   rl   rl   rm   	<genexpr>w  s    z5ImagePoolingAttention.forward_sdpa.<locals>.<genexpr>F)r   	is_causal
enable_gqar  r   )	r  viewr   r~   r   r  scaled_dot_product_attentionr!  r  )
r   r  r  r  r   bszq_lenr   kv_lenoutrl   rl   rm   forward_sdpai  s"   

	z"ImagePoolingAttention.forward_sdpainputs_q	inputs_kvc           
      C   sn   |  |\}}| |\}}|j| j| jgdd\}}| jr'| ||||}	n| |||}	| |	\}	}|	S r   )r  r  r   r   r  r+  r   r  )
r   r,  r-  r   r   r   kvr   r   r   rl   rl   rm   r     s   zImagePoolingAttention.forward)TFNr   r   )rd   re   rf   rg   r   r   r,   r   r   rh   ri   r+  r   r   rl   rl   r   rm   r    sb    		
L
r  c                       s^   e Zd ZdZ		ddedededededB d	ed
df fddZdej	d
ej	fddZ
  ZS )ImageProjectorMLPz MLP used for the image projectorNr   r  r   
output_dimr   r   r   r   c                    s\   t    t||gd d|| dd| _|dksJ t | _t||d|| dd| _d S )Nr  Fz.merged_linearr   r   z
.down_proj)r   r   r(   merged_linearr"   act_fnr*   	down_proj)r   r  r   r0  r   r   r   r   rl   rm   r     s"   
	zImageProjectorMLP.__init__r   c                 C   r   r   )r1  r2  r3  r   rl   rl   rm   r     r   zImageProjectorMLP.forwardr   r   rl   rl   r   rm   r/    s(    r/  c                       s   e Zd Zg dddgddgdZ		dd	ed
ededB deddf
 fddZe	de
jfddZe	de
jfddZde
jde
jfddZde
jde
jde
jfddZdeeee
jf  dee fddZ  ZS )Molmo2VisionBackbonewqwkwvk_projv_proj	gate_projup_proj)r   r  r1  Nr   
vit_configadapter_configr   r   r   c              
      s   t    || _|| _g | _|jD ]}|dkr| j| q| j||j  qt| jd }||jk r7||_t||| dd| _	| j	j
| _
|jt|j }t||j|j|j|j|j|| dd| _t|j|j|j|j|| dd| _d S )	Nr   rF   z
.image_vitr   z.image_pooling_2d)r  rv   r   r|   r~   r  r   r   z.image_projector)r  r   r0  r   r   r   )r   r   r=  r>  r   r   ry   r   r   	image_vitr   rv   lenr  r{   r|   r~   r   image_pooling_2dr/  rw   r   r   image_projector)r   r=  r>  r   r   layerlast_layer_neededpool_dimr   rl   rm   r     sH   




zMolmo2VisionBackbone.__init__c                 C      | j jjjS r   )r?  r   weightr  r   rl   rl   rm   r       zMolmo2VisionBackbone.dtypec                 C   rF  r   )r?  r   rG  devicer   rl   rl   rm   rI    rH  zMolmo2VisionBackbone.deviceimagesc           	      C   s   |j \}}}}||| ||}| |}g }| jD ]	}|||  qtj|dd}| jdkr:|ddddf }||||d}|S )zN
        : param images: (batch_size, num_crops, num_patch, n_pixels)
        r   r   r   NrF   )r  r%  r?  r   r   rh   catr   )	r   rJ  BTNDimage_featuresfeaturesrC  rl   rl   rm   encode_image  s   


z!Molmo2VisionBackbone.encode_imager]   c                 C   s  |j d d \}}|j| j| jd}| |}|j d }|dk}t|d}tj|j d tj|jd}	t	|	
|ddd|j d |j d g}	||d||	t|df }
|
|| jd d d d d d d f  }
|
d|j d |g}
| jjr|ddd|j d g}|
d|
j d  d}t|dkd|}|
jddd	|d d d d f |
j }n	d }|
jddd	}| j||
|d
}||d|j d g}| |}|
d|j d |  S )Nr  rI  r  r   r   r  rI  rF   T)keepdim)r   )r  r  rI  r  rR  rh   anyarangelongtiler%  r  clipr>  r   r   sumwheremeanrA  rB  flatten)r   rJ  r]   
batch_size	num_imagerP  r   validvalid_token	batch_idxto_poolr   denomr  pooled_featuresrl   rl   rm   r     sJ   

(
zMolmo2VisionBackbone.forwardweightsc                 C   s   g d}t |  }t }|D ]Y\}}|D ].\}}}	||vrq|||}|dr/||vr/qt|| r5q|| }
|
j}||
||	  n|drN||vrNqt|| rTq|| }
t|
dt}||
| |	| q|S )N))r   r6  q)r   r7  k)r   r8  v)r  r9  r   )r  r:  rF   )r1  r;  r   )r1  r<  rF   .biasweight_loader)
dictnamed_parameterssetreplaceendswithrP   rm  getattrr0   add)r   rh  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrm  rl   rl   rm   load_weightsI  s2   



z!Molmo2VisionBackbone.load_weightsr   )rd   re   rf   packed_modules_mappingrt   r   r,   r   r   r   rh   r  rI  ri   rR  r   r   r   rp  r~  r   rl   rl   r   rm   r4    s>    
1
,6r4  c                       s   e Zd ZdZ			ddedeeef dedB de	dB ded	df fd
dZ
dejdejd	eejejf fddZdejdejded	ejfddZ  ZS )Molmo2AttentionzMolmo2's LLM Attention.Nr   r   rope_parameterscache_configr   r   r   c           
   	      s  t    |j| _t | _|j| _| j| j dksJ | j| j dks%J | j| j | _|j| _	| j	| jkrA| j	| j dks@J n
| j| j	 dksKJ t
d| j	| j | _|j| _| j| j | _| j| j | _|j| _|j| _t| j| j| j| j	|j|d| _d | _d | _d | _d | _|jr|jdkr| jn| j	| j }t | _t||jd| _|jdkr| jn| j| j }t||jd| _|j| _t|}|jd ur||jvr|d }	d|	d}t| j| j|d	| _| jd
 | _ t!| j| j| j | j||| dd| _"t#| j| j | jd|d| _$d S )Nr   rF   r   r   qwen3r   r   default)	rope_typer   )max_positionr  r   r   )r   r  r   r   F)%r   r   rv   r   r   r{   r   r   r|   r   r   r   r~   r   r   r   r   r)   r   qkv_projtp_rankk_normq_normr   r   r   r&   r   rO   r   r-   
rotary_embscalingr$   r   r*   r  )
r   r   r  r  r   r   k_norm_sizeq_norm_sizer   r   r   rl   rm   r   t  s   
	







zMolmo2Attention.__init__ri  rj  c                 C   sr   | j dkrt| }t| }| |}| |}| j dkr5tt| j d}||| j }||| j }||fS )NrF   )num_partitions)r   r   
contiguousr  r  r   r   r  )r   ri  rj  splitterrl   rl   rm   _apply_qk_norm  s   



zMolmo2Attention._apply_qk_norm	positionsr   kwargsc                 K   s0  |  |\}}|j| j| j| jgdd\}}}| jd ur/| jd ur/| jdkr/| ||\}}nP| jd ur| jd ur|jg |j	d d |j	d | j
 | j
R  }	| |	}	|	|j	}|jg |j	d d |j	d | j
 | j
R  }
| |
}
|
|j	}| |||\}}| |||}| |\}}|S )Nr   r   r   )r  r   r   r   r  r  r   r  r%  r  r~   r  r   r  )r   r  r   r  r   r   ri  rj  rk  	q_by_head	k_by_headattn_outputr   rl   rl   rm   r     s:    




zMolmo2Attention.forwardNNr   )rd   re   rf   rg   r   rn  r   r   r   r,   r   rh   ri   r   r  objectr   r   rl   rl   r   rm   r  q  sD    
[
r  c                       sT   e Zd ZdZ	ddededededB ddf
 fdd	Zd
ej	dej	fddZ
  ZS )LanguageModelMLPzMolmo2's LLM mlp.Nr  rw   r   r   r   c                    sL   t    t||gd d|d| _|dksJ t | _t||d|d| _d S )Nr  Fr  r   )r   r   r(   up_gate_projr!   r2  r*   r3  )r   r  rw   r   r   r   rl   rm   r     s   
zLanguageModelMLP.__init__r   c                 C   s*   |  |\}}| |}| |\}}|S r   )r  r2  r3  )r   r   up_gater   rl   rl   rm   r      s   
zLanguageModelMLP.forwardr   r   rl   rl   r   rm   r    s&    r  c                       s   e Zd Z			ddedeeef dedB dedB deddf fd	d
Z	de
jde
jde
jdB dedee
jee
je
jf dB f f
ddZ  ZS )Molmo2DecoderLayerNr   r   r  r  r   r   r   c                    sb   t    t||||| dd| _t|j|j|j|| _t	|j|j
d| _t	|j|j
d| _d S )Nz
.self_attnr   r   )r   r   r  	self_attnr  rv   rw   r   mlpr&   r   input_layernormpost_attention_layernorm)r   r   r  r  r   r   r   rl   rm   r   +  s&   
	zMolmo2DecoderLayer.__init__r  r   residualr  c                 K   s`   |d u r|}|  |}n|  ||\}}| jd||d|}| ||\}}| |}||fS N)r  r   rl   )r  r  r  r  r   r  r   r  r  rl   rl   rm   r   L  s   
zMolmo2DecoderLayer.forwardr  )rd   re   rf   r   rn  r   r   r   r,   r   rh   ri   r  r   r   r   rl   rl   r   rm   r  *  s8    
!r  c                   @   sN   e Zd ZdejdejdejdB dedeejeejejf dB f f
ddZdS )	Molmo2DecoderNormAfterLayerr  r   r  Nr  r   c                 K   sX   |}| j d||d|}| |}|| }|}| |}| |}|| }d }||fS r  )r  r  r  r  r  rl   rl   rm   r   e  s   


z#Molmo2DecoderNormAfterLayer.forward)rd   re   rf   rh   ri   r  r   r   rl   rl   rl   rm   r  d  s    r  c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB de
dejfddZdeeeejf  dee fddZ  ZS )Molmo2TextModelr   r   vllm_configr   c                   s  t    |jj}|j |j|| _t|dr|jn|j	i }t
tD ]}t|j||j< q$tdi |j| _|  jjpBd7  _t| jjd| _jrTtnttj fdd| dd\| _| _| _tjjd| _td	d
gj| _ d S )Ntext_configr   r   c                    s   j  | dS )N)r  r   r   )r  r   r  decoder_layerhf_text_configr   r  rl   rm   <lambda>  s    z*Molmo2TextModel.__init__.<locals>.<lambda>z.layersr   r   r   r  rl   )!r   r   model_config	hf_configr  r   r   hasattrr  
llm_configr   r   rs  rx  r   embedding_sizer   r/   rv   embed_tokensr   r  r  rR   ry   start_layer	end_layerlayersr&   r   normrQ   make_empty_intermediate_tensors)r   r  r   r   r  fieldr   r  rm   r     sB   


zMolmo2TextModel.__init__	input_idsr   c                 C   s
   |  |S r   )r  )r   r  rl   rl   rm   embed_input_ids     
zMolmo2TextModel.embed_input_idsNr  intermediate_tensorsinputs_embedsr  c           
      K   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]}||||fi |\}}q*t  jsDt||dS |d urR| 	||\}}	|S | 	|}|S )Nr   r  )r   r  )
r   is_first_rankr  r	   r  r  r  is_last_rankrA   r  )
r   r  r  r  r  r  r   r  rC  r   rl   rl   rm   r     s2   

zMolmo2TextModel.forwardrh  c                 C   sl   t |  }t }|D ](\}}|dr||vrqt|| rq|| }t|dt}||| || q|S )Nrl  rm  )rn  ro  rp  rr  rP   rs  r0   rt  )r   rh  rv  rw  rx  ry  r}  rm  rl   rl   rm   r~    s   

zMolmo2TextModel.load_weightsNN)rd   re   rf   r   r   r   rh   ri   r  rA   r  r   r   r   rp  r~  r   rl   rl   r   rm   r    s&    3
,%r  image_himage_w
patch_sizepool_hpool_wr   c                 C   s\   | | }|| }t || d || }t || d || }|| | }	|| | }
|	|
fS NrF   rB   )r  r  r  r  r  patch_hpatch_wh_padw_padnrowsncolsrl   rl   rm   get_patches_grid_size  s   r  max_numc                    s,    fddt d d D }t|dd dS )Nc                    s4   g | ]}t d  d  D ]}||  kr||fqqS )rF   )r   )r   ijr  rl   rm   r      s    z)get_candidate_tilings.<locals>.<listcomp>rF   c                 S   s   | d | d  | d fS )Nr   rF   rl   )r   rl   rl   rm   r    s    z'get_candidate_tilings.<locals>.<lambda>)r  )r   sorted)r  tilingsrl   r  rm   get_candidate_tilings  s   
r  heightwidthmax_num_patchesc                 C   s   t |}tj|tjd}|| }tj| |gtjd}|tj| }|jddd}	|	dk  r7|	 }
||
 S t	|	dk d|	
 }
||
 S )Nr  r   T)axiskeepdimsrF   g      ?g    _B)r  rX   arrayint32float32astypeminallargmaxr]  argmin)r  r  r  r  r  candidate_tilingscandidate_resolutionsoriginal_sizerequired_scale_drequired_scaleixrl   rl   rm   select_tiling	  s   r  imagec                 C   sh   t | tr
t| j S t | tjtjfr+| jdksJ | j	\}}}|dv s&J t||S t
dt|  )Nr   )rF   r   zUnknown image type: )
isinstancer   r8   r  rX   ndarrayrh   ri   ndimr  
ValueErrortype)r  r   r   crl   rl   rm   get_image_size   s   


r  rJ  c                 C   sT   | d u rd S | d urt | ttfrdd | D } | S | d ur(t | tr(t| } | S )Nc                 S   s"   g | ]}t |trt|n|qS rl   )r  r   exif_tranpose)r   imgrl   rl   rm   r   2  s    z!exif_tranpose.<locals>.<listcomp>)r  r   r   r   r   exif_transpose)rJ  rl   rl   rm   r  ,  s   
r  image_gridsimage_patch_idlow_res_image_start_idimage_start_idimage_col_idimage_end_idc                 C   s  | j }| jd }| d d df }| d d df }	| d d df }
| d d df }||	 |
|d   d }t|  }tj|tj|d}d}t|D ]}| | 	 \}}}}t||  }|| }|}|||< |d7 }|dkr}||||| < ||7 }|||< |d7 }|||< |d7 }|d }|dkr|dkrtj|tj|d}|dkr||d |< |||< |
|}||||||  < ||| 7 }|||< |d7 }|| |ksJ ||7 }qK||fS )Nr   rF   r  r   r   rT  )rI  r  r   r\  itemrh   emptyrY  r   tolistrepeat)r  r  r  r  r  r  rI  rL  	resized_h	resized_wr   r   lengths	total_lenflatoffsetr  resized_h_iresized_w_ih_iw_iL_inum_low_res_patchesidx	block_lenlineblockrl   rl   rm   build_flat_image_bool_length:  sL   


r  video_gridsframe_start_idframe_end_idc                 C   s$  | j }| jd }| d d df }| d d df }| d d df }|| }	|	d }
||
 }t|  }tj|tj|d}d}t|D ]J}t||  }t|	|  }t||  }tj|d tj|d}||d< |dkrx||dd| < ||d< |	|}||||| < ||7 }qC||fS )Nr   rF   r  rT  r   )
rI  r  r   r\  r  rh   r  rY  r   r   )r  r  r  r  rI  rL  tr  r  Pr  r  r  r  r  r  tiPiLir  seqrl   rl   rm   build_flat_video_bool_lengthz  s0   


r  c                       sZ  e Zd ZdZdedef fddZedee	e
f fddZede
fd	d
Zede
fddZede
fddZede
fddZede
fddZedee
e
f fddZede
fddZedee
e
f fddZede	fddZede
fddZede
fddZede
fdd Zede
fd!d"Zede
fd#d$Zede
fd%d&Zede
fd'd(Zede
fd)d*Zede
fd+d,Zede
fd-d.Zede e
 fd/d0Z!d1e
d2e
dee
e
f fd3d4Z"d5e#dee
e
f fd6d7Z$d1e
d2e
dee
e
f fd8d9Z%	:	:	:	:dBd;e&e e& B d:B d<e'd:B d=e(d:B d>e	e)B d?e*de+fd@dAZ,  Z-S )CMolmo2ProcessorWrapperzK
    Wraps :class:`Molmo2Processor` so that it can be called directly.
    	processorr  c                    s   t    || _|| _d S r   )r   r   r  r  )r   r  r  r   rl   rm   r     s   

zMolmo2ProcessorWrapper.__init__r   c                 C   s
   | j jjS r   )r  	tokenizervocabr   rl   rl   rm   r       
zMolmo2ProcessorWrapper.vocabc                 C   s    | j j}|j}t|tsJ |S r   )r  image_processor	max_cropsr  r   )r   r!  r"  rl   rl   rm   r"    s   z Molmo2ProcessorWrapper.max_cropsc                 C   $   | j j}|jd }t|tsJ |S Nr   r  r!  pooling_sizer  r   )r   r!  image_pooling_hrl   rl   rm   r'       
z&Molmo2ProcessorWrapper.image_pooling_hc                 C   r#  r  r%  )r   r!  image_pooling_wrl   rl   rm   r)    r(  z&Molmo2ProcessorWrapper.image_pooling_wc                 C   r#  r$  r  video_processorr&  r  r   )r   r+  video_pooling_hrl   rl   rm   r,    r(  z&Molmo2ProcessorWrapper.video_pooling_hc                 C   r#  r  r*  )r   r+  video_pooling_wrl   rl   rm   r-    r(  z&Molmo2ProcessorWrapper.video_pooling_wc                 C   s<   t | jdd d ur| jj}n| jj}|jd |jd f}|S )Nr!  r  r  )rs  r  r!  r+  r  )r   r  base_image_input_sizerl   rl   rm   r.    s
   
z,Molmo2ProcessorWrapper.base_image_input_sizec                 C   s<   t | jdd d ur| jj}n| jj}|j}t|tsJ |S )Nr!  )rs  r  r!  r+  r  r  r   )r   r  r   rl   rl   rm   r     s   
z'Molmo2ProcessorWrapper.image_patch_sizec                 C   s6   | j j}|j\}}t|tsJ t|tsJ ||fS r   )r  r!  overlap_marginsr  r   )r   r!  left_marginright_marginrl   rl   rm   r/    s
   
z&Molmo2ProcessorWrapper.overlap_marginsc                 C   s   | j jjp	| j jjS r   )r  r  	bos_token	eos_tokenr   rl   rl   rm   r2    s   z Molmo2ProcessorWrapper.bos_tokenc                 C      | j jS r   )r  r  r   rl   rl   rm   r       z%Molmo2ProcessorWrapper.image_patch_idc                 C   r4  r   )r  r  r   rl   rl   rm   	im_col_id  r5  z Molmo2ProcessorWrapper.im_col_idc                 C   r4  r   )r  image_start_token_idr   rl   rl   rm   im_start_id  r5  z"Molmo2ProcessorWrapper.im_start_idc                 C   r4  r   )r  image_end_token_idr   rl   rl   rm   	im_end_id  r5  z Molmo2ProcessorWrapper.im_end_idc                 C   r4  r   )r  low_res_image_start_token_idr   rl   rl   rm   low_res_im_start_id  r5  z*Molmo2ProcessorWrapper.low_res_im_start_idc                 C   r4  r   )r  frame_start_token_idr   rl   rl   rm   r    r5  z%Molmo2ProcessorWrapper.frame_start_idc                 C   r4  r   )r  frame_end_token_idr   rl   rl   rm   r    r5  z#Molmo2ProcessorWrapper.frame_end_idc                 C   r4  r   )r  image_low_res_idr   rl   rl   rm   im_low_res_id  r5  z$Molmo2ProcessorWrapper.im_low_res_idc                 C   
   | j t S r   )r  IMAGE_PROMPTr   rl   rl   rm   image_placeholder_id#  r   z+Molmo2ProcessorWrapper.image_placeholder_idc                 C   rA  r   )r  VIDEO_PROMPTr   rl   rl   rm   video_placeholder_id'  r   z+Molmo2ProcessorWrapper.video_placeholder_idc                 C   s$   | j | j| j| j| j| j| j| jgS r   )r  r6  r8  r<  r  r:  r  r@  r   rl   rl   rm   image_token_ids+  s   z&Molmo2ProcessorWrapper.image_token_idsimage_heightimage_widthc                C   sl   | j }| j\}}| j}| j}|||  }|d | }	|	||  }
|
| }t|| || ||d\}}||fS )Nr   )r  r  r  r  )r"  r/  r.  r   r  )r   rG  rH  r"  r0  r1  r.  base_image_input_dtotal_margin_pixelscrop_patchescrop_window_patchescrop_window_sizetiling_htiling_wrl   rl   rm   r  8  s   

z$Molmo2ProcessorWrapper.select_tilingis_videoc                 C   s>   | j }t|d |d | j|r| jn| j|r| jdS | jdS )Nr   rF   r  r  r  r  r  )r.  r  r   r,  r'  r-  r)  )r   rP  r.  rl   rl   rm   get_base_grid_sizeP  s   z)Molmo2ProcessorWrapper.get_base_grid_sizec                C   s   | j \}}| j}| j}|||  }|d | }|||  }	|	| }
| j||d\}}||
 | ||
 | g\}}t|||| j| jd\}}||fS )Nr   rG  rH  rQ  )r/  r.  r   r  r  r'  r)  )r   rG  rH  r0  r1  r.  rI  rJ  rK  rL  rM  rN  rO  r   r   r  r  rl   rl   rm   r  [  s,   




z,Molmo2ProcessorWrapper.get_patches_grid_sizeNtextrJ  videosreturn_tensorsr  c                 K   s  |g}t |}t| jdd d ur|| t| jdd d ur#|| | j|d|i|}|d d | j| j krG|d d d dd f |d< |d u rMg }t|tsU|g}|d u r[g }t|tsc|g}t|dv smJ d|	d	}|	d
d }	t|dkrg }
|D ]}t
|}| j|j|jd}|
t|d  qt|
t|d ksJ t|
|d   ksJ |	d}|d d d df jdd|d d dd f jdd }||d< |d jd }|d | |d< t|| j| j| j| j| j\}}||d< ||d< t|dkrW|	d}|d d df  t|d ks"J |d d df |d< |jdd|d< |d jd }|d | |d< t|| j| j| j\}}||d< ||d< t|S )Nr!  r+  rV  r  r   r   rF   >   r   rF   z)At most one video is supported for Molmo2attention_masktoken_type_idsr   rS  rZ   image_num_cropsr  r  r   image_num_pooled_patchesimage_num_patchesra   rb   r  rp   video_num_cropsvideo_num_pooled_patchesvideo_num_patchesrr   rs   )r  rs  r  r   r  r2  r  r   r@  popr  r  r  r  rX   prodr\  r  r  r  r  r<  r8  r6  r:  r  r  r  r   )r   rT  rJ  rU  rV  r  r   outputs_attention_mask_token_type_ids	num_cropsr  
image_sizetilingr  r[  	n_patchesra   rb   r  rr   rs   rl   rl   rm   __call__}  s   






&zMolmo2ProcessorWrapper.__call__)NNNN).rd   re   rf   rg   r   r   r   r   rn  r   r   r  r"  r'  r)  r,  r-  r   r.  r   r/  r2  r  r6  r8  r:  r<  r  r  r@  rC  rE  r   rF  r  r   rR  r  r   r   r   r   r  r   ri  r   rl   rl   r   rm   r    s    
	



$r  	video_fpssampling_fpsmax_fpsc                 C   s   t | } t |}t |}|du rtd| dks|dkr'td|  d| d| | dkr8td| d|  d	g }t|| d
 |D ]}||krK |S | | dkrX|t| qB|S )aE  
    Return the subset of `video_fps` factors that remain multiples
    of `sampling_fps`.

    Examples:
        >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
        [2, 6]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
        [1, 5]
        >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
        [2]
        >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
        Traceback (most recent call last):
            ...
        ValueError: sampling_fps=2 must divide video_fps=5 to produce
            consistent frame steps.
    Nzsampling_fps must be providedr   z1video_fps and sampling_fps must be positive (got z, )zsampling_fps=z must divide video_fps=.rF   )r   r  r   r   r   )rj  rk  rl  
candidates	candidaterl   rl   rm   get_candidate_target_fps  s4   rq  
max_framestotal_framesframe_sample_modecandidate_target_fpsc           
      C   s   d}d}|D ]8}t t| | d}t|| }	|dkr+d|v r&|	|kr& |S |}|	}q||	ks1J |	|kr6q|	|kr>|}|	}q|S )zV
    Get the target fps that best spans the video and has the most frames sampled
    r   NrF   uniform)r   r   )
rj  rr  rs  rt  ru  num_frames_sampledselected_target_fps
target_fps	step_sizenum_frames_sampled_at_fpsrl   rl   rm   get_target_fps  s(   
r|  c                 C   s^   | d u rt jd||dtd}ntt||  d}t d||}t||kr+|d | }| |fS )Nr   F)endpointr  rF   )rX   linspacer   r   rX  r@  )rx  rs  rr  rj  frame_indicesrz  rl   rl   rm   get_frame_times_and_chosen_fps-  s   
r  c                   @   s  e Zd Zdd ZdedefddZdeee	dB f fdd	Z
dd
de	de	dedB de	fddZdd
de	dedB de	fddZdefddZde	de	fddZde	deee	f de	fddZde	dededede	d e	d!e	dejfd"d#Z	d(d$eeef d%edB dee fd&d'ZdS ))Molmo2ProcessingInfoc                 C   s   t d|  dS )NT)video_needs_metadataexpected_hidden_size)r:   _get_expected_hidden_sizer   rl   rl   rm   get_data_parser=  s   z$Molmo2ProcessingInfo.get_data_parserr  r   c                 K   s&   | j jdi |}| j  }t||S )Nrl   )ctxget_hf_processorget_hf_configr  )r   r  r  r  rl   rl   rm   r  C  s   

z%Molmo2ProcessingInfo.get_hf_processorNc                 C   s
   d ddS )NrF   r  videorl   r   rl   rl   rm   get_supported_mm_limitsH  r  z,Molmo2ProcessingInfo.get_supported_mm_limits)r  rG  rH  r  c                C   s   |d u r|   }|j}|jdd\}}|jd ur|j}n|j}d||t|   }|j||d\}	}
d|	|
t|j   }|| S )NFrP  r  rS  )r  r  rR  use_single_crop_col_tokensimage_use_col_tokensr   r  )r   rG  rH  r  hf_processorresize_nrowsresize_colsuse_col_tokensextraoverlap_nrowsoverlap_ncolsjointrl   rl   rm   get_num_image_tokensK  s    

z)Molmo2ProcessingInfo.get_num_image_tokens
num_framesc                C   s@   |d u r|   }|jdd\}}d||t|jj   }|| S )NTr  r  )r  rR  r   r  video_use_col_tokens)r   r  r  r  r  r  rl   rl   rm   get_num_video_tokensh  s   z)Molmo2ProcessingInfo.get_num_video_tokensc                 C   s   |   }|j\}}|j}|j}|||  }|d | }|||  }|| }	t|j}
d\}}|
D ]$\}}||	 | }||	 | }| j|||d}||krT|}t||d}q0|dks]|d u ratd|S )Nr   )r   N)rG  rH  r  )r  r  z(Cannot have a largest feature size of 0!)	r  r/  r.  r   r  r"  r  r8   r  )r   r  r0  r1  r.  rI  rJ  rK  rL  rM  r  largest_feature_sizelargest_feature_pinpointhrwrr  r  	feat_sizerl   rl   rm   !get_image_size_with_most_featuresx  s.   

z6Molmo2ProcessingInfo.get_image_size_with_most_features
max_tokensc                 C   s   | j dd}|| }t|dS )NrF   )r  )r  r   )r   r  num_tokens_per_framerr  rl   rl   rm   _get_max_video_frames  s   
z*Molmo2ProcessingInfo._get_max_video_framesseq_len	mm_countsc                 C   sF   |   jj}|j}|dd}| |}t|t|d |}t|dS )Nr  r   rF   )r  r  r+  r  getr  r  r   )r   r  r  r+  r  
max_videosmax_total_framesmax_frames_per_videorl   rl   rm   !get_num_frames_with_most_features  s   

z6Molmo2ProcessingInfo.get_num_frames_with_most_featurestotal_num_framesrj  durationrt  rl  rk  c                 C   sN  |dkrp|d urp|dkrt |t}|S ||d | kr1t jd|d t||ddt}|S t jd|d t|| d}	t |	d	 |d krVt j|	|d ggdd
}	t |	t}|d	 |k sfJ t	|	|ksnJ |S |dkrt jd|d t||ddt}|S |dkrt
||}
t|||||
}t||||\}}|S t|)Nuniform_last_framer  rF   r   T)numr}  g        )stopstepr   )r  fps)rX   rX  r  r   r~  r  r   roundconcatenater@  rq  r|  r  NotImplementedError)r   r  rj  r  rt  r  rl  rk  indicesfloat_indicesru  rx  r   rl   rl   rm   _sample_frames  sl   
/,

	z#Molmo2ProcessingInfo._sample_framesmetadatado_sample_framesc              	      s   |   jj}|d  |d}|d u r|dd}|r;|d }|  }|j}|j}|j}	|j}
| | ||||	|
}n|d usAJ  fdd|D }|S )Nr  frames_indicesr  Fr  c                    s   g | ]}|  qS rl   rl   )r   	frame_idxrj  rl   rm   r     s    z>Molmo2ProcessingInfo._get_video_second_idx.<locals>.<listcomp>)	r  r  r+  r  rt  r  rl  rk  r  )r   r  r  r+  r  r  r  rt  r  rl  rk  
timestampsrl   r  rm   _get_video_second_idx  s0   
z*Molmo2ProcessingInfo._get_video_second_idxr   )rd   re   rf   r  r  r  r  r   r   r   r  r  r  r8   r  r  r  r   rX   r  r  rn  r   r   r   r  rl   rl   rl   rm   r  <  sp    
!


	
@
r  c                   @   sz   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z	d
edededede
e f
ddZdS )Molmo2DummyInputsBuilderr  r   c                    sV   | dd}| dd}t t}|dkr }nd fddt|D }|||  S )Nr  r   r  rF   r   c                    s   g | ]}d |d    qS )zImage rF   rl   )r   r  image_placeholder_tokenrl   rm   r         z;Molmo2DummyInputsBuilder.get_dummy_text.<locals>.<listcomp>)r  rB  rD  joinr   )r   r  
num_images
num_videosvideo_placeholder_tokenimage_stringrl   r  rm   get_dummy_text  s   z'Molmo2DummyInputsBuilder.get_dummy_textNr  
mm_optionsc                 C   s  | dd}| dd}g }g }|dkr-| j \}}	|r"| dnd }
| j||	||
d}|dkr~| j }|j}| j||}|rG| dnd }|rqt|tsRJ |j	}|rq||krbt
d|| |dk rlt
d| t||}| j|d |d ||d	}||d
S )Nr  r   r  )r  r  r  	overridesz]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredr  zEvideo.num_frames override (%d) cannot be less than 2, will be ignoredrF   )r  r  r  r  r  )r  infor  _get_dummy_imagesr  r.  r  r  r   r  loggerwarningr  _get_dummy_videos)r   r  r  r  r  r  dummy_imagesdummy_videostarget_widthtarget_heightimage_overridesr  r.  target_num_framesvideo_overridesnum_frames_overriderl   rl   rm   get_dummy_mm_data  sZ   

z*Molmo2DummyInputsBuilder.get_dummy_mm_datar  r  r  r  c          
   
   C   sh   t j|||dfdt jd}g }t|D ]}d|d |tt|dd||d}| |f}	||	 q|S )Nr      r  g       @decordF)r  r  r  r  video_backendr  r  r  )rX   fulluint8r   r   copyr   )
r   r  r  r  r  r  video_itemsr  video_metadata
video_itemrl   rl   rm   r  ]  s   

z*Molmo2DummyInputsBuilder._get_dummy_videosr   )rd   re   rf   r   r   r   r  r   r3   r  r   r6   r  rl   rl   rl   rm   r    s.    

@r  c                
       s   e Zd Zdee dee fddZdedeeef deeef deeef de	f
 fd	d
Z
de	deeef deeef fddZdedeeef dedee fddZ  ZS )Molmo2MultiModalProcessorprompt_tokensr   c                 C   sD   | j  }|jj}|jp|j}t|dkr |d |kr |g| }|S r$  )r  r  r  r  bos_token_ideos_token_idr@  )r   r  r  r  r  rl   rl   rm   _apply_hf_processor_tokens_onlyx  s   

z9Molmo2MultiModalProcessor._apply_hf_processor_tokens_onlypromptmm_data	mm_kwargs
tok_kwargsc              
      s*  t |}| jjdi |}|dg  }rg }g }g }	g }
g }g }g }|D ]}|\} t di |}d|vr> dd|d< tdi  fdd D  t  }|gg|d<  gg|d< t jt|||d}|d}|j	j
|d	 }|t|d
}||d  ||d  |	|d  |
|d  ||d  ||d  ||d  q%t t|t|t|	t|
t|t|t|d}nt  }t j||||d}|j|j }|d }| d	kr	|d |kr	tj|gg|j|jd}tj||gd
d|d< t |fi |}t|S )NrU  r  Fc                    s   i | ]}|d kr| | qS )r  rl   )r   rj  r  rl   rm   
<dictcomp>  r  z@Molmo2MultiModalProcessor._call_hf_processor.<locals>.<dictcomp>r  )r  r  r  r  r  r   rF   rp   video_token_poolingr]  r^  r_  rr   rs   )rp   r  r]  r^  r_  rr   rs   rW  rS  r   rl   )rn  r  r  r`  r  r   r   _call_hf_processorrD  r  r  batch_decoderq  r   rh   rK  r  r2  numeltensorrI  r  concatr   )r   r  r  r  r  r  rU  pixel_values_videos_lstvideo_token_pooling_lstvideo_num_crops_lstvideo_num_pooled_patches_lstvideo_num_patches_lstvideo_tokens_lstnum_video_tokens_lstr  video_arrayvideo_mm_kwargsvideo_mm_datavideo_outputsr  video_stringprocessed_outputsr  bos_token_id_tensorcombined_outputsr   r  rm   r    s   	


z,Molmo2MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc           	      C   s   | dtd}| dtd}| dtd}| dtd}| dtd}| dtd}ttd|td|tdtdtdtd|tdtd	|td	|td	td	td	td	|td	d
S )NrZ  r   r[  r]  r^  rb   rs   r  r  )rZ   image_token_poolingrZ  r[  r\  ra   rb   rp   r  r]  r^  r_  rr   rs   )r  rh   r  rn  r4   flat_from_sizesbatched)	r   r  r  rZ  r[  r]  r^  rb   rs   rl   rl   rm   _get_mm_fields_config  sL   

z/Molmo2MultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   j jdi  jjjjjjjj
jj	jj
jj	dtdtt f
f	dd}dtdtt f 	f
dd}dd td	d
gjjg||gD S )Nitem_idxr   c                    s    dt}|| }t|}jdd\}}d ur}n }r&j}n}g| gt|  }|g||  g }t|}	j|	j	|	j
d\}
}g| gt   }g||
  g }|| }t|jS )Nr  Fr  rS  )	get_itemsr7   r  r  rR  r<  r   r  r  r  r  r?   select_token_idsrF  )r
  rJ  r  r  r  r  start_id	extra_rowextra_jointrf  r  r  	joint_rowr  img_token_ids)	r  
img_col_id
img_end_idimg_patch_idimg_start_idr  r  r  use_single_crop_start_tokenrl   rm   get_image_replacement_molmo2,	  s8   

zSMolmo2MultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_molmo2c                    s   d |  \}}  d}j||}jdd\}}r%j}j}n}}g }	t|D ]9\}
}|
dkr9dnd}||dd }|	jjj	|d	d
7 }	g| gt
	  }|g||  |g }|	|7 }	q/t|	jS )Nr  r  Tr  r    r   z.1fF)add_special_tokens)r  r  r  rR  r  r  	enumerater  r  encoder   r?   r  rF  )r
  r  r  r  r  r  r  r  end_idr  r  
frame_time
prev_spaceframe_prefixr  r  )
r  r  r  r  r  r  r  r   use_frame_special_tokensr  rl   rm   get_video_replacement_molmo2Q	  s6   


zSMolmo2MultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_molmo2c                 S   s"   g | ]\}}}t ||g|d qS ))modalitytargetreplacement)r=   )r   r"  r#  replacement_fnrl   rl   rm   r   w	  s    zAMolmo2MultiModalProcessor._get_prompt_updates.<locals>.<listcomp>r  r  rl   )r  r  r  r6  r8  r:  r  r  r  r  r  r   r   r   ziprC  rE  )r   r  r  r	  r  r!  rl   )r  r  r  r  r  r  r  r  r   r   r  r  r  rm   _get_prompt_updates	  s&   *,%&
z-Molmo2MultiModalProcessor._get_prompt_updates)rd   re   rf   r   r   r  r   r   r  r   r  r4   r  r9   r5   r   r>   r'  r   rl   rl   r   rm   r  w  sB    



h


-
r  )r  dummy_inputsc                       s  e Zd Zeddddddddd	d
dddddddddddZdgdgg dddgddgdZededededB fd d!Z	d"d#d$e
d%ef fd&d'Zed(d) Zd*ededB fd+d,Zd*ededB fd-d.Zd*edefd/d0Zd1edeejd2f fd3d4Zd5edeejd2f fd6d7Zd*ededB fd8d9Z	dOdd:d;d<ejd=edB d>ejdB d?edejf
d@dAZ		dPd<ejdBejdCedB dDejdB d*edejfdEdFZ dGejdejfdHdIZ!dJe"eeejf  fdKdLZ#de$fdMdNZ%  Z&S )QMolmo2ForConditionalGenerationzimage_pooling_2d.q_projzimage_pooling_2d.k_projzimage_pooling_2d.v_projzimage_pooling_2d.o_projzimage_projector.gate_projzimage_projector.up_projzimage_projector.down_projr  r  r  r  r  r3  r  r  )zimage_pooling_2d.wqzimage_pooling_2d.wkzimage_pooling_2d.wvzimage_pooling_2d.wozimage_projector.w1zimage_projector.w3zimage_projector.w2att_projattn_outr  r  ff_projff_out	attn_normff_normzvision_backbone.zmodel.layers.zmodel.norm.)zmodel.vision_backbone.zmodel.transformer.blocks.zmodel.transformer.ln_f.)orig_to_new_substrorig_to_new_prefixr5  r9  r:  r;  r<  )r  r  r   r  r1  r"  r  r   Nc                 C   s$   | drtS | drtS td)Nr  r  z)Only image or video modality is supported)
startswithrB  rD  r  )clsr"  r  rl   rl   rm   get_placeholder_str	  s
   

z2Molmo2ForConditionalGeneration.get_placeholder_strr   r   r  r   c                   sh  t    |jj}|j}|jj}|| _|| _i }ttD ]}t	|j
|j||j< qtd	i |}i }ttD ]}t	|j|j||j< q6td	i |}	| |ddh t||	|t|dd| _W d    n1 siw   Y  | | t|t|dd| _W d    n1 sw   Y  |j| _t|dr|j}
n|j}
t|
j|
j|d| _t|
j| _| jj | _ d S )
Nr  r  vision_backboner   model)r  r   r  r  rl   )!r   r   r  r  r   multimodal_configr   r   rt   rs  r=  rx  r   r>  _mark_tower_modelr4  rS   r5  _mark_language_modelr  r6  r  r  r  r  r  r.   r   rv   lm_headr+   logits_processorr  )r   r  r   r   r   r7  r  r  r=  r>  r  r   rl   rm   r   	  sP   



z'Molmo2ForConditionalGeneration.__init__c                 C   s   t |  jS r   )next
parametersr  r   rl   rl   rm   r  	  s   z$Molmo2ForConditionalGeneration.dtyper  c                 K      | dd }|d u rd S | dd }| dd }| dd }| dd }| dd }dg|jddd d	   }d}	| }
t|D ]'\}}||	|	|  }t|| }t|dk|| ||
|	|	| < |	|7 }	qCt||
|||d
S )NrZ   r  r[  r\  ra   rb   r   r   r   )rZ   r]   r_   ra   rb   )	r`  cumsumr  cloner  r   rh   r]  rV   )r   r  rZ   r]   r_   num_patchesra   rb   accum_patchespatch_offsetnew_token_poolingr  n	cur_sliceindex_offsetrl   rl   rm   _parse_and_validate_image_input	  6   
z>Molmo2ForConditionalGeneration._parse_and_validate_image_inputc                 K   r>  )Nrp   r  r^  r_  rr   rs   r   r   r   )rp   r]   r_   rr   rs   )	r`  r?  r  r@  r  r   rh   r]  ro   )r   r  rp   r]   r_   rA  rr   rs   rB  rC  rD  r  rE  rF  rG  rl   rl   rm   _parse_and_validate_video_input
  rI  z>Molmo2ForConditionalGeneration._parse_and_validate_video_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dv r*d|vr*| jdi ||d< q|S )N)rZ   rJ  )rp   rU  rl   )rH  rJ  )r   r  
modalities	input_keyrl   rl   rm   %_parse_and_validate_multimodal_inputs7
  s   zDMolmo2ForConditionalGeneration._parse_and_validate_multimodal_inputsimage_input.c                 C      |d }|d }|d }|d }|d }| j |d|dd}t|| ks+J |j| dd}|j| dd}	g }
t||	D ]\}}|  |}|| j	k}|||< |

| qDt|
S )	NrZ   r]   r_   ra   rb   r   rJ  r]   r   r5  r  r@  r\  r   r  r&  get_language_modelr  r  r   r   )r   rN  rZ   r]   r_   ra   rb   image_features_flatimage_features_listimage_tokens_listr*  image_features_iimage_tokens_iout_featuresis_image_patchrl   rl   rm   _process_image_inputA
  .   
z3Molmo2ForConditionalGeneration._process_image_inputvideo_inputc                 C   rO  )	Nrp   r]   r_   rr   rs   r   rP  r   rQ  )r   r\  rp   r]   r_   rr   rs   rS  rT  video_tokens_listr*  rV  video_tokens_irX  rY  rl   rl   rm   _process_video_input_
  r[  z3Molmo2ForConditionalGeneration._process_video_inputc           	      K   sn   | j di |}|sg S d}|D ]$}|dkr#|d }| |}||7 }|dkr4|d }| |}||7 }q|S )Nrl   rJ  rU  )rM  rZ  r_  )	r   r  rK  multimodal_embeddingsr"  rN  image_embeddingsr\  video_embeddingsrl   rl   rm   embed_multimodal}
  s   

z/Molmo2ForConditionalGeneration.embed_multimodalFis_multimodalhandle_oov_mm_tokenr  r`  re  rf  c                C   sR   | j ||  j||d}|d u st|dkr|S |d u r tdt|||d}|S )Nrd  r   z`embed_input_ids` now requires `is_multimodal` arg, please update your model runner according to https://github.com/vllm-project/vllm/pull/16229.)r  r`  re  )_embed_text_input_idsrR  r  r@  r  rN   )r   r  r`  re  rf  r  rl   rl   rm   r  
  s$   z.Molmo2ForConditionalGeneration.embed_input_idsr  r  r  c                 K   s*   |d urd }| j |||fd|i|}|S )Nr  )r6  )r   r  r  r  r  r  r   rl   rl   rm   r   
  s   z&Molmo2ForConditionalGeneration.forwardr   c                 C   s   |  | j|}|S r   )r;  r:  )r   r   logitsrl   rl   rm   compute_logits
  s   z-Molmo2ForConditionalGeneration.compute_logitsrh  c                 C   s    t | }t|}|j|| jdS )N)mapper)rL   "_get_weights_with_merged_embeddingr~  hf_to_vllm_mapper)r   rh  loaderrl   rl   rm   r~  
  s   z+Molmo2ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r6  zvision_backbone.image_projectorr5  )language_model	connectortower_model)r1   from_string_fieldr   rl   rl   rm   get_mm_mapping
  s
   z-Molmo2ForConditionalGeneration.get_mm_mappingr   r  )'rd   re   rf   rM   rl  r  classmethodr   r   r4  r   r   r   r  r  rV   rH  ro   rJ  rn  rM  r   rh   ri   rZ  r_  rG   rc  r   r  
LongTensorrA   r   ri  r   r~  r1   rr  r   rl   rl   r   rm   r)  	  s    2

#
#



$
r)  rh  c                 c   s    i }| D ]\}}d|v r||d< qd|v r||d< q||fV  qd|vs)d|vr-t dtj|d |d gdd}d|fV  d S )	Nzwte.embedding	embeddingzwte.new_embeddingnew_embeddingzYCheckpoint is missing 'wte.embedding' or 'wte.new_embedding' weights required for Molmo2.r   r   zmodel.embed_tokens.weight)r  rh   rK  )rh  embedding_weightsrx  rG  rl   rl   rm   rk  
  s"   

rk  )r	  collections.abcr   r   r   dataclassesr   r   	functoolsr   r   	itertoolsr	   typingr
   r   numpyrX   rh   torch.nnr   torch.nn.functional
functionalr  PILr   	PIL.Imager   transformersr   r   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   transformers.video_utilsr   r   vllm.compilation.decoratorsr   vllm.configr   r   vllm.config.multimodalr   r   vllm.distributedr   r   r   r   r   vllm.loggerr    %vllm.model_executor.layers.activationr!   r"   r#   $vllm.model_executor.layers.attentionr$   r%   $vllm.model_executor.layers.layernormr&   !vllm.model_executor.layers.linearr'   r(   r)   r*   +vllm.model_executor.layers.logits_processorr+   'vllm.model_executor.layers.quantizationr,   +vllm.model_executor.layers.rotary_embeddingr-   3vllm.model_executor.layers.vocab_parallel_embeddingr.   r/   -vllm.model_executor.model_loader.weight_utilsr0   )vllm.model_executor.models.module_mappingr1   vllm.multimodalr2   vllm.multimodal.inputsr3   r4   r5   r6   vllm.multimodal.parser7   r8   r9   r:   vllm.multimodal.processingr;   r<   r=   r>   r?   'vllm.multimodal.processing.dummy_inputsr@   vllm.sequencerA   vllm.utils.math_utilsrC   vllm.utils.tensor_schemarD   rE   
interfacesrG   rH   rI   rJ   rK   utilsrL   rM   rN   rO   rP   rQ   rR   rS   rd   r  rB  rD  _MAX_VIDEO_FPSrV   ro   rt   r   r   Moduler   r   r   r   r   r  r/  r4  r  r  r  r  r  r   r   r  r   r  r  r  r  rt  r  r  r  r   rq  r   r|  r  r  r  r  register_processorr)  ri   rk  rl   rl   rl   rm   <module>   sX  (U$J(Ly) 1 ':m





@
(  9
0
' Qk  
  Q