o
    
۾i                     @   s  d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZ d dlm  mZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZD d dlEmFZF d dlGmHZH d dlImJZJ d dlKmLZLmMZMmNZN d dlOmPZPmQZQmRZR d dlSmTZTmUZUmVZVmWZWmXZXmYZYmZZZ d dl[m\Z\ d dl]m^Z^m_Z_ dd l`maZambZbmcZcmdZdmeZe dd!lfmgZgmhZhmiZimjZjmkZkmlZl d"d#gZmdZnd$Zod%Zpd&Zqd'Zrd(Zsd)ZtG d*d+ d+e^ZueG d,d- d-ZvG d.d/ d/ejwZxG d0d1 d1ejwZyG d2d3 d3ejwZzG d4d5 d5ejwZ{d6ej|d7e}d8ej|fd9d:Z~G d;d< d<ejwZG d=d> d>ejwZG d?d@ d@ejwZG dAdB dBejwZG dCdD dDejwZG dEdF dFeZG dGdH dHejweeZe"G dIdJ dJejweeZdKe}dLe}d8e}fdMdNZdOe}dPe}dQe}dRe}dSe}d8e}fdTdUZdVe}dWe}dPe}dQe}dRe}dSe}d8ee}e}f fdXdYZdZe}d8eee}e}f  fd[d\Zd]e}d^e}d_e}d`e}fdadbZG dcdd ddZG dedf dfeVZG dgdh dheTe ZG didj djeUe ZeJjeeedkG dldm dmejwecedebeeZdneeeej|f  d8eeeej|f  fdodpZdS )q    N)IterableMappingSequence)	dataclass)cached_propertypartial)islice)	Annotated)	rearrange)BatchFeaturePretrainedConfigProcessorMixin
TensorType)
ImageInput)	TextInput)support_torch_compile)CacheConfig
VllmConfig)BaseDummyOptions)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather)
MulAndSilu	QuickGELU
SiluAndMul)	AttentionMMEncoderAttention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptInsertionPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuant)AutoWeightsLoaderWeightsMapperis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixi   z
<im_patch>z<im_col>z
<im_start>z<im_end>   c                   @   sx   e Zd ZU dZeejedddf ed< eejdB eddf ed< eejeddf ed	< 	 eejed
f ed< dS )MolmoImageInputsz
    Dimensions:
        - bn: Batch size * number of images
        - bnc: Batch size * number of images * number of crops (dynamic)
        - np: Number of patches
        - tp: Token sequence positions
        - pd: Patch dimension
    bncnppdimagesNimage_maskstpimage_input_idxbn	num_crops)	__name__
__module____qualname____doc__r	   torchTensorr;   __annotations__ r\   r\   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/molmo.pyrK   ^   s   
 	rK   c                   @   s   e Zd ZU dZeeef ed< dZeed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dd Zedd ZdS )VisionBackboneConfig)P  r_   image_default_input_size   image_patch_sizeimage_pos_patch_sizei   image_emb_dim   image_num_headsimage_num_key_value_heads   image_num_layersi   image_mlp_dim
quick_geluimage_mlp_activationsiA  image_num_posgh㈵>image_norm_epsc                 C   s   t | j| _d S N)tupler`   selfr\   r\   r]   __post_init__   s   z"VisionBackboneConfig.__post_init__c                 C   s   | j \}}|| j || j fS ro   )r`   rb   )rr   hwr\   r\   r]   image_num_patch   s   
z$VisionBackboneConfig.image_num_patchN)rU   rV   rW   r`   rp   intr[   rb   rc   rd   rf   rg   ri   rj   rl   strrm   rn   floatrs   propertyrv   r\   r\   r\   r]   r^   r   s   
 r^   c                       N   e Zd ZdZ		ddededB def fddZd	ej	d
ej	fddZ
  ZS )ViTMLPzMLP used in Vision Transformer.N configquant_configprefixc                    s`   t    t|j|jd|| dd| _|jdksJ t | _t	|j|jd|| dd| _
d S )NTz.w1biasr   r   rk   z.w2)super__init__r    rd   rj   w1rl   r   actr#   w2rr   r~   r   r   	__class__r\   r]   r      s"   
zViTMLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S ro   )r   r   r   )rr   r   _r\   r\   r]   forward   s   
zViTMLP.forwardNr}   rU   rV   rW   rX   r^   r%   rx   r   rY   rZ   r   __classcell__r\   r\   r   r]   r|      s    r|   c                       sh   e Zd ZdZ				ddededed	edB d
ef
 fddZ		dde
jde
jdB de
jfddZ  ZS )MultiHeadDotProductAttentionz0Multi-head attention used in Vision Transformer.Tr<   Nr}   r~   use_biasnlayersr   r   c                    sx  t    |j| _|j| _t }| j| j dksJ | j| dks#J | j| | _| j| j | _|j	| _
| j
|krC| j
| dksBJ n	|| j
 dksLJ td| j
| | _t|| j | j| j ||| dd| _t|| j | j
| j ||| dd| _t|| j | j
| j ||| dd| _t| j| j | j||| dd| _| jd | _t| j| j| j| j| d	d
| _d S )Nr   r<   z.wqr   z.wkz.wvz.wo      .attn)num_kv_headsr   )r   r   rd   hidden_sizerf   total_num_headsr   	num_headshead_dimrg   total_num_kv_headsmaxr   r    wqwkwvr#   woscaler   attn)rr   r~   r   r   r   r   tp_sizer   r\   r]   r      sb   





z%MultiHeadDotProductAttention.__init__inputs_q	inputs_kvr   c           
      C   sd   |d ur	|}|}n|}|}|  |\}}| |\}}| |\}}| |||}	| |	\}	}|	S ro   )r   r   r   r   r   )
rr   r   r   inputs_kinputs_vxqr   xkxvoutputr\   r\   r]   r      s   z$MultiHeadDotProductAttention.forward)Tr<   Nr}   ro   )rU   rV   rW   rX   r^   boolrw   r%   rx   r   rY   rZ   r   r   r\   r\   r   r]   r      s2    Cr   c                       r{   )ResidualAttentionBlockz4Residual attention block used in Vision Transformer.Nr}   r~   r   r   c                    sb   t    t||| dd| _t||| dd| _tj|j|j	d| _
tj|j|j	d| _d S )Nz
.attentionr   r   z.feed_forwardr   eps)r   r   r   	attentionr|   feed_forwardnn	LayerNormrd   rn   attention_normffn_normr   r   r\   r]   r     s   
zResidualAttentionBlock.__init__r   r   c                 C   s,   ||  | | }|| | | }|S ro   )r   r   r   r   )rr   r   r\   r\   r]   r     s   zResidualAttentionBlock.forwardr   r   r\   r\   r   r]   r     s    r   c                       sR   e Zd ZdZ		ddededB def fddZd	ej	d
e
ej	 fddZ  ZS )BlockCollectionzCCollection of residual attention blocks used in Vision Transformer.Nr}   r~   r   r   c                    s2   t    t fddt jD | _d S )Nc                    s$   g | ]}t   d | dqS )z.resblocks.r   )r   ).0ir~   r   r   r\   r]   
<listcomp>/  s    z,BlockCollection.__init__.<locals>.<listcomp>)r   r   r   
ModuleListrangeri   	resblocksr   r   r   r]   r   '  s   

zBlockCollection.__init__r   r   c                 C   s&   g }| j D ]}||}|| q|S ro   )r   append)rr   r   hidden_statesrr\   r\   r]   r   7  s
   
zBlockCollection.forwardr   )rU   rV   rW   rX   r^   r%   rx   r   rY   rZ   listr   r   r\   r\   r   r]   r   $  s    "r   token
batch_sizer   c                 C   s   |  ddd|ddS )Nr<   )viewexpand)r   r   r\   r\   r]   _expand_token?  s   r   c                       sx   e Zd ZdZ		ddededB def fddZd	ej	d
e
dej	fddZ	dd	ej	d
e
dB deej	 fddZ  ZS )VisionTransformerz+Vision Transformer used in Vision Backbone.Nr}   r~   r   r   c                    s   t    |jd }|j| _tt|j| | _	t
| _tt|j|j| | _|j}tj|| d |jdd| _tj|j|jd| _t||| dd| _d S )Nr      F)r   r   z.transformerr   )r   r   rd   rv   	patch_numr   	ParameterrY   randnclass_embeddingNUM_PREFIX_TOKENSnum_prefix_tokensrm   positional_embeddingrb   Linearpatch_embeddingr   rn   pre_lnr   transformer)rr   r~   r   r   r   rb   r   r\   r]   r   F  s$   


zVisionTransformer.__init__r   r   r   c                 C   s  | j dd }| j dd  }|tt|jd tt|jd |jd f}|\}}|jd |ks;|jd |kr]|ddddd}tj	|||fdddd}|dddd
d}|d	|jd	 }|tj|d d d d d f |d d d d d f gdd
|j }|S )Nr   r<   r   rJ   bicubicFT)sizemodealign_corners	antialiasr   dim)r   reshaperw   mathsqrtshape	unsqueezepermuteFinterpolatesqueezerY   cattodtype)rr   r   r   cls_embpos_embpatch_num_0patch_num_1r\   r\   r]   add_pos_emb_  s,   BzVisionTransformer.add_pos_embc                 C   sp   |du r| j }|j\}}}| |}tjt| j|jd |j|gdd}| 	||}| 
|}| |}|S )z>
        : param x: (batch_size, num_patch, n_pixels)
        Nr   r<   r   )r   r   r   rY   r   r   r   r   r   r   r   r   )rr   r   r   BNDr   r\   r\   r]   r   }  s   


zVisionTransformer.forwardr   ro   )rU   rV   rW   rX   r^   r%   rx   r   rY   rZ   rw   r   r   r   r   r\   r\   r   r]   r   C  s(    r   c                       s   e Zd ZdZ			ddededB dedB deddf
 fd	d
Zde	j
de	j
dee	j
e	j
f fddZde	j
de	j
de	j
fddZ  ZS )MolmoAttentionzMolmo's LLM attention.Nr}   r~   cache_configr   r   r   c              	      s  t    |j| _t | _|j| _| j| j dksJ | j| j dks%J | j| j | _|jp1| j| _	| j	| jkrD| j	| j dksCJ n
| j| j	 dksNJ t
d| j	| j | _| j| j | _| j| j | _| j| j | _|j| _t| j| j| j| j	|j|| dd| _d | _d | _d | _|jrt | _t| j	| j |jd| _t|j|jd| _t| j| j|jd| _| jd | _t| j| j| j| j||| dd	| _t | j| j | jd
|| dd| _!d S )Nr   r<   z	.qkv_projr   r   )max_positionrope_parametersr   r   )r   r   r   r   Fz.o_proj)"r   r   r   r   r   num_attention_headsr   r   num_key_value_headsr   r   r   r   q_sizekv_sizemax_position_embeddingsr"   qkv_biasqkv_projtp_rankk_normq_normattention_layer_normr   r   layer_norm_epsr&   r   
rotary_embscalingr   r   r#   o_projrr   r~   r   r   r   r   r\   r]   r     sp   


zMolmoAttention.__init__qkc                 C   sr   | j dkrt| }t| }| |}| |}| j dkr5tt| j d}||| j }||| j }||fS )Nr<   )num_partitions)r   r   
contiguousr   r   r   r   r   )rr   r  r  splitterr\   r\   r]   _apply_qk_norm  s   



zMolmoAttention._apply_qk_norm	positionsr   c           
      C   s   |  |\}}|j| j| j| jgdd\}}}| jd ur)| jd ur)| ||\}}| |||\}}| |||}| 	|\}	}|	S )Nr   r   )
r   splitr   r   r   r   r  r  r   r  )
rr   r  r   qkvr   r  r  vattn_outputr   r\   r\   r]   r     s    zMolmoAttention.forwardNNr}   )rU   rV   rW   rX   r   r   r%   rx   r   rY   rZ   rp   r  r   r   r\   r\   r   r]   r     s<    K
r   c                       \   e Zd ZdZ			ddededB dedB deddf
 fd	d
Zde	j
de	j
fddZ  ZS )LanguageModelMLPzMolmo's LLM mlp.Nr}   r~   	input_dimr   r   r   c                    p   t    |j| _|jd | _t|p| j| jgd d|| dd| _t | _t| j| jd|| dd| _	d S )NrJ   Fz.gate_up_projr   
.down_proj)
r   r   r   intermediate_sizer!   gate_up_projr   act_fnr#   	down_projrr   r~   r  r   r   r   r\   r]   r     s$   

zLanguageModelMLP.__init__r   c                 C   *   |  |\}}| |}| |\}}|S ro   )r  r  r  rr   r   gate_upr   r\   r\   r]   r         
zLanguageModelMLP.forwardr  rU   rV   rW   rX   r   rw   r%   rx   r   rY   rZ   r   r   r\   r\   r   r]   r     s*    r  c                       r  )ImageProjectorMLPzMolmo's image_projector mlp.Nr}   r~   r  r   r   r   c                    r  )NrJ   Fz.merged_linearr   r  )
r   r   r   r  r!   merged_linearr   r  r#   r  r  r   r\   r]   r   -  s$   

zImageProjectorMLP.__init__r   c                 C   r  ro   )r#  r  r  r  r\   r\   r]   r   K  r   zImageProjectorMLP.forwardr  r!  r\   r\   r   r]   r"  *  s*    r"  c                       s   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	dB de
ej	e
ej	ej	f dB f fddZ  ZS )MolmoDecoderLayerNr}   r~   r   r   r   r   c                    sn   t    t|||| dd| _t||| dd| _|jdks#J t|j|j	d| _
t|j|j	d| _d S )Nz
.self_attnr   z.mlpr   rmsr   )r   r   r   	self_attnr  mlplayer_norm_typer   r   r  input_layernormpost_attention_layernormr  r   r\   r]   r   V  s   
zMolmoDecoderLayer.__init__r  r   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS N)r  r   )r)  r&  r*  r'  rr   r  r   r+  r\   r\   r]   r   o  s   
zMolmoDecoderLayer.forwardr  )rU   rV   rW   r   r   r%   rx   r   rY   rZ   rp   r   r   r\   r\   r   r]   r$  U  s0    r$  c                   @   sJ   e Zd ZdejdejdejdB deejeejejf dB f fddZdS )MolmoDecoderNormAfterLayerr  r   r+  Nr   c                 C   sP   |}| j ||d}| |}|| }|}| |}| |}|| }d }||fS r,  )r&  r)  r'  r*  r-  r\   r\   r]   r     s   


z"MolmoDecoderNormAfterLayer.forward)rU   rV   rW   rY   rZ   rp   r   r\   r\   r\   r]   r.    s    r.  c                       s   e Zd ZdddgiZ		ddedededB d	ed
df
 fddZe	d
e
jfddZe	d
e
jfddZde
jd
e
jfddZde
jde
jd
e
jfddZdeeee
jf  d
ee fddZ  ZS )MolmoVisionBackboner#  	gate_projup_projNr}   r~   vision_configr   r   r   c                    s   t    t| _|j| _| jd d t | jd d t f| _t||| dd| _| jj	| _	| j	dv s8J dt
|t| j|| dd| _t||j|| d	d
| _|jt| j }ttd|f| _d S )Nr   r<   z
.image_vitr   >   r   r<   z'Only 0 or 1 prefix tokens are supportedz.image_pooling_2d)r   r   r   z.image_projector)r  r   r   rJ   )r   r   
VIT_LAYERS
vit_layersrv   POOLING_SIZEllm_patches_per_cropr   	image_vitr   r   lenimage_pooling_2dr"  rd   image_projectorr   r   rY   zeros	pad_embed)rr   r~   r2  r   r   	image_dimr   r\   r]   r     s6   

zMolmoVisionBackbone.__init__c                 C      | j jjjS ro   )r7  r   weightr   rq   r\   r\   r]   r        zMolmoVisionBackbone.dtypec                 C   r>  ro   )r7  r   r?  devicerq   r\   r\   r]   rA    r@  zMolmoVisionBackbone.devicerO   c           
      C   s   |j \}}}}tj||| ||dkddd }||| ||}| |}| jdurCg }| jD ]	}	|||	  q1tj|dd}n|d }| jdkrV|ddddf }|| }||||d}|S )	zN
        : param images: (batch_size, num_crops, num_patch, n_pixels)
        r   )r<   rJ   T)r   keepdimNr   r   r<   )	r   rY   allr   r7  r4  r   r   r   )
rr   rO   r   Tr   r   maskimage_featuresfeatureslayerr\   r\   r]   encode_image  s   $



z MolmoVisionBackbone.encode_imagerP   c                 C   sl  |j d d \}}|j| j| jd}| |}|j}|d us J | jd d d d d d d f }|dk}t|dk t|jtj	d}	|jtj	d}||d t
|d  }||d t
|	d  }||}|||f| j d }| jd t  }
rt|ddd|
d|
ddddf
}t|dttd	}|jd
dd}| ||}| j\}}||||| d}| |}|S )NrJ   )rA  r   r   r<   r   r   )r   z*b n (h dh) (w dw) c -> (b n h w) (dh dw) c)dhdwrH   T)rB  )r   r   rA  r   rI  r<  rY   logical_andlogical_notfloat32r   r   rv   r5  r   padr
   meanr9  r6  r   r:  )rr   rO   rP   r   	num_imagerF  og_dtyper<  all_padpartial_pad	missing_wqueryrt   ru   r\   r\   r]   r     sH   



zMolmoVisionBackbone.forwardweightsc                 C   s   ddg}t |  }t }|D ]Y\}}|D ].\}}}	||vrq|||}|dr/||vr/qt|| r5q|| }
|
j}||
||	  n|drN||vrNqt|| rTq|| }
t|
dt}||
| |	| q|S )N)r#  r0  r   )r#  r1  r<   .biasweight_loader)
dictnamed_parameterssetreplaceendswithrD   rZ  getattrr)   add)rr   rX  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrZ  r\   r\   r]   load_weights   s6   


z MolmoVisionBackbone.load_weightsr   )rU   rV   rW   packed_modules_mappingr   r^   r%   rx   r   rz   rY   r   rA  rZ   rI  r   r   rp   r]  rk  r   r\   r\   r   r]   r/    s8    %
,7r/  c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
deeeejf  dee fddZ  ZS )
MolmoModelr}   r   vllm_configr   c                   s   t    |jj|j |j| _jpj| _|  jt	7  _t
| jjd| _jr/tnttj fdd| dd\| _| _| _jdksPJ tjj| _tddgj| _d S )	N)r   c                    s    | dS )Nr   r\   r   r   r~   decoder_layerr   r\   r]   <lambda>[  s    z%MolmoModel.__init__.<locals>.<lambda>z.layersr   r%  r   r+  )r   r   model_config	hf_configr   r   r~   embedding_size
vocab_sizeADDITIONAL_VOCAB_SIZEr(   r   embed_tokens
norm_afterr.  r$  rF   num_hidden_layersstart_layer	end_layerlayersr(  r   r  normrE   make_empty_intermediate_tensors)rr   rn  r   r   ro  r]   r   E  s0   


zMolmoModel.__init__	input_idsr   c                 C   s
   |  |S ro   )rw  )rr   r  r\   r\   r]   embed_input_idsh  s   
zMolmoModel.embed_input_idsNr  intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS |d urN| 	||\}}|S | 	|}|S )Nr   r+  )r   r+  )
r   is_first_rankrw  r   r|  rz  r{  is_last_rankr9   r}  )	rr   r  r  r  r  r   r+  rH  r   r\   r\   r]   r   k  s.   


zMolmoModel.forwardrX  c                 C   sl   t |  }t }|D ](\}}|dr||vrqt|| rq|| }t|dt}||| || q|S )NrY  rZ  )r[  r\  r]  r_  rD   r`  r)   ra  )rr   rX  rc  rd  re  rf  rj  rZ  r\   r\   r]   rk    s   

zMolmoModel.load_weightsNN)rU   rV   rW   r   rx   r   rY   rZ   r  r9   r   r   rp   r]  rk  r   r\   r\   r   r]   rm  C  s"    #
,#rm  r   r  c                 C   s   | | | S ro   r\   )r   r  r\   r\   r]   _lowest_multiple  s   r  	num_tilescrop_patchesleft_marginright_marginpooling_sizec          	      C   sx   | dkrt || d |S |||  }t || | d |}t || d |}t || | d |}|| d |  | S )Nr<   rJ   )r  )	r  r  r  r  r  crop_window_patchesleft_num
middle_num	right_numr\   r\   r]   get_num_patches  s    
r  tiling_htiling_wc                 C   s,   t | ||||d}t |||||d}||fS )N)r  r  r  r  )r  )r  r  r  r  r  r  nrowsncolsr\   r\   r]   get_patches_grid_size  s   	r  max_numc                    s,    fddt d d D }t|dd dS )Nc                    s4   g | ]}t d  d  D ]}||  kr||fqqS )r<   )r   )r   r   jr  r\   r]   r     s    z)get_candidate_tilings.<locals>.<listcomp>r<   c                 S   s   | d | d  S )Nr   r<   r\   )r   r\   r\   r]   rq    s    z'get_candidate_tilings.<locals>.<lambda>)key)r   sorted)r  tilingsr\   r  r]   get_candidate_tilings  s   
r  heightwidth
patch_sizemax_num_patchesc                 C   s   t |}tj|tjd}|| }tj| |gtjd}|tj| }|jddd}	|	dk  r7|	 }
||
 S t	|	dk d|	
 }
||
 S )NrJ  r   T)axiskeepdimsr<   g      ?g    _B)r  rM   arrayint32rO  astypeminrC  argmaxwhereargmin)r  r  r  r  r  candidate_tilingscandidate_resolutionsoriginal_sizerequired_scale_drequired_scaleixr\   r\   r]   select_tiling  s   r  c                	       s  e Zd ZdZdef fddZedeee	f fddZ
ede	fdd	Zedee	e	f fd
dZede	fddZedee	e	f fddZede	fddZede	fddZededB fddZedefddZede	fddZede	fddZede	fddZede	fdd Zede	fd!d"Zd#e	d$e	dee	e	f fd%d&Zd#e	d$e	dee	e	f fd'd(Z			d.d)eee B dB d*eee B dB d+ee B dB de!fd,d-Z"  Z#S )/MolmoProcessorWrapperz
    Wraps `MolmoProcessor` so that it can be called directly.

    The original definition can be found here:
    https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
    	processorc                    s   t    || _d S ro   )r   r   r  )rr   r  r   r\   r]   r     s   

zMolmoProcessorWrapper.__init__r   c                 C   s
   | j jjS ro   )r  	tokenizervocabrq   r\   r\   r]   r  	     
zMolmoProcessorWrapper.vocabc                 C       | j j}|j}t|tsJ |S ro   )r  image_processor	max_crops
isinstancerw   )rr   r  r  r\   r\   r]   r       zMolmoProcessorWrapper.max_cropsc                 C   s(   | j j}|j}t|tr||fS t|S ro   )r  r  base_image_input_sizer  rw   rp   )rr   r  r  r\   r\   r]   r    s
   
z+MolmoProcessorWrapper.base_image_input_sizec                 C   r  ro   )r  r  rb   r  rw   )rr   r  rb   r\   r\   r]   rb      r  z&MolmoProcessorWrapper.image_patch_sizec                 C   s6   | j j}|j\}}t|tsJ t|tsJ ||fS ro   )r  r  overlap_marginsr  rw   )rr   r  r  r  r\   r\   r]   r  )  s
   
z%MolmoProcessorWrapper.overlap_marginsc                 C   r  ro   )r  r  image_token_length_wr  rw   )rr   r  r  r\   r\   r]   r  3  r  z*MolmoProcessorWrapper.image_token_length_wc                 C   r  ro   )r  r  image_token_length_hr  rw   )rr   r  r  r\   r\   r]   r  <  r  z*MolmoProcessorWrapper.image_token_length_hNc                 C      dS )Nroler\   rq   r\   r\   r]   message_formatE     z$MolmoProcessorWrapper.message_formatc                 C   r  )NTr\   rq   r\   r\   r]   always_start_with_spaceI  r  z-MolmoProcessorWrapper.always_start_with_spacec                 C   
   | j t S ro   )r  IMAGE_PATCH_TOKENrq   r\   r\   r]   image_patch_idM  r  z$MolmoProcessorWrapper.image_patch_idc                 C   r  ro   )r  IM_COL_TOKENrq   r\   r\   r]   	im_col_idQ  r  zMolmoProcessorWrapper.im_col_idc                 C   r  ro   )r  IM_START_TOKENrq   r\   r\   r]   im_start_idU  r  z!MolmoProcessorWrapper.im_start_idc                 C   r  ro   )r  IM_END_TOKENrq   r\   r\   r]   	im_end_idY  r  zMolmoProcessorWrapper.im_end_idc                 C   s   t S ro   )r5  rq   r\   r\   r]   r  ]  r  z"MolmoProcessorWrapper.pooling_sizeimage_widthimage_heightc                C   sl   | j }| j\}}| j}| j}|||  }|d | }	|	||  }
|
| }t|| || ||d\}}||fS )Nr   )r  r  r  r  )r  r  r  rb   r  )rr   r  r  r  r  r  r  base_image_input_dtotal_margin_pixelsr  r  crop_window_sizer  r  r\   r\   r]   r  a  s   

z#MolmoProcessorWrapper.select_tilingc                C   sZ   | j \}}| j}| j}| j}|d | }| j||d\}	}
t|
|	||||d\}}||fS )Nr   )r  r  )r  r  r  r  r  r  )r  r  rb   r  r  r  )rr   r  r  r  r  r  r  r  r  r  r  r  r  r\   r\   r]   r  y  s$   


	z+MolmoProcessorWrapper.get_patches_grid_sizetextrO   return_tensorsc                    s    j j||fi |}|d u rg }t|ts|g}|d}|d|d< |dd }|d ur]|dk} fdd|D }	t|	dd }
|
	 t
|ksPJ ||d< |
|d<  j|d	< t|S )
Nr  r   rR   c                    s&   g | ]} j |jd  |jd dqS )r   r<   r  r  )r  r   )r   imagerq   r\   r]   r     s    z2MolmoProcessorWrapper.__call__.<locals>.<listcomp>r   r<   rT   img_patch_id)r  processr  r   popr   rY   tensorprodsumr8  r  r   )rr   r  rO   r  kwargsoutputsr  rR   feat_is_patchr  rT   r\   rq   r]   __call__  s.   



zMolmoProcessorWrapper.__call__)NNN)$rU   rV   rW   rX   r   r   r   r[  rx   rw   r  r  rp   r  rb   r  r  r  rz   r  r   r  r  r  r  r  r  r  r  r   r   r   r   r   r  r   r\   r\   r   r]   r    sr    		




r  c                   @   sd   e Zd ZdedefddZdeeedB f fddZ	ded	ed
edB defddZ
defddZdS )MolmoProcessingInfor  r   c                 K   s   | j jdi |}t|S Nr\   )ctxget_hf_processorr  )rr   r  r  r\   r\   r]   r    s   z$MolmoProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nr  r\   rq   r\   r\   r]   get_supported_mm_limits  s   z+MolmoProcessingInfo.get_supported_mm_limitsr  r  r  c                C   sl   |d u r|   }|j||d\}}|j}|j}|j}d|d |  }	d|d | d |d |   }
|	|
 S )Nr  rJ   r<   )r  r  r  r  r  )rr   r  r  r  r  r  r  r  r  extrajointr\   r\   r]   get_num_image_tokens  s   
 z(MolmoProcessingInfo.get_num_image_tokensc                 C   s   |   }t|j}|j\}}d\}}|D ]!\}}|| || }	}
| j|	|
|d}||kr5|}t|	|
d}q|dks>|d u rBtd|S )N)r   N)r  r  r  )r  r  r   z(Cannot have a largest feature size of 0!)r  r  r  r  r  r0   
ValueError)rr   r  r  base_hbase_wlargest_feature_sizelargest_feature_pinpointwrhrr  r  	feat_sizer\   r\   r]   !get_image_size_with_most_features  s$   

z5MolmoProcessingInfo.get_image_size_with_most_features)rU   rV   rW   objectr  r  r   rx   rw   r  r  r0   r  r\   r\   r\   r]   r    s    
r  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )MolmoDummyInputsBuilder	mm_countsr   c                 C   r  r   r\   )rr   r  r\   r\   r]   get_dummy_text  s   z&MolmoDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | j  \}}|dd}|r|dnd }d| j||||diS )Nr  r   )r  r  
num_images	overrides)infor  get_get_dummy_images)rr   r  r  r  target_widthtarget_heightr  image_overridesr\   r\   r]   get_dummy_mm_data  s   z)MolmoDummyInputsBuilder.get_dummy_mm_dataro   )
rU   rV   rW   r   rx   rw   r  r   r,   r  r\   r\   r\   r]   r    s    
r  c                	   @   sr   e Zd Zdee dee fddZdedeee	f deee
f fddZd	edeee	f d
edee fddZdS )MolmoMultiModalProcessorprompt_tokensr   c                 C   sT   | j  }|jj| j  |d|jd}| j j|t	|d}|
d \}|S )Nnone)r  r  )tokensr  )r  r  r  get_tokens_inputget_tokenizerdecoder  r  call_hf_processorr[  r  tolist)rr   r  r  r  processed_data
prompt_idsr\   r\   r]   _apply_hf_processor_tokens_only  s   
z8MolmoMultiModalProcessor._apply_hf_processor_tokens_only	hf_inputshf_processor_mm_kwargsc              	   C   sR   | dtd}t|}ttd|td|td|tdtd|dS )NrT   r   r  )rO   rP   rR   rT   r  )	r  rY   emptyr8  r[  r-   flat_from_sizesbatchedshared)rr   r  r  rT   r  r\   r\   r]   _get_mm_fields_config*  s   



z.MolmoMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc              	      s   | j jdi |j}j}jjjjjg| g }g||  g  dt	f fdd}t
dtd|dgS )Nitem_idxc                    sr    dt}|| }j|j|jd\}}g|d   g }g||d    g }tj | dS )Nr  r  r<   )embed_token_id)	get_itemsr/   get_image_sizer  r  r  r8   select_token_id)r  rO   
image_sizer  r  	joint_rowr  extra_joint
img_col_id
img_end_idr  img_start_idr  r  r  r\   r]   get_insertion_molmoN  s"   

zIMolmoMultiModalProcessor._get_prompt_updates.<locals>.get_insertion_molmor  z<|endoftext|>)modalitytarget	insertionr\   )r  r  r  r  r  r  r  r  r  rw   r6   r5   r   )rr   r  r  r  r  r  	extra_rowr!  r\   r  r]   _get_prompt_updates:  s"    z,MolmoMultiModalProcessor._get_prompt_updatesN)rU   rV   rW   r   rw   r  r   r   rx   r  r-   r  r1   r.   r   r7   r&  r\   r\   r\   r]   r     s,    




r   )r  dummy_inputsc                       sD  e Zd Zeddddddddd	d
ddddddddZdgdgddgdZededededB fddZ	ddd e
d!ef fd"d#Zd$ededB fd%d&Zd'edeej fd(d)Zd$edefd*d+Z		d:d,ejd-ejd.edB d/ejdB d$edejfd0d1Zd2ejdejfd3d4Zd5eeeejf  fd6d7Zdefd8d9Z  ZS );MolmoForCausalLMzimage_projector.gate_proj.zimage_projector.up_proj.zimage_projector.down_proj.zself_attn.qkv_projzself_attn.o_projzself_attn.q_normzself_attn.k_normzmlp.gate_up_projzmlp.down_projr)  r*  )zimage_projector.w1.zimage_projector.w3.zimage_projector.w2.att_projattn_outr   r   ff_projff_out	attn_normff_normzvision_backbone.zmodel.layers.zmodel.norm.zlm_head.)zmodel.vision_backbone.zmodel.transformer.blocks.zmodel.transformer.ln_f.z model.transformer.mlp.down_proj.)orig_to_new_substrorig_to_new_prefixr   r  r0  r1  )r   r  r#  r"  r   r   Nc                 C   s   | drd S td)Nr  z Only image modality is supported)
startswithr  )clsr"  r   r\   r\   r]   get_placeholder_str  s   
z$MolmoForCausalLM.get_placeholder_strr}   r   rn  r   c                   s  t    |jj}|j}|jj}|| _|| _t }| |d t	|||t
|dd| _W d    n1 s6w   Y  | | t|t
|dd| _W d    n1 sUw   Y  d | _| jjrh| jjj| _nt|jpn|j|j|t
|dd| _t|jp|j| _| jj| _d S )Nr  vision_backboner   model)rn  r   lm_headr   )r   r   rr  rs  r   multimodal_configr~   r^   _mark_tower_modelr/  rG   r4  _mark_language_modelrm  r5  r  weight_tyingr   wter6  r'   rt  ru  r   r$   logits_processorr~  )rr   rn  r   r~   r   r7  r2  r   r\   r]   r     sD   





zMolmoForCausalLM.__init__r  c                 K   s   | dd }| dd }| dd }| dd }|d u rd S | dd }t|tjr.| }t|ts5J || _t||||dS )NrO   rP   rR   rT   r  )rO   rP   rR   rT   )r  r  rY   rZ   itemrw   r  rK   )rr   r  rO   rP   rR   rT   r  r\   r\   r]   _parse_and_validate_image_input  s"   z0MolmoForCausalLM._parse_and_validate_image_inputimage_inputc                 C   s   |d }|d }|d }|d }| j |d|d u rd n|ddd}g }| }t||||D ]\}	}
|
dk}|
| }t|}||	| |  q7|S )NrO   rP   rR   rT   r   )rO   rP   )	r4  r   r   r  zipr  rY   argsortr   )rr   r?  rO   rP   rR   rT   rF  resultsnum_crops_listfeatsimg_idxis_validvalid_img_idxorderr\   r\   r]   _process_image_input  s*   
z%MolmoForCausalLM._process_image_inputc                 K   s&   | j di |}|d u rg S | |S r  )r>  rI  )rr   r  r?  r\   r\   r]   embed_multimodal   s   
z!MolmoForCausalLM.embed_multimodalr  r  r  r  c                 K   s"   |d urd }| j ||||d}|S )N)r  )r5  )rr   r  r  r  r  r  r   r\   r\   r]   r     s   zMolmoForCausalLM.forwardr   c                 C   s   |  | j|}|S ro   )r<  r6  )rr   r   logitsr\   r\   r]   compute_logits  s   zMolmoForCausalLM.compute_logitsrX  c                 C   s    t | }t|}|j|| jdS )N)mapper)rB   "_get_weights_with_merged_embeddingrk  hf_to_vllm_mapper)rr   rX  loaderr\   r\   r]   rk    s   zMolmoForCausalLM.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r5  zvision_backbone.image_projectorr4  )language_model	connectortower_model)r*   from_string_fieldrq   r\   r\   r]   get_mm_mapping!  s
   zMolmoForCausalLM.get_mm_mappingr  )rU   rV   rW   rC   rO  rl  classmethodrx   rw   r3  r   r   r  rK   r>  r   rY   rZ   rI  r=   rJ  
LongTensorr9   r   rL  r   rp   rk  r*   rU  r   r\   r\   r   r]   r(  l  sp    ,


r(  rX  c                 c   sj    i }| D ]\}}d|v r||d< qd|v r||d< q||fV  qt j|d |d gdd}d|fV  d S )Nzwte.embedding	embeddingzwte.new_embeddingnew_embeddingr   r   zmodel.embed_tokens.weight)rY   r   )rX  embedding_weightsre  r?  r\   r\   r]   rN  ,  s   

rN  )r   collections.abcr   r   r   dataclassesr   	functoolsr   r   	itertoolsr   typingr	   numpyrM   rY   torch.nnr   torch.nn.functional
functionalr   einopsr
   transformersr   r   r   r   transformers.image_utilsr   $transformers.tokenization_utils_baser   vllm.compilation.decoratorsr   vllm.configr   r   vllm.config.multimodalr   vllm.distributedr   r   r   r   r   %vllm.model_executor.layers.activationr   r   r   $vllm.model_executor.layers.attentionr   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr    r!   r"   r#   +vllm.model_executor.layers.logits_processorr$   'vllm.model_executor.layers.quantizationr%   +vllm.model_executor.layers.rotary_embeddingr&   3vllm.model_executor.layers.vocab_parallel_embeddingr'   r(   -vllm.model_executor.model_loader.weight_utilsr)   )vllm.model_executor.models.module_mappingr*   vllm.multimodalr+   vllm.multimodal.inputsr,   r-   r.   vllm.multimodal.parser/   r0   r1   vllm.multimodal.processingr2   r3   r4   r5   r6   r7   r8   vllm.sequencer9   vllm.utils.tensor_schemar:   r;   
interfacesr=   r>   r?   r@   rA   utilsrB   rC   rD   rE   rF   rG   r3  r   rv  r  r  r  r  r5  rK   r^   Moduler|   r   r   r   rZ   rw   r   r   r   r  r"  r$  r.  r/  rm  r  r  rp   r  r   r  r  r  r  r  r   register_processorr(  rx   rN  r\   r\   r\   r]   <module>   s   $	 
#YRk*+0 &[




 C:[
 <