o
    
۾ir                  	   @   s  d dl Z d dlZd dlmZmZmZ d dlmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZEmFZF ddlGmHZH G dd de8ZIeIZJG dd de2ZKG dd de0eK ZLG dd  d e1eK ZMG d!d" d"ejNZOe&jPeMeKeLd#G d$d% d%ejNe>e=e?e;ZQdS )&    N)IterableMappingSequence)	AnnotatedLiteral)BatchFeature)ACT2FN)Lfm2VlProcessor)Lfm2VlConfig)Lfm2VlImageProcessorFastfind_closest_aspect_ratioround_by_factor)
VllmConfig)BaseDummyOptions)set_forward_context)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )IsHybridMultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)Siglip2Model)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)is_vit_use_data_parallelc                   @   sj   e Zd ZU dZdZed ed< eej	e
dddf ed< eej	e
ddf ed< eej	e
d	f ed
< dS )Lfm2VLImagePixelInputsz
    Dimensions:
        - b: Number of images in the prompt
        - bn: Batch size * number of images
        - d: Number of dimensions
        - fd: Number of features per dimension
    pixel_valuestypebndfd   spatial_shapesbnum_patchesN)__name__
__module____qualname____doc__r3   r   __annotations__r   torchTensorr$    rB   rB   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/lfm2_vl.pyr1   A   s   
 r1   c                   @   s  e Zd Zdd Zdd ZdedefddZdee	e
d	B f fd
dZdefddZde
de
de
de
de
dedefddZde
de
de
de
de
de
dee
e
f fddZde
de
deee
e
f  fddZde
de
de
de
de
dee
e
f fddZd e
d!e
d"ed	B dee
e
f fd#d$Zd e
d!e
d"ed	B de
fd%d&Zd e
d!e
d'ejd"ed	B de	f
d(d)Zd'ejd"ed	B dee
e
f fd*d+Zd	S ),Lfm2VLProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr
   selfrB   rB   rC   rG   T   s   z"Lfm2VLProcessingInfo.get_hf_configc                 K   s   | j jtfi |S rE   )rF   get_hf_processorr	   rI   kwargsrB   rB   rC   rJ   W   s   z%Lfm2VLProcessingInfo.get_hf_processorrL   returnc                 K   s   | j di |jS NrB   )rJ   image_processorrK   rB   rB   rC   get_image_processorZ   s   z(Lfm2VLProcessingInfo.get_image_processorNc                 C   s   dd iS )NimagerB   rH   rB   rB   rC   get_supported_mm_limits]   s   z,Lfm2VLProcessingInfo.get_supported_mm_limitsc                 C   sH   |   }|j}|j}|j}||d  |d  }tt|}t||dS )Nr7   )widthheight)rP   max_image_tokensencoder_patch_sizedownsample_factorintmathsqrtr   )rI   	processorrU   rV   rW   
max_pixelssiderB   rB   rC   !get_image_size_with_most_features`   s   z6Lfm2VLProcessingInfo.get_image_size_with_most_featuresrT   rS   rU   rV   rW   max_pixels_tolerancec           
      C   sH   || }t |t||}t |t||}	||	 ||d  |d  | kS )z<Check if the image is too large to be processed as one tile.r7   )maxr   )
rI   rT   rS   rU   rV   rW   r_   total_factorh_barw_barrB   rB   rC   _is_image_too_largei   s   
z(Lfm2VLProcessingInfo._is_image_too_largemin_image_tokensc                 C   s   || }||d  |d  }||d  |d  }	t |t||}
t |t||}|
| |	krWt|| |	 }t |t|| | | }
t |t|| | | }||
fS |
| |k r|t|||  }t|| | | }
t|| | | }||
fS )Nr7   )r`   r   rY   rZ   floorceil)rI   rT   rS   rW   re   rU   rV   ra   smart_resize_min_pixelssmart_resize_max_pixelsrb   rc   betarB   rB   rC   smart_resize   s*   	z!Lfm2VLProcessingInfo.smart_resize	min_tiles	max_tilesc                    s2    fddt  d D }tt|dd dS )Nc                    sX   g | ](}t d |d  D ]}t d |d  D ]}||   kr" krn n||fqqqS )r%   )range).0nwhrm   rl   rB   rC   
<listcomp>   s    
z7Lfm2VLProcessingInfo._target_ratios.<locals>.<listcomp>r%   c                 S   s   | d | d  S )Nr   r%   rB   )xrB   rB   rC   <lambda>   s    z5Lfm2VLProcessingInfo._target_ratios.<locals>.<lambda>)key)rn   sortedset)rI   rl   rm   ratiosrB   rs   rC   _target_ratios   s   z#Lfm2VLProcessingInfo._target_ratios	tile_sizec                 C   s:   || }|  ||}t|||||\}}	||	 }
||	|
fS rE   )r{   r   )rI   rT   rS   rl   rm   r|   aspect_ratiotarget_ratios
grid_widthgrid_heighttotal_patchesrB   rB   rC   _get_grid_layout   s   

z%Lfm2VLProcessingInfo._get_grid_layoutimage_widthimage_heightr[   c                 C   s   |d u r|   }|jj}|jj}|jj}|jj}|jj}|jj}	|jj}
||  ko-dkn   }| j	|||	|||d}|rN|rN| j
|||||
d\}}}nd } }}|| dkr^|d7 }|||fS )Nr%   )rT   rS   rU   rV   rW   r_   )rl   rm   r|   )rP   rO   rW   rV   r_   rl   rm   rU   r|   rd   r   )rI   r   r   r[   rW   rV   r_   rl   rm   rU   r|   do_image_splittingis_image_larger   r   r   rB   rB   rC   _get_image_feature_grid_size   s<   

z1Lfm2VLProcessingInfo._get_image_feature_grid_sizec                C   s   | j |||d\}}}|S )Nr   r   r[   )r   )rI   r   r   r[   _r   rB   rB   rC   get_num_patches   s   z$Lfm2VLProcessingInfo.get_num_patchesr8   c                    s   |d u r|   }d}|j}|j}|j}|j}	| j||d\}
}|||  | j|||d\ }} dks8|dkrR fddt|D }|
dkrQ||	||
   n||
 g}d	t
|g||g}|S )	Nz<|img_row_{n_h}_col_{n_w}|>)r8   r[   r   r%   c                    s0   g | ]}t  D ]}j|d  |d  dqqS )r%   )n_hn_w)rn   format)ro   ijgrid_wtile_img_placeholderrB   rC   rt     s    z7Lfm2VLProcessingInfo.get_image_repl.<locals>.<listcomp>r    )rJ   image_tokenimage_start_tokenimage_end_tokenimage_thumbnail_tokenget_num_image_tokensr   rn   appendjoin	itertoolschain)rI   r   r   r8   r[   grid_placeholderr   r   r   r   num_thumbnail_tokensnum_tokens_per_tilegrid_hr   tiles_placeholderplaceholderrB   r   rC   get_image_repl   s>   


z#Lfm2VLProcessingInfo.get_image_replc          
      C   sR   |j j}|j j}|j j}|d  |d  }|| }t|| }|| }	||	fS )Nr7   )rO   r|   rW   rV   prodrY   rg   )
rI   r8   r[   r|   rW   rV   r   num_patches_tiledwn_num_patches_tilenum_tiles_tokensrB   rB   rC   r   &  s   z)Lfm2VLProcessingInfo.get_num_image_tokens)r;   r<   r=   rG   rJ   objectr   rP   r   strrX   rR   r   r^   floatboolrd   tuplerk   listr{   r   r	   r   r   r@   rA   r   r   rB   rB   rB   rC   rD   S   s    	


"#




,

/
rD   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Lfm2VLDummyInputsBuilder	mm_countsrM   c                 C   s$   | dd}| j }|j}|| S )NrQ   r   )getinforJ   r   )rI   r   
num_imagesr[   r   rB   rB   rC   get_dummy_text7  s   
z'Lfm2VLDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )NrQ   r   )rS   rT   r   	overrides)r   r   r^   _get_dummy_images)rI   r   r   r   r   target_widthtarget_heightimage_overridesrB   rB   rC   get_dummy_mm_data=  s   z*Lfm2VLDummyInputsBuilder.get_dummy_mm_datarE   )
r;   r<   r=   r   r   rX   r   r   r   r   rB   rB   rB   rC   r   6  s    

r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Lfm2VLMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrM   c                    s   | dg  }sj |}|}tt|gdddS t ||||}jj	d|idd}|
dtfdd	ttD }	jjdi |  fd
d	|	D }
t|
|d< |S )Nimages)	input_idspt)tensor_typerQ   F)validatec                    s   g | ]}  |qS rB   )get_image_size)ro   r   )parsed_imagesrB   rC   rt   j  s    
z@Lfm2VLMultiModalProcessor._call_hf_processor.<locals>.<listcomp>c                    s"   g | ]}j j|j|j d qS )r   )r   r   rS   rT   )ro   size)hf_processorrI   rB   rC   rt   o  s    r:   rB   )r   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictsuper_call_hf_processorparse_mm_data	get_itemsr   rn   lenrJ   r@   tensor)rI   r   r   r   r   r   
prompt_idsprocessed_outputsmm_itemsimage_sizesr:   	__class__)r   r   rI   rC   r   T  s*   


z,Lfm2VLMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   sF   | dtd}tttf td|tjd|ddtjddddS )Nr:   r   rQ   T)keep_on_cpu)r2   r8   r:   )r   r@   emptyr   r   r   flat_from_sizesbatched)rI   r   r   r:   rB   rB   rC   _get_mm_fields_config{  s   

z/Lfm2VLMultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc                    sB   j jdi |  jdtf fdd}td|dgS )Nitem_idxc                    sb    dt}|| }d |  }|d j}t|tjsJ jj|j	|j
| d}tj|dS )NrQ   r8   )r   r   r8   r[   )
embed_text)r   r   r   data
isinstancer@   rA   r   r   rS   rT   r!   select_text)r   r   
image_sizeout_itemr8   
image_replr   r   r   r   rI   rB   rC   get_image_replacement_lfm2vl  s   

zSLfm2VLMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_lfm2vlrQ   )modalitytargetreplacementrB   )r   rJ   r   rX   r    )rI   r   r   r   r   rB   r   rC   _get_prompt_updates  s   z-Lfm2VLMultiModalProcessor._get_prompt_updates)r;   r<   r=   r   r   r   r   r   r   r   r   r   r   r    r   __classcell__rB   rB   r   rC   r   S  s8    


'



r   c                       sF   e Zd Z	ddedef fddZdejdejdejfd	d
Z  Z	S )Lfm2VLMultiModalProjectorr   configprefixc                    s   t    t | _|jj|jd  }|j| _|j| _| jr#t	
|| _t	j||j|jd| _t|j | _t	j|j|jj|jd| _d S )Nr7   )bias)r   __init__r0   use_data_parallelvision_confighidden_sizerW   factorprojector_use_layernormnn	LayerNorm
layer_normLinearprojector_hidden_sizeprojector_biaslinear_1r   projector_hidden_actacttext_configlinear_2)rI   r   r   in_channelsr   rB   rC   r     s$   
z"Lfm2VLMultiModalProjector.__init__vision_features_packedr8   rM   c              	   C   s(  |j jdks
J d| j}|j }|jd }| }dd |D }g }d}	tj|tjd}
tj|tjd}tj|
|dd	\}}|	d}|	d}t
||D ]\\}}}|dkrXqM|| dksd|| dkrrtd
| d| d| d|| }|| }tj|tjd}tj|tjd}tj||dd	\}}|	d}|	d}|dddf | |dddf  | |dddf | |dddf   }||	d|	  |	|7 }	qM|rt|j|d}|d|}|	d|| | }n|d|| | f}| jr| |}| |}| |}| |}|S )aY  Project packed vision features without materializing padded tensors.

        Args:
            vision_features_packed: (total_tokens, hidden_size) packed in tile order.
            spatial_shapes: (num_tiles, 2) on CPU (height, width) per tile.

        Returns:
            projected_packed: (total_projected_tokens, text_hidden_size)
        cpuYExpected `spatial_shapes` on CPU to avoid device-to-host sync in variable-length packing.r   c                 S      g | ]\}}|| qS rB   rB   ro   rr   rq   rB   rB   rC   rt         z5Lfm2VLMultiModalProjector.forward.<locals>.<listcomp>r   dtypeij)indexing<spatial_shapes must be divisible by downsample_factor: got (, ) with factor=.N)device)r  r3   r   shapetolistr@   arangeint64meshgridreshapezip
ValueErrorr   cattoindex_select	new_emptyr   r   r  r  r  )rI   r  r8   r   r  r   spatial_shapes_listlengths_listgather_idx_partsoffsetdhdwdh_griddw_griddh_flatdw_flatrT   rS   length
height_out	width_outrows_outcols_outrrcc	token_idx
gather_idxgathered
unshuffledhidden_statesprojected_packedrB   rB   rC   forward  sj   




&"




z!Lfm2VLMultiModalProjector.forward)r   )
r;   r<   r=   r
   r   r   r@   rA   r:  r   rB   rB   r   rC   r     s    r   )r   dummy_inputsc                       s  e Zd ZdZeddddddZeded	ed
edB fddZ	eddd
e
ejdf fddZeddd
e
e
eef  fddZed
e
e fddZdddedef fddZded
edB fddZdejd ejd
ejfd!d"Zd#ed
ejeej B fd$d%Zded
efd&d'Z		d6d(ejdB d)ejd*edB d+ejdB ded
ejeB fd,d-Zd.ejd
ejdB fd/d0Zd1e e
eejf  d
e!e fd2d3Z"d
e#fd4d5Z$  Z%S )7Lfm2VLForConditionalGenerationTzlanguage_model.lm_head.zlanguage_model.model.zvision_tower.zmulti_modal_projector.)zlm_head.zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.)orig_to_new_prefixr   r   rM   Nc                 C   s   | drdS td)NrQ   z<image>z Only image modality is supported)
startswithr  )clsr   r   rB   rB   rC   get_placeholder_str$  s   
z2Lfm2VLForConditionalGeneration.get_placeholder_strvllm_configr   .c                 C   s   t |jj|jjS rE   )r   short_conv_state_dtypemodel_configr  cache_configmamba_cache_dtype)r?  rA  rB   rB   rC   !get_mamba_state_dtype_from_config+  s   z@Lfm2VLForConditionalGeneration.get_mamba_state_dtype_from_configc                 C   s&   |j }|jjj}tj|j|j|jdS )zCalculate shapes for LFM2's convolutional cache.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
        )tp_world_sizeintermediate_sizeconv_kernel)	parallel_configrC  	hf_configr  r   short_conv_state_shapetensor_parallel_sizer   conv_L_cache)r?  rA  rJ  hf_language_configrB   rB   rC   !get_mamba_state_shape_from_config5  s   
z@Lfm2VLForConditionalGeneration.get_mamba_state_shape_from_configc                 C   s   t  S rE   )r   short_conv_state_copy_func)r?  rB   rB   rC   get_mamba_state_copy_funcL  s   z8Lfm2VLForConditionalGeneration.get_mamba_state_copy_funcmodel)r   r   c                   s  t    |jj}|jj}|j}|j}|| _|| _|| _|j	dk| _
| |d+ |jdkr:t||t|dd| _ntd|j t|t|dd| _W d    n1 sVw   Y  | | t||jt|d	|jjd
| _W d    n1 szw   Y  | jj| _d S )Nr   rQ   siglip2_vision_modelvision_tower)r   quant_configr   z#Unsupported visual tokenizer type: multi_modal_projector)r   r   language)rA  rK  r   architectures)r   r   rC  rK  multimodal_configr   rV  r   rA  mm_encoder_tp_moder   _mark_tower_model
model_typer+   r/   rU  r  r   rW  _mark_language_modelr.   r  rY  language_modelmake_empty_intermediate_tensors)rI   rA  r   r   rZ  r   rV  r   rB   rC   r   P  sD   





	z'Lfm2VLForConditionalGeneration.__init__rL   c                 K   s@   | dd }| dd }| dd }|d u rd S td|||dS )Nr2   r8   r:   )r3   r2   r8   r:   )popLFM2VLImageInputs)rI   rL   r2   r8   r:   rB   rB   rC   _parse_and_validate_image_inputy  s   z>Lfm2VLForConditionalGeneration._parse_and_validate_image_inputr2   r8   c              	   C   sv  |j jdks
J d|j| jjjjjjd}|	 }dd |D }t
t|}|d d df |d d df  jtjd}| rG| dntjdgtjd}|dkrVg S |||jd f}d}	t|D ]\}
}|dkroqf||	|	|  ||
d |f  |	|7 }	qf|d}tj|tj|j d	}tj|jd d tj|j d	}tj|dd
|dd < td | j | j||||d}W d    n1 sw   Y  t|d|}|d }| jj}g }t||D ]6\\}}}|dkr|d q|| dks|| dkrt d| d| d| d||| ||   q| j||d}g }d}	|D ]}|||	|	|   |	|7 }	q&|S )Nr	  r
  r  c                 S   r  rB   rB   r  rB   rB   rC   rt     r  zKLfm2VLForConditionalGeneration.image_pixels_to_features.<locals>.<listcomp>r   r%   r   )r  r  dim)pixel_values_packedr8   
cu_seqlens
max_seqlenlast_hidden_stater  r  r  r  )r  r8   )!r  r3   r   rU  vision_model
embeddingspatch_embeddingweightr  r  rX   sumr@   int32numelr`   r  r   r"  r  	enumeratecopy_	unsqueezezeroscumsumr   rA  getattrrW  r   r  r   r  )rI   r2   r8   r#  r$  total_tokenslengths_cpurh  packed_pixel_valuesr&  r   r-  lengthsrg  vision_outputsimage_outputs_packedr  r   projected_lengths_listrT   rS   r9  image_featuresout_lenrB   rB   rC   image_pixels_to_features  s    



z7Lfm2VLForConditionalGeneration.image_pixels_to_featuresimage_inputc                 C   sp   |d }|d }|d }| j ||d}| }g }d}|D ]}	||||	  }
|tj|
dd ||	7 }q|S )Nr2   r8   r:   )r8   r   rd  )r  r  r   r@   r  )rI   r  r2   r8   r:   r~  num_patches_listbatched_features	patch_idxcountimage_patchesrB   rB   rC   _process_image_input  s   
z3Lfm2VLForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S rN   )rc  r  )rI   rL   r  rB   rB   rC   embed_multimodal   s   
z/Lfm2VLForConditionalGeneration.embed_multimodalr   	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r   r  r  r  )r_  )rI   r   r  r  r  rL   r8  rB   rB   rC   r:    s   z&Lfm2VLForConditionalGeneration.forwardr8  c                 C   s   | j |S rE   )r_  compute_logits)rI   r8  rB   rB   rC   r    s   z-Lfm2VLForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r,   load_weightshf_to_vllm_mapper)rI   r  loaderrB   rB   rC   r     s   z+Lfm2VLForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r_  rW  rU  )r_  	connectortower_model)r   from_string_fieldrH   rB   rB   rC   get_mm_mapping$  s
   z-Lfm2VLForConditionalGeneration.get_mm_mapping)NN)&r;   r<   r=   merge_by_field_configr-   r  classmethodr   rX   r@  r   r@   r  rF  rP  r   rR  r   r   r   rb  rc  FloatTensorrA   r  r   r  r'   r  r"   r:  r  r   ry   r  r   r  r   rB   rB   r   rC   r<    s    		)

\


$r<  )Rr   rY   collections.abcr   r   r   typingr   r   r@   torch.nnr   transformersr   transformers.activationsr   transformers.models.lfm2_vlr	   1transformers.models.lfm2_vl.configuration_lfm2_vlr
   9transformers.models.lfm2_vl.image_processing_lfm2_vl_fastr   r   r   vllm.configr   vllm.config.multimodalr   vllm.forward_contextr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   r)   r*   lfm2_siglip2r+   utilsr,   r-   r.   r/   visionr0   r1   rb  rD   r   r   Moduler   register_processorr<  rB   rB   rB   rC   <module>   sR    dZd

