o
    i                     @   sP  d dl mZmZmZ d dlmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+ ededZ,G dd de(e Z-ddde,de'e, dedB fddZ.ej/e.eedG dd deZ0dS )    )IterableMappingSequence)TypeVarN)BatchFeaturePixtralVisionConfig)
VllmConfig)Mistral3DummyInputsBuilder Mistral3ForConditionalGenerationMistral3MultiModalProjectorMistral3ProcessingInfo_build_mistral3_infoinit_vision_tower_for_llava)PixtralHFEncoderInfo)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItemsMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorPromptReplacementPromptUpdatePromptUpdateDetails_I)boundc                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )LightOnOCRMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsreturnc                    s   t  j||||d}|d}|d urQ| j }| j }| }	|	|j}
|	|j}t	
|t	|
|g }|| d|d< d|v rQ|d | d|d< |d}|d urt|d }t|t|kshJ dd t||D |d< |S )	N)r"   r#   r$   r%   	input_idsr   attention_maskpixel_valuesimage_sizesc                 S   s.   g | ]\}\}}|d d d |d |f qS )N ).0phwr+   r+   [/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/lightonocr.py
<listcomp>V   s    "zDLightOnOCRMultiModalProcessor._call_hf_processor.<locals>.<listcomp>)super_call_hf_processorgetinfoget_hf_processorget_tokenizer	get_vocabimage_break_tokenimage_end_tokentorchisintensor	unsqueezelenzip)selfr"   r#   r$   r%   processed_outputsr'   	processor	tokenizervocabbreak_idend_id	keep_maskr)   r*   	__class__r+   r0   r3   -   sD   




z0LightOnOCRMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tdtddS )Nimage)r)   image_embeds)dictr   batched)rA   rK   rL   r+   r+   r0   _get_mm_fields_config\   s   z3LightOnOCRMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sP   | j  }|jt|jtsJ t| dtf fdd}tdg|dgS )Nitem_idxc                    sF    dt}|| } j|j|jd\}}g||  }t|S )NrM   )image_widthimage_height)	get_itemsr   get_image_sizeget_patch_grid_sizewidthheightr   select_token_id)rT   imagessizencolsnrowstokensencoder_infoimage_token_idrR   r+   r0   replacer   s   

zBLightOnOCRMultiModalProcessor._get_prompt_updates.<locals>.replacerM   )modalitytargetreplacement)	r5   get_hf_configimage_token_index
isinstancevision_configr   r   intr   )rA   rR   rL   rS   	hf_configre   r+   rb   r0   _get_prompt_updatesf   s   
z1LightOnOCRMultiModalProcessor._get_prompt_updates)__name__
__module____qualname__strr   objectr   r3   r   rQ   r   r   r   r   ro   __classcell__r+   r+   rI   r0   r!   ,   s8    


/




r!   cacher5   dummy_inputsrw   c                C   s   t | tsJ t| ||dS )Nrv   )rk   r   r!   )r5   rx   rw   r+   r+   r0   _build_LightOnOCR_processor   s   ry   )r5   rx   c                   @   s`   e Zd ZeddddddZddd	ed
eddfddZdee	ee
jf  dee fddZdS )"LightOnOCRForConditionalGenerationzvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.zlanguage_model.model.)zmodel.vision_encoder.zmodel.vision_projection.zlm_head.zmodel.language_model.)orig_to_new_prefix )prefixvllm_configr}   r&   Nc                C   s   t j|  |jj}|j}|jj}|| _|| _t||dt	|dd| _
t|jj|jj|j|j|jj|j|t	|dd| _t||jt	|dd| _| jj| _d S )NFvision_tower)quant_configrequire_post_normr}   multi_modal_projector)vision_hidden_sizetext_hidden_sizeprojector_hidden_actspatial_merge_size
patch_sizemultimodal_projector_biasr   r}   language_model)r~   rn   r}   )nnModule__init__model_configrn   r   multimodal_configconfigr   r   r   r   rl   hidden_sizetext_configr   r   r   r   r   r   r   make_empty_intermediate_tensors)rA   r~   r}   r   r   r   r+   r+   r0   r      s:   z+LightOnOCRForConditionalGeneration.__init__weightsc                 C   s   t | }|j|| jdS )N)mapper)r   load_weightshf_to_vllm_mapper)rA   r   loaderr+   r+   r0   r      s   z/LightOnOCRForConditionalGeneration.load_weights)rp   rq   rr   r   r   r   rs   r   r   tupler;   Tensorsetr   r+   r+   r+   r0   rz      s    	(&rz   )1collections.abcr   r   r   typingr   r;   torch.nnr   transformersr   r   vllm.configr   #vllm.model_executor.models.mistral3r	   r
   r   r   r   r   "vllm.model_executor.models.pixtralr    vllm.model_executor.models.utilsr   r   r   r   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r   r   r!   ry   register_processorrz   r+   r+   r+   r0   <module>   s<    [

