o
    
۾i!!                     @   sT  d Z ddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ  ddlm!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* G dd de%Z+G dd deZ,G dd dee+ Z-G dd de#Z.ej/e-e+e.dG dd de!Z0dS )zDInference-only OpenCUA-7B model compatible with HuggingFace weights.    )MappingSequence)AnyN)BatchFeature)Qwen2VLImageProcessorQwen2VLProcessorQwen2VLVideoProcessor)
VllmConfig)MULTIMODAL_REGISTRY)MultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseMultiModalProcessorPromptReplacementPromptUpdate)TokenizerLike   )Qwen2_5_VisionTransformer)"Qwen2_5_VLForConditionalGeneration)Qwen2VLDummyInputsBuilderQwen2VLMultiModalDataParserQwen2VLProcessingInfo_create_qwen2vl_field_factory)WeightsMapperinit_vllm_registered_modelmaybe_prefixc                   @   sD   e Zd Zdd Zdd ZdeeedB f fddZd	e	fd
dZ
dS )OpenCUAProcessingInfoc                 C   s   t |  jj|  dS )N)expected_hidden_size)r   get_hf_configvision_configspatial_merge_size_get_expected_hidden_sizeself r$   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/opencua.pyget_data_parser8   s   
z%OpenCUAProcessingInfo.get_data_parserc                 C   s
   | j  S N)ctxr   r"   r$   r$   r%   r   >   s   
z#OpenCUAProcessingInfo.get_hf_configreturnNc                 C   s   dd iS )Nimager$   r"   r$   r$   r%   get_supported_mm_limitsA   s   z-OpenCUAProcessingInfo.get_supported_mm_limitskwargsc                 K   s&   |   }| j }td||d|S )zLoad OpenCUA processor.)r   	tokenizerNr$   )get_tokenizerr(   get_hf_image_processor_configOpenCUAProcessor)r#   r,   r-   r   r$   r$   r%   get_hf_processorD   s   
z&OpenCUAProcessingInfo.get_hf_processor)__name__
__module____qualname__r&   r   r   strintr+   objectr1   r$   r$   r$   r%   r   7   s
    r   c                       sP   e Zd Zdededdf fddZdedef fd	d
Z			dddZ	  Z
S )r0   attribute_nameargr)   Nc                    s   |dkrd S t  ||S )Nr-   )supercheck_argument_for_proper_class)r#   r8   r9   	__class__r$   r%   r;   P   s   z0OpenCUAProcessor.check_argument_for_proper_classr   r-   c                    sN   t di |}tdi |}|dd }t jd||||d| d| _d S )Nchat_template)image_processorr-   video_processorr>   <|media_placeholder|>r$   )r   r   popr:   __init__image_token)r#   r   r-   r,   r?   r@   r>   r<   r$   r%   rC   U   s   
zOpenCUAProcessor.__init__c                 K   s   |d urt |ts|g}| j|fi |}ni }i }|d ur5t |ts&|g}t|dkr5| j||p2dd}i ||}t||dS )Nr   pt)return_tensors)tensor_type)
isinstancelistr-   lenr?   r   )r#   textimagesrF   r,   text_inputsimage_inputscombined_inputsr$   r$   r%   __call__i   s   

zOpenCUAProcessor.__call__)NNN)r2   r3   r4   r5   r7   r;   dictr   rC   rP   __classcell__r$   r$   r<   r%   r0   O   s    r0   c                
   @   s   e Zd Zdedeeef deeef fddZdede	deeef deeef de
f
d	d
Zde	deeef dedee fddZdS )OpenCUAMultiModalProcessor	hf_inputshf_processor_mm_kwargsr)   c                 C   s   t | j jj|S r'   )r   infor   r   r    )r#   rT   rU   r$   r$   r%   _get_mm_fields_config   s
   z0OpenCUAMultiModalProcessor._get_mm_fields_configprompt_textmm_itemstokenization_kwargsc                 C   s   dS )u<   vLLM이 prompt 업데이트를 처리하도록 False 반환.Fr$   )r#   rX   rY   rU   rZ   r$   r$   r%   _hf_processor_applies_updates   s   z8OpenCUAMultiModalProcessor._hf_processor_applies_updatesout_mm_kwargsc                    s   | j jdi |}| j jdi |}| j  }| }| j  }t|dd}	||	t|dd |jd dt	f fdd}
t
d	 g|
d
gS )NrD   rA   media_placeholder_token_idipP    item_idxc                    s@   d |  }|d j }t|tjsJ t|  } g| S )Nr*   image_grid_thw)datarH   torchTensorr6   prod)r_   out_itemgrid_thw
num_tokensimage_token_idmerge_lengthr\   r$   r%   get_replacement_opencua   s
   

zOOpenCUAMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_opencuar*   )modalitytargetreplacementr$   )rV   r1   get_image_processorr.   	get_vocabr   getattrget
merge_sizer6   r   )r#   rY   rU   r\   hf_processorr?   r-   vocab	hf_configimage_token_strrk   r$   rh   r%   _get_prompt_updates   s$   



	z.OpenCUAMultiModalProcessor._get_prompt_updatesN)r2   r3   r4   r   r   r5   r7   r   rW   r   boolr[   r   r   r   r   rx   r$   r$   r$   r%   rS      s8    


	




rS   c                   @   s&   e Zd Zdeeef defddZdS )OpenCUADummyInputsBuilder	mm_countsr)   c                 C   s   | dd}d}|| S )Nr*   r   rA   )rr   )r#   r{   
num_imagesrD   r$   r$   r%   get_dummy_text   s   z(OpenCUADummyInputsBuilder.get_dummy_textN)r2   r3   r4   r   r5   r6   r}   r$   r$   r$   r%   rz      s    rz   )rV   dummy_inputsc                   @   sn   e Zd Zg dddgdZeddddddd	Zd
Zedede	dedB fddZ
dddedefddZdS )OpenCUAForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvisual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zvision_tower.zlm_head.zmodel.)orig_to_new_prefixTrl   ir)   Nc                 C   s   | drdS td)Nr*   rA   z Only image modality is supported)
startswith
ValueError)clsrl   r   r$   r$   r%   get_placeholder_str   s   
z3OpenCUAForConditionalGeneration.get_placeholder_str )prefixvllm_configr   c                C   s   t j|  |jj}|j}|jj}|jdk| _|| _	|| _
|| _|| _| | _| |d t|jt|dd| jt|dd| _W d    n1 sKw   Y  | | t||jt|ddgd	| _W d    n1 snw   Y  | jj| _d S )
Nra   r*   rms_norm_epsgư>visual)r   norm_epsquant_configr   language_modelQwen2ForCausalLM)r   rv   r   architectures)nnModulerC   model_configrv   r   multimodal_configmm_encoder_tp_modeuse_data_parallelconfigr   is_multimodal_pruning_enabled_mark_tower_modelOpenCUAVisionTransformerr   rq   r   r   _mark_language_modelr   text_configr   make_empty_intermediate_tensors)r#   r   r   r   r   r   r$   r$   r%   rC      s:   


	z(OpenCUAForConditionalGeneration.__init__)r2   r3   r4   packed_modules_mappingr   hf_to_vllm_mappersupports_encoder_tp_dataclassmethodr5   r6   r   r	   rC   r$   r$   r$   r%   r      s     
r   )1__doc__collections.abcr   r   typingr   rb   torch.nnr   transformersr   transformers.models.qwen2_vlr   r   r   vllm.configr	   vllm.multimodalr
   vllm.multimodal.inputsr   r   vllm.multimodal.parser   vllm.multimodal.processingr   r   r   vllm.tokenizersr   
qwen2_5_vlr   r   r   qwen2_vlr   r   r   r   utilsr   r   r   r   r0   rS   rz   register_processorr   r$   r$   r$   r%   <module>   s6   69	