o
    -i!                     @   sX  d Z ddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZmZ ddlmZ ddlm Z! ddlm"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ G dd de&Z,G dd deZ-G dd dee, Z.G dd de$Z/ej0e.e,e/dG dd de"Z1dS )zDInference-only OpenCUA-7B model compatible with HuggingFace weights.    )MappingSequence)AnyN)BatchFeature)Qwen2VLImageProcessorQwen2VLProcessorQwen2VLVideoProcessor)
VllmConfig)MULTIMODAL_REGISTRY)MultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseMultiModalProcessorPromptReplacementPromptUpdate)TokenizerLike   )Qwen2_5_VisionTransformer)"Qwen2_5_VLForConditionalGeneration)Qwen2VLDummyInputsBuilderQwen2VLMultiModalDataParserQwen2VLProcessingInfo_create_qwen2vl_field_factory)WeightsMapperinit_vllm_registered_modelmaybe_prefixc                   @   s<   e Zd Zdd ZdeeedB f fddZdefdd	Z	dS )
OpenCUAProcessingInfoc                 C   s
   | j  S N)ctxget_hf_configself r#   _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/opencua.pyr    8   s   
z#OpenCUAProcessingInfo.get_hf_configreturnNc                 C   s   dd iS )Nimager#   r!   r#   r#   r$   get_supported_mm_limits;   s   z-OpenCUAProcessingInfo.get_supported_mm_limitskwargsc                 K   s&   |   }| j }td||d|S )zLoad OpenCUA processor.)vision_config	tokenizerNr#   )get_tokenizerr   get_hf_image_processor_configOpenCUAProcessor)r"   r(   r*   r)   r#   r#   r$   get_hf_processor>   s   
z&OpenCUAProcessingInfo.get_hf_processor)
__name__
__module____qualname__r    r   strintr'   objectr.   r#   r#   r#   r$   r   7   s    r   c                       sP   e Zd Zdededdf fddZdedef fd	d
Z			dddZ	  Z
S )r-   attribute_nameargr%   Nc                    s   |dkrd S t  ||S )Nr*   )supercheck_argument_for_proper_class)r"   r5   r6   	__class__r#   r$   r8   J   s   z0OpenCUAProcessor.check_argument_for_proper_classr)   r*   c                    sN   t di |}tdi |}|dd }t jd||||d| d| _d S )Nchat_template)image_processorr*   video_processorr;   <|media_placeholder|>r#   )r   r   popr7   __init__image_token)r"   r)   r*   r(   r<   r=   r;   r9   r#   r$   r@   O   s   
zOpenCUAProcessor.__init__c                 K   s   |d urt |ts|g}| j|fi |}ni }i }|d ur5t |ts&|g}t|dkr5| j||p2dd}i ||}t||dS )Nr   pt)return_tensors)tensor_type)
isinstancelistr*   lenr<   r   )r"   textimagesrC   r(   text_inputsimage_inputscombined_inputsr#   r#   r$   __call__c   s   

zOpenCUAProcessor.__call__)NNN)r/   r0   r1   r2   r4   r8   dictr   r@   rM   __classcell__r#   r#   r9   r$   r-   I   s    r-   c                
   @   s   e Zd ZdefddZdedeeef deee	f fddZ
ded	edeeef d
eeef def
ddZd	edeeef dedee fddZdS )OpenCUAMultiModalProcessorr%   c                 C   s   t | j jjS r   )r   infor    r)   spatial_merge_sizer!   r#   r#   r$   _get_data_parser   s   z+OpenCUAMultiModalProcessor._get_data_parser	hf_inputshf_processor_mm_kwargsc                 C   s   t | j jj|S r   )r   rQ   r    r)   rR   )r"   rT   rU   r#   r#   r$   _get_mm_fields_config   s
   z0OpenCUAMultiModalProcessor._get_mm_fields_configprompt_textmm_itemstokenization_kwargsc                 C   s   dS )u<   vLLM이 prompt 업데이트를 처리하도록 False 반환.Fr#   )r"   rW   rX   rU   rY   r#   r#   r$   _hf_processor_applies_updates   s   z8OpenCUAMultiModalProcessor._hf_processor_applies_updatesout_mm_kwargsc                    s   | j jdi |}| j jdi |}| j  }| }| j  }t|dd}	||	t|dd |jd dt	f fdd}
t
d	 g|
d
gS )NrA   r>   media_placeholder_token_idipP    item_idxc                    s@   d |  }|d j }t|tjsJ t|  } g| S )Nr&   image_grid_thw)datarE   torchTensorr3   prod)r^   out_itemgrid_thw
num_tokensimage_token_idmerge_lengthr[   r#   r$   get_replacement_opencua   s
   

zOOpenCUAMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_opencuar&   )modalitytargetreplacementr#   )rQ   r.   get_image_processorr+   	get_vocabr    getattrget
merge_sizer3   r   )r"   rX   rU   r[   hf_processorr<   r*   vocab	hf_configimage_token_strrj   r#   rg   r$   _get_prompt_updates   s$   



	z.OpenCUAMultiModalProcessor._get_prompt_updatesN)r/   r0   r1   r   rS   r   r   r2   r4   r   rV   r   boolrZ   r   r   r   r   rw   r#   r#   r#   r$   rP      s:    


	




rP   c                   @   s&   e Zd Zdeeef defddZdS )OpenCUADummyInputsBuilder	mm_countsr%   c                 C   s   | dd}d}|| S )Nr&   r   r>   )rq   )r"   rz   
num_imagesrA   r#   r#   r$   get_dummy_text   s   z(OpenCUADummyInputsBuilder.get_dummy_textN)r/   r0   r1   r   r2   r3   r|   r#   r#   r#   r$   ry      s    ry   )rQ   dummy_inputsc                   @   sn   e Zd Zg dddgdZeddddddd	Zd
Zedede	dedB fddZ
dddedefddZdS )OpenCUAForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvisual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zvision_tower.zlm_head.zmodel.)orig_to_new_prefixTrk   ir%   Nc                 C   s   | drdS td)Nr&   r>   z Only image modality is supported)
startswith
ValueError)clsrk   r   r#   r#   r$   get_placeholder_str   s   
z3OpenCUAForConditionalGeneration.get_placeholder_str )prefixvllm_configr   c                C   s   t j|  |jj}|j}|jj}|jdk| _|| _	|| _
|| _|| _| | _| |d t|jt|dd| jt|dd| _W d    n1 sKw   Y  | | t||jt|ddgd	| _W d    n1 snw   Y  | jj| _d S )
Nr`   r&   rms_norm_epsgư>visual)r)   norm_epsquant_configr   language_modelQwen2ForCausalLM)r   ru   r   architectures)nnModuler@   model_configru   r   multimodal_configmm_encoder_tp_modeuse_data_parallelconfigr   is_multimodal_pruning_enabled_mark_tower_modelOpenCUAVisionTransformerr)   rp   r   r   _mark_language_modelr   text_configr   make_empty_intermediate_tensors)r"   r   r   r   r   r   r#   r#   r$   r@      s:   


	z(OpenCUAForConditionalGeneration.__init__)r/   r0   r1   packed_modules_mappingr   hf_to_vllm_mappersupports_encoder_tp_dataclassmethodr2   r3   r   r	   r@   r#   r#   r#   r$   r~      s     
r~   )2__doc__collections.abcr   r   typingr   ra   torch.nnr   transformersr   transformers.models.qwen2_vlr   r   r   vllm.configr	   vllm.multimodalr
   vllm.multimodal.inputsr   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   vllm.tokenizersr   
qwen2_5_vlr   r   r   qwen2_vlr   r   r   r   utilsr   r   r   r   r-   rP   ry   register_processorr~   r#   r#   r#   r$   <module>   s6   6>	