o
    
۾i6                     @   s  U d dl mZmZmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dlmZmZmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ee<Z=G dd de*Z>G dd de*Z?e>e?B Z@eeAd< G dd de	jBZCG dd de"ZDG dd de eD ZEG d d! d!e!eD ZFejGeFeDeEd"G d#d$ d$e	jBe.e/e0ZHdS )%    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)nn)BatchFeaturePaliGemmaConfig)
VllmConfig)BaseDummyOptions)init_logger)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptInsertionPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vision_encoder_infoc                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< d	S )
PaliGemmaImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypebn   hwdataN__name__
__module____qualname____doc__r/   r   __annotations__r   torchTensorr     r=   r=   X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/paligemma.pyr-   9   s   
  r-   c                   @   s<   e Zd ZU dZdZed ed< eej	e
dddf ed< dS )	PaliGemmaImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsr/   r0   ifshsr4   Nr5   r=   r=   r=   r>   r?   F   s   
 r?   PaliGemmaImageInputsc                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	PaliGemmaMultiModalProjectorvision_hidden_sizeprojection_dimc                    s    t    tj||dd| _d S )NT)bias)super__init__r   Linearlinear)selfrE   rF   	__class__r=   r>   rI   X   s   
z%PaliGemmaMultiModalProjector.__init__image_featuresreturnc                 C   s   |  |}|S N)rK   )rL   rO   hidden_statesr=   r=   r>   forward]   s   
z$PaliGemmaMultiModalProjector.forward)	r6   r7   r8   intrI   r;   r<   rS   __classcell__r=   r=   rM   r>   rD   W   s    rD   c                   @   sL   e Zd Zdd Zdd ZdeeedB f fddZd	ed
edefddZ	dS )PaliGemmaProcessingInfoc                 C   s   | j tS rQ   )ctxget_hf_configr
   rL   r=   r=   r>   rX   c      z%PaliGemmaProcessingInfo.get_hf_configc                 C   s   t |  S rQ   )r,   rX   rY   r=   r=   r>   r,   f   rZ   z/PaliGemmaProcessingInfo.get_vision_encoder_inforP   Nc                 C   s   ddiS )Nimager!   r=   rY   r=   r=   r>   get_supported_mm_limitsi   s   z/PaliGemmaProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s   |   }|j||dS )Nr]   r^   )r,   get_num_image_tokens)rL   r]   r^   vision_encoder_infor=   r=   r>   r`   l   s
   z,PaliGemmaProcessingInfo.get_num_image_tokens)
r6   r7   r8   rX   r,   r   strrT   r\   r`   r=   r=   r=   r>   rV   b   s    rV   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )PaliGemmaDummyInputsBuilder	mm_countsrP   c                 C   s   dS )N r=   )rL   rd   r=   r=   r>   get_dummy_text{      z*PaliGemmaDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sJ   | j  }|j}|j}|dd}|r|dnd }d| j||||diS )Nr[   r   )widthheight
num_images	overrides)inforX   vision_config
image_sizeget_get_dummy_images)	rL   rh   rd   ri   	hf_configro   max_image_sizerl   image_overridesr=   r=   r>   get_dummy_mm_data~   s   
z-PaliGemmaDummyInputsBuilder.get_dummy_mm_datarQ   )
r6   r7   r8   r   rb   rT   rf   r   r   rv   r=   r=   r=   r>   rc   z   s    
rc   c                       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ		ddeee B de
d	eeef deeef dB dedB def fddZ  ZS )PaliGemmaMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrP   c                    sD   | j  }|s|j|dd}tt|gdddS t j||||dS )NF)add_special_tokens)	input_idspt)tensor_type)rx   ry   rz   r{   )rn   get_tokenizerencoder	   dictrH   _call_hf_processor)rL   rx   ry   rz   r{   	tokenizer
prompt_idsrM   r=   r>   r      s   
z/PaliGemmaMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tddS )Nr[   )r.   )r   r   batched)rL   r   r   r=   r=   r>   _get_mm_fields_config   s   z2PaliGemmaMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sh   j  }|jj  }|j t tsJ dtf fdd}tdt	|j
r- gng |dgS )Nitem_idxc                    sb    dttf}t|tr|| }n|| }jj|j|j	d}g| }t
j| g dS )Nr[   r_   )embed_token_id)	get_itemsr   r   
isinstanceget_feature_sizeget_image_sizern   r`   rj   rk   r   select_token_id)r   imagesnum_image_tokensrp   image_tokensbos_token_idimage_token_idr   rL   r=   r>   get_insertion   s   


zGPaliGemmaMultiModalProcessor._get_prompt_updates.<locals>.get_insertionr[   )modalitytarget	insertion)rn   rX   image_token_indexr   r   r   rT   r   r   prefixadd_bos_token)rL   r   r   r   rs   r   r   r=   r   r>   _get_prompt_updates   s   

z0PaliGemmaMultiModalProcessor._get_prompt_updatesNtokenization_kwargsmm_uuidsc                    sd   t  j|||||d}|d }| j }d}	||	d }
t|r0|d |
kr0||
 ||d< |S )N)r   prompt_token_ids
)rH   applyrn   r   r   lenappend)rL   rx   r   r   r   r   	mm_inputsr   r   newline_promptnewline_token_idrM   r=   r>   r      s   

z"PaliGemmaMultiModalProcessor.applyNN)r6   r7   r8   rb   r   objectr	   r   r   r   r   r   r   r   r   listrT   r   r   r   rU   r=   r=   rM   r>   rw      sV    







5

rw   )rn   dummy_inputsc                       sp  e Zd Zg dddgdZeddddd	d
ZededededB fddZ	ddde
def fddZdededB fddZdedejdejfddZdedejfdd Zdedefd!d"Z		d7d#ejdB d$ejd%edB d&ejdB dedefd'd(Zd)ejdejdB fd*d+Zd,eeeejf  dee fd-d.Zdefd/d0Zd1edefd2d3Zd4edefd5d6Z   Z!S )8!PaliGemmaForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   irP   Nc                 C   s   | drd S td)Nr[   z Only image modality is supported)
startswith
ValueError)clsr   r   r=   r=   r>   get_placeholder_str  s   
z5PaliGemmaForConditionalGeneration.get_placeholder_strre   r   vllm_configr   c                   s  t    |jj}|j}|jj}|| _|| _|| _| |d t|j	|t
|dd| _t|j	j|j	jd| _W d    n1 sAw   Y  |jjdkrRdg|j_ndg|j_| |# t||jt
|dd	| _t|d
d}| jj j|9  _W d    n1 sw   Y  | jj| _d S )Nr[   vision_towerr   )rE   rF   gemmaGemmaForCausalLMGemma2ForCausalLMlanguage_model)r   rs   r   logit_scaleg      ?)rH   rI   model_configrs   quant_configmultimodal_configconfig_mark_tower_modelr'   ro   r+   r   rD   hidden_sizerF   multi_modal_projectortext_config
model_typearchitectures_mark_language_modelr*   r   getattrlogits_processorscalemake_empty_intermediate_tensors)rL   r   r   r   r   r   r   rM   r=   r>   rI   #  s@   


z*PaliGemmaForConditionalGeneration.__init__kwargsc                 K   sr   | dd }| dd }|d u r|d u rd S |d ur+| jjj }}td|||ddS |d ur5td|dS td)Nr.   r@   )r2   r3   )r/   r4   resolve_bindings)r/   r4   z This line should be unreachable.)popr   ro   rp   r-   r?   AssertionError)rL   r   r.   r@   r2   r3   r=   r=   r>   _parse_and_validate_image_inputJ  s"   zAPaliGemmaForConditionalGeneration._parse_and_validate_image_inputr   r.   c                 C   s    |  jj}||j|d}|S )N)dtype)get_input_embeddingsweightr   to)rL   r   r.   target_dtyperO   r=   r=   r>   _image_pixels_to_featuresd  s   z;PaliGemmaForConditionalGeneration._image_pixels_to_featuresimage_inputc                 C   s4   |d dkr
|d S |d }|  | j|}| |S )Nr/   r@   r4   )r   r   r   )rL   r   r.   rO   r=   r=   r>   _process_image_inputn  s   
z6PaliGemmaForConditionalGeneration._process_image_inputc                 K   s:   | j di |}|d u rg S | |}|| jjd  }|S )Ng      r=   )r   r   r   r   )rL   r   r   vision_embeddingsr=   r=   r>   embed_multimodal}  s   
z2PaliGemmaForConditionalGeneration.embed_multimodalr}   	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r   )r   model)rL   r}   r   r   r   r   rR   r=   r=   r>   rS     s   z)PaliGemmaForConditionalGeneration.forwardrR   c                 C   s   | j |S rQ   )r   compute_logits)rL   rR   r=   r=   r>   r     s   z0PaliGemmaForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r(   load_weightshf_to_vllm_mapper)rL   r   loaderr=   r=   r>   r     s   z.PaliGemmaForConditionalGeneration.load_weightsc                 C   s   t jddddS )Nr   r   r   )r   	connectortower_model)r&   from_string_fieldrY   r=   r=   r>   get_mm_mapping  s
   z0PaliGemmaForConditionalGeneration.get_mm_mappingr   c                 C      |S rQ   r=   )rL   r   r=   r=   r>   get_num_mm_encoder_tokens  rg   z;PaliGemmaForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   r   rQ   r=   )rL   r   r=   r=   r>   get_num_mm_connector_tokens  rg   z=PaliGemmaForConditionalGeneration.get_num_mm_connector_tokensr   )"r6   r7   r8   packed_modules_mappingr)   r   classmethodrb   rT   r   r   rI   r   rC   r   r'   r;   r<   r   r   r"   r   r   rS   r   r   tuplesetr   r&   r   r   r   rU   r=   r=   rM   r>   r      st    	
'





$r   )Icollections.abcr   r   r   typingr   r   r   r;   r   transformersr	   r
   vllm.configr   vllm.config.multimodalr   vllm.loggerr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar   r    
interfacesr"   r#   r$   r%   module_mappingr&   siglipr'   utilsr(   r)   r*   r+   visionr,   r6   loggerr-   r?   rC   r:   ModulerD   rV   rc   rw   register_processorr   r=   r=   r=   r>   <module>   sH   $	
h


