o
    
۾i&[                     @   s  d dl Z d dlmZmZmZ d dlmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZA eeBZCG dd de4ZDeDZEG dd de)ZFG dd de&eF ZGG dd de(eF ZHG dd  d e
jIZJejKeHeFeGd!G d"d# d#e
jIe9e:e8ZLdS )$    N)IterableMappingSequence)	AnnotatedAnyLiteral)nn)BatchFeatureGemma3ConfigGemma3Processor)Gemma3ProcessorKwargs)
VllmConfig)BaseDummyOptions)init_logger)GemmaRMSNorm)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilder)	BaseMultiModalProcessorBaseProcessingInfoMultiModalPromptUpdates"MultiModalPromptUpdatesApplyResultPlaceholderFeaturesInfoPromptReplacementPromptUpdatePromptUpdateDetailsreplace_token_matches)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixc                   @   sT   e Zd ZU dZdZed ed< eej	e
ddddf ed< eej	e
df ed	< d
S )Gemma3ImagePixelInputsa  
    Dimensions:
        - p: Number of patches total (over each image over each prompt in the
          batch)
        - c: Number of channels (3)
        - h: Height of each patch
        - w: Width of each patch
        - bn: Batch size * number of images
    pixel_valuestypep   hwbnnum_patchesN)__name__
__module____qualname____doc__r2   r   __annotations__r   torchTensorr%    r@   r@   X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/gemma3_mm.pyr0   8   s
   
 
r0   c                	   @   s   e Zd Zdd ZdefddZdeeedB f fdd	Z	d
e
dee deeef fddZdeded
e
dB defddZdeded
e
dB dee fddZdeded
e
dB defddZdefddZdS )Gemma3ProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr
   selfr@   r@   rA   rE   N   s   z"Gemma3ProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rC   )rD   get_hf_processorr   )rG   rH   r@   r@   rA   rI   Q   s   z%Gemma3ProcessingInfo.get_hf_processorreturnNc                 C   s   dd iS )Nimager@   rF   r@   r@   rA   get_supported_mm_limitsT   s   z,Gemma3ProcessingInfo.get_supported_mm_limits	processorkeysc                    sF   |j |jt|jjd}|d dtffdd  fdd|D S )N)tokenizer_init_kwargsimages_kwargskeyc                    s   t  | }|d u r|  }|S rC   )getattr)rQ   val)image_processorrP   r@   rA   _resolve_kwd   s   
z?Gemma3ProcessingInfo._resolve_image_kwargs.<locals>._resolve_kwc                    s   i | ]}| |qS r@   r@   ).0k)rU   r@   rA   
<dictcomp>k   s    z>Gemma3ProcessingInfo._resolve_image_kwargs.<locals>.<dictcomp>)rT   _merge_kwargsr   	tokenizerinit_kwargsstr)rG   rM   rN   rH   r@   )rU   rT   rP   rA   _resolve_image_kwargsW   s   z*Gemma3ProcessingInfo._resolve_image_kwargsimage_widthimage_heightc                C   s>  |d u r|   }| |h d}|d }|d }|d }|d }|s$dS td ||krW|| |k r5dS ttt|| tt|| d }	td	|	}	t||	}	d
}
n)|| |k r_dS ttt|| tt|| d }
td	|
}
t||
}
d
}	tt	||	 }tt	||
 }t|||k rdS |	|
 S )N>   do_pan_and_scanpan_and_scan_max_num_cropspan_and_scan_min_crop_size"pan_and_scan_min_ratio_to_activater`   rb   ra   rc   r   zk`do_pan_and_scan=True` has suboptimal results on V1 because of the simplified attention pattern being used.      ?   r&   )
rI   r]   loggerwarning_onceminintmathfloormaxceil)rG   r^   r_   rM   rP   r`   rb   ra   rc   num_crops_wnum_crops_hcrop_size_wcrop_size_hr@   r@   rA   get_num_cropsm   sR   




z"Gemma3ProcessingInfo.get_num_cropsc                   s   |d u r|   }|j | j|||d}|dkr }nd fddt|D }d  d| }| |j}|j}| }	|	|j	 }
t
||
S )Nr^   r_   rM   r    c                 3       | ]} V  qd S rC   r@   rV   _	boi_tokenr@   rA   	<genexpr>       z6Gemma3ProcessingInfo.get_image_repl.<locals>.<genexpr>zHere is the original image z0 and here are some crops to help you see better )rI   ry   rr   joinrangereplacefull_image_sequencerZ   	get_vocabimage_tokenr!   select_token_id)rG   r^   r_   rM   	num_crops
image_textcrops_image_tokens	repl_fullrZ   vocabimage_token_idr@   rx   rA   get_image_repl   s(   
z#Gemma3ProcessingInfo.get_image_replc                C   s2   |d u r|   }| j|||d}|j}|d | S )Nrs   r&   )rI   rr   image_seq_length)rG   r^   r_   rM   r   image_seq_lenr@   r@   rA   get_num_image_tokens   s   z)Gemma3ProcessingInfo.get_num_image_tokensc                 C   s>   |   }| |dh}|d }|  j}|j}t|| |dS )Nra   )heightwidth)rI   r]   rE   vision_config
image_sizer   )rG   rM   rP   max_num_cropsr   native_sizer@   r@   rA   !get_image_size_with_most_features   s   
z6Gemma3ProcessingInfo.get_image_size_with_most_features)r9   r:   r;   rE   objectrI   r   r\   ri   rL   r   setdictr   r]   rr   r!   r   r   r   r   r@   r@   r@   rA   rB   M   sN    


E
#
rB   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Gemma3DummyInputsBuilder	mm_countsrJ   c                 C   s$   | dd}| j }|j}|| S )NrK   r   )getinforI   ry   )rG   r   
num_imagesrM   r   r@   r@   rA   get_dummy_text   s   
z'Gemma3DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )NrK   r   )r   r   r   	overrides)r   r   r   _get_dummy_images)rG   r   r   r   r   target_widthtarget_heightimage_overridesr@   r@   rA   get_dummy_mm_data   s   z*Gemma3DummyInputsBuilder.get_dummy_mm_datarC   )
r9   r:   r;   r   r\   ri   r   r   r   r   r@   r@   r@   rA   r      s    
r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdee dedeee ef f fddZdee dedeeee f f fddZ  ZS )Gemma3MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrJ   c           
         s   t  ||||}|d }d urKjjd|idd}|dtfddttD }jj	d
i |  fdd|D }	t
|	d |d	< |S )NimagesrK   F)validatec                    s   g | ]}  |qS r@   )get_image_size)rV   i)parsed_imagesr@   rA   
<listcomp>'  s    
z@Gemma3MultiModalProcessor._call_hf_processor.<locals>.<listcomp>c                    s"   g | ]}j j|j|j d qS )rs   )r   rr   r   r   )rV   size)hf_processorrG   r@   rA   r   ,  s    r&   r8   r@   )super_call_hf_processorr   r   parse_mm_data	get_itemsr   r}   lenrI   r>   tensor)
rG   r   r   r   r   processed_outputsr   mm_itemsimage_sizesr   	__class__)r   r   rG   rA   r     s$   

z,Gemma3MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s,   | dtd}ttd|tddS )Nr8   r   rK   )r1   r8   )r   r>   emptyr   r   flat_from_sizesbatched)rG   r   r   r8   r@   r@   rA   _get_mm_fields_config8  s
   
z/Gemma3MultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc                    s>   j jdi |  j}dtf fdd}td||dgS )Nitem_idxc                    s,    dt}|| }jj|j|j dS )NrK   rs   )r   r   r   r   r   r   r   )r   r   r   r   r   rG   r@   rA   get_replacement_gemma3M  s   
zMGemma3MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_gemma3rK   )modalitytargetreplacementr@   )r   rI   ry   ri   r   )rG   r   r   r   r   r   r@   r   rA   _get_prompt_updatesD  s   z-Gemma3MultiModalProcessor._get_prompt_updatesmm_prompt_updatesc                    s   t  ||\}}| j }| }|d }|d }|d }	|d }
t|||g|	g}t|||g|	g}t|||g|
g}||fS )N









)r   _apply_token_matchesr   get_tokenizerr   r"   )rG   r   r   	token_idsresrZ   r   	newline_1	newline_2	newline_3	newline_4r   r@   rA   r   _  s.   
z.Gemma3MultiModalProcessor._apply_token_matchesnew_token_idsc           
         s   | j  }| }|d  |d |d |d dtdtt f fdd}tt  }tt  t|D ]\}||}|| fd	d
tt|D  q7t	 
||}	fdd|	 D S )Nr   r   r   r   tokrJ   c                    s&   | kr gS | krgS | gS rC   r@   )r   )r   r   r   r   r@   rA   get_repl_toks  s
   zFGemma3MultiModalProcessor._find_mm_placeholders.<locals>.get_repl_toksc                 3   ru   rC   r@   rv   )orig_idxr@   rA   rz     r{   zBGemma3MultiModalProcessor._find_mm_placeholders.<locals>.<genexpr>c                    s$   i | ]\}}| fd d|D qS )c              	      s,   g | ]}t |j|j |j |j|jd qS ))r   r   	start_idxtokensis_embed)r   r   r   r   r   r   )rV   r3   repl_orig_idxsr@   rA   r     s    zNGemma3MultiModalProcessor._find_mm_placeholders.<locals>.<dictcomp>.<listcomp>r@   )rV   r   placeholdersr   r@   rA   rX     s    zCGemma3MultiModalProcessor._find_mm_placeholders.<locals>.<dictcomp>)r   r   r   ri   list	enumerateextendr}   r   r   _find_mm_placeholdersitems)
rG   r   r   rZ   r   r   repl_token_idsorig_tok	repl_toksreplsr   )r   r   r   r   r   r   rA   r     s"   
 


"
z/Gemma3MultiModalProcessor._find_mm_placeholders)r9   r:   r;   r\   r   r   r	   r   r   r   r   r   r   r   r    r   r   ri   r   tupler   r   r   r   __classcell__r@   r@   r   rA   r     sT    


#




$r   c                       s2   e Zd Zdef fddZdejfddZ  ZS )Gemma3MultiModalProjectorconfigc                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )N)epsrd   )kernel_sizestride)r   __init__r   	Parameterr>   zerosr   hidden_sizetext_configmm_input_projection_weightr   layer_norm_epsmm_soft_emb_normri   r   
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider   	AvgPool2davg_pool)rG   r   r   r@   rA   r     s"   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr&   re   )shape	transposereshaper   
contiguousr   flattenr   r>   matmulr   type_as)	rG   r   
batch_sizerw   
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsr@   r@   rA   forward  s   



z!Gemma3MultiModalProjector.forward)	r9   r:   r;   r
   r   r>   r?   r  r   r@   r@   r   rA   r     s    r   )r   dummy_inputsc                       s  e Zd Zg dddgdZeddddd	d
ZededededB fddZ	ddde
def fddZedd ZdededB fddZdedejdejfddZd edeej fd!d"Zdedefd#d$Z	d@dd%d&d'ejd(edB d)ejdB d*edejf
 fd+d,Z		dAd'ejdB d-ejd.edB d/ejdB dedefd0d1Zd2ejdejdB fd3d4Zd5eeeejf  de e fd6d7Z!de"fd8d9Z#d:edefd;d<Z$d=edefd>d?Z%  Z&S )BGemma3ForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   r   rJ   Nc                 C   s   | drdS td)NrK   z<start_of_image>z Only image modality is supported)
startswith
ValueError)clsr   r   r@   r@   rA   get_placeholder_str  s   
z2Gemma3ForConditionalGeneration.get_placeholder_str prefixvllm_configr  c                   s   t    |jj}|j}|jj}|| _|| _|| _| |d t|j	|t
|dd| _t|| _W d    n1 s;w   Y  | |% t||jt
|ddgd| _t|dd}| jj j|9  _W d    n1 smw   Y  | jj| _d S )	NrK   vision_towerr  language_modelGemma3ForCausalLM)r   	hf_configr  architectureslogit_scaleg      ?)r   r   model_configr$  quant_configmultimodal_configr   _mark_tower_modelr+   r   r/   r!  r   multi_modal_projector_mark_language_modelr.   r   r"  rR   logits_processorscalemake_empty_intermediate_tensors)rG   r   r  r   r(  r)  r&  r   r@   rA   r     s6   
z'Gemma3ForConditionalGeneration.__init__c                 C   s   t |  jS rC   )next
parametersdtyperF   r@   r@   rA   r2  !  s   z$Gemma3ForConditionalGeneration.dtyperH   c                 K   s^   | dd }| dd }| dd }|d u sJ d|d u r d S | jjj}t||||ddS )Nr1   r8   image_embedsz%Gemma3 does not support image_embeds.)r5   r6   )r1   r8   resolve_bindings)popr   r   r   r0   )rG   rH   r1   r8   r3  r   r@   r@   rA   _parse_and_validate_image_input%  s   
z>Gemma3ForConditionalGeneration._parse_and_validate_image_inputr!  r1   c                 C   s   ||S rC   r@   )rG   r!  r1   r@   r@   rA   _image_pixels_to_features7  s   z8Gemma3ForConditionalGeneration._image_pixels_to_featuresimage_inputc                 C   s@   |d }|d }|  | j|}| |}dd || D S )Nr1   r8   c                 S   s   g | ]}| d dqS )r   r&   )r  )rV   er@   r@   rA   r   K  s    zGGemma3ForConditionalGeneration._process_image_input.<locals>.<listcomp>)r7  r!  r+  splittolist)rG   r8  r1   r8   image_featuresr3  r@   r@   rA   _process_image_input>  s   
z3Gemma3ForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S )Nr@   )r6  r=  )rG   rH   r8  r@   r@   rA   embed_multimodalM  s   
z/Gemma3ForConditionalGeneration.embed_multimodalT)is_multimodalhandle_oov_mm_token	input_idsmultimodal_embeddingsr?  r@  c                   s0   |d u s|d u rt  |S t  j||||dS )N)rB  r?  r@  )r   embed_input_ids)rG   rA  rB  r?  r@  r   r@   rA   rC  T  s   	z.Gemma3ForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s,   |d urd }| j j|||fd|i|}|S )NrF  )r"  model)rG   rA  rD  rE  rF  rH   hidden_statesr@   r@   rA   r  h  s   z&Gemma3ForConditionalGeneration.forwardrH  c                 C   s   | j |S rC   )r"  compute_logits)rG   rH  r@   r@   rA   rI  }  s   z-Gemma3ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r,   load_weightshf_to_vllm_mapper)rG   rJ  loaderr@   r@   rA   rL    s   z+Gemma3ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r"  r+  r!  )r"  	connectortower_model)r   from_string_fieldrF   r@   r@   rA   get_mm_mapping  s
   z-Gemma3ForConditionalGeneration.get_mm_mappingnum_image_tokensc                 C   s   |d S )a  
        Calculate the number of tokens output by the vision encoder.

        The vision encoder processes images into patch embeddings. For Gemma3,
        the relationship between prompt placeholder tokens and actual vision
        encoder output tokens depends on the patch grid size.

        Args:
            num_image_tokens: Number of image placeholder tokens in the prompt
                              (typically mm_tokens_per_image per image)

        Returns:
            Number of tokens output by the vision encoder
           r@   )rG   rS  r@   r@   rA   get_num_mm_encoder_tokens  s   z8Gemma3ForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s   |S )a_  
        Calculate the number of tokens output by the multimodal connector.

        The connector applies projection and normalization but maintains the
        token count for Gemma3.

        Args:
            num_vision_tokens: Number of tokens from vision encoder

        Returns:
            Number of tokens after connector processing
        r@   )rG   rV  r@   r@   rA   get_num_mm_connector_tokens  s   z:Gemma3ForConditionalGeneration.get_num_mm_connector_tokensrC   )NN)'r9   r:   r;   packed_modules_mappingr-   rM  classmethodr\   ri   r  r   r   propertyr2  r   Gemma3ImageInputsr6  r+   r>   r?   r7  r   r=  r'   r>  boolrC  r#   r  rI  r   r   r   rL  r   rR  rU  rW  r   r@   r@   r   rA   r    s    	
 






$
r  )Mrj   collections.abcr   r   r   typingr   r   r   r>   r   transformersr	   r
   r   ,transformers.models.gemma3.processing_gemma3r   vllm.configr   vllm.config.multimodalr   vllm.loggerr   $vllm.model_executor.layers.layernormr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   $vllm.multimodal.processing.processorr   r   r   r   r   r   r    r!   r"   vllm.sequencer#   vllm.utils.tensor_schemar$   r%   
interfacesr'   r(   r)   r*   siglipr+   utilsr,   r-   r.   r/   r9   rf   r0   r[  rB   r   r   Moduler   register_processorr  r@   r@   r@   rA   <module>   sN   , ) ,


