o
    
۾ilv                     @   sl  d dl mZmZmZ d dlmZmZmZ d dlZ	d dl
Z
d dl
mZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF d dlGmHZH d dlImJZJmKZK ddlLmMZMmNZNmOZO ddlPmQZQmRZRmSZSmTZT e"eUZVdZWdZXG dd  d eJZYG d!d" d"eJZZeYZ[G d#d$ d$e?Z\G d%d& d&e<e\ Z]G d'd( d(e>e\ Z^G d)d* d*ej_Z`e2jae^e\e]d+G d,d- d-ej_eNeOZbdS ).    )IterableMappingSequence)	AnnotatedAnyLiteralN)nn)	AutoModelBatchFeature)Gemma3nAudioConfigGemma3nAudioFeatureExtractorGemma3nConfigGemma3nProcessorGemma3nTextConfigGemma3nVisionConfig)SiglipImageProcessorFast)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptType
TextPrompt)init_logger)RMSNorm)RowParallelLinear)VocabParallelEmbedding)Gemma3nForCausalLM)(adjust_audio_features_to_expected_length)MultiModelKeys)ISO639_1_SUPPORTED_LANGS)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilder)	BaseMultiModalProcessorBaseProcessingInfoMultiModalPromptUpdates"MultiModalPromptUpdatesApplyResultPlaceholderFeaturesInfoPromptReplacementPromptUpdatePromptUpdateDetailsreplace_token_matches)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModalSupportsTranscription)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix      c                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< dS )	Gemma3nImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each patch
        - w: Width of each patch
    pixel_valuestypebn   hwN__name__
__module____qualname____doc__r@   r   __annotations__r   torchTensorr3    rM   rM   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/gemma3n_mm.pyr>   J   s   
  r>   c                   @   sT   e Zd ZU dZdZed ed< eej	e
dddf ed< eej	e
ddf ed< d	S )
Gemma3nAudioInputszs
    Dimensions:
        - bn: Batch size * number of audios
        - s: seq_length
        - f: num_features
    audior@   rA   sfinput_features_paddedinput_features_maskNrE   rM   rM   rM   rN   rO   W   s
   
 rO   c                   @   s   e Zd Zdd ZdefddZdedefddZd	d
 Zde	e
edB f fddZdede	e
ef de	e
ef dB fddZdedededB de
fddZdedB de
fddZdS )Gemma3nProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr   selfrM   rM   rN   rX   h   s   z#Gemma3nProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rV   )rW   get_hf_processorr   rZ   r[   rM   rM   rN   r\   k   s   z&Gemma3nProcessingInfo.get_hf_processorreturnc                 K   s   | j di |jS )NrM   )r\   feature_extractorr]   rM   rM   rN   get_feature_extractorn   s   z+Gemma3nProcessingInfo.get_feature_extractorc                 C   s   |   }t|j|  dS )N)	target_srexpected_hidden_size)r`   r&   sampling_rate_get_expected_hidden_size)rZ   r_   rM   rM   rN   get_data_parserq   s
   z%Gemma3nProcessingInfo.get_data_parserNc                 C   s
   d d dS NimagerP   rM   rY   rM   rM   rN   get_supported_mm_limitsy   s   
z-Gemma3nProcessingInfo.get_supported_mm_limitsseq_len	mm_countsc                 C   s
   t tdS rf   )TOKENS_PER_IMAGETOKENS_PER_AUDIO)rZ   rj   rk   rM   rM   rN   get_max_tokens_per_item|   s   
z-Gemma3nProcessingInfo.get_max_tokens_per_itemimage_widthimage_height	processorc                C   s    |du r|   }t|j|jS )z
        Get the replacement text for image tokens.

        For Gemma3n, this should return the full_image_sequence which includes
        BOI token, repeated image tokens, and EOI token.
        N)r\   r/   select_token_idfull_image_sequenceimage_token_id)rZ   ro   rp   rq   rM   rM   rN   get_image_repl   s
   z$Gemma3nProcessingInfo.get_image_replc                C   s    |du r|   }t|j|jS )z
        Get the replacement text for audio tokens.

        For Gemma3n, this should return the full_audio_sequence which includes
        BOA token, repeated audio tokens, and EOA token.
        N)r\   r/   rr   full_audio_sequenceaudio_token_id)rZ   rq   rM   rM   rN   get_audio_repl   s
   z$Gemma3nProcessingInfo.get_audio_repl)rF   rG   rH   rX   objectr\   r   r`   re   r   strintri   rn   r   ru   rx   rM   rM   rM   rN   rU   g   s6    


rU   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Gemma3nDummyInputsBuilderrk   r^   c                 C   s>   | dd}| dd}| j }|j}|j}|| ||  S )Nrh   r   rP   )getinfor\   image_tokenaudio_token)rZ   rk   
num_images
num_audiosrq   r   r   rM   rM   rN   get_dummy_text   s   
z(Gemma3nDummyInputsBuilder.get_dummy_textNrj   
mm_optionsc                 C   s   | dd}| dd}| j }|j}|j}|j}	|	j dd}
|	j dd}|r/| dnd }|r8| dnd }| j|
|||d| j|||dd	S )
Nrh   r   rP   width   height)r   r   r   	overrides)lengthr   r   rg   )	r}   r~   r\   r_   
fft_lengthimage_processorsize_get_dummy_images_get_dummy_audios)rZ   rj   rk   r   r   r   rq   audio_feature_extractor	audio_lenr   	img_width
img_heightimage_overridesaudio_overridesrM   rM   rN   get_dummy_mm_data   s*   
z+Gemma3nDummyInputsBuilder.get_dummy_mm_datarV   )
rF   rG   rH   r   rz   r{   r   r   r!   r   rM   rM   rM   rN   r|      s    
r|   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdee dedeee ef f fddZdee dedeeee f f fddZ  ZS )Gemma3nMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr^   c                    sd   d|v r| d|d< t ||||}d|v r0|d |d< dd t|d |d D }||d< |S )NaudiosrP   input_featuresrS   c                 S   s   g | ]\}}|| qS rM   rM   ).0rR   maskrM   rM   rN   
<listcomp>   s    zAGemma3nMultiModalProcessor._call_hf_processor.<locals>.<listcomp>rT   )popsuper_call_hf_processorzip)rZ   r   r   r   r   processed_outputsunpadded_features	__class__rM   rN   r      s(   
z-Gemma3nMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s    t tdtdtddS )Nrh   rP   )r?   rS   rT   )dictr"   batched)rZ   r   r   rM   rM   rN   _get_mm_fields_config   s
   z0Gemma3nMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	         s   j jd	i | g }dv r' j}dtf fdd}|td||d dv rB j}dtf fdd}|td||d |S )
Nrh   item_idxc                    s,    dt}|| }jj|j|j dS )Nrh   )ro   rp   rq   )	get_itemsr$   get_image_sizer~   ru   r   r   )r   images
image_sizehf_processorr   rZ   rM   rN   get_replacement_image  s   
zMGemma3nMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_image)modalitytargetreplacementrP   c                    s   j j dS )N)rq   )r~   rx   )r   )r   rZ   rM   rN   get_replacement_audio)  s   zMGemma3nMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_audiorM   )r~   r\   r   r{   appendr-   r   )	rZ   r   r   r   prompt_updatesr   r   r   r   rM   r   rN   _get_prompt_updates  s.   		z.Gemma3nMultiModalProcessor._get_prompt_updatesmm_prompt_updatesc                    s   t  ||\}}| j }| }|d }|d }|d }	|d }
t|||g|	g}t|||g|	g}t|||g|
g}||fS )N









)r   _apply_token_matchesr~   get_tokenizer	get_vocabr0   )rZ   r   r   	token_idsres	tokenizervocab	newline_1	newline_2	newline_3	newline_4r   rM   rN   r   8  s.   
z/Gemma3nMultiModalProcessor._apply_token_matchesnew_token_idsc           
         s   | j  }| }|d  |d |d |d dtdtt f fdd}tt  }tt  t|D ]\}||}|| fd	d
tt|D  q7t	 
||}	fdd|	 D S )Nr   r   r   r   tokr^   c                    s&   | kr gS | krgS | gS rV   rM   )r   )r   r   r   r   rM   rN   get_repl_toksi  s
   zGGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.get_repl_toksc                 3   s    | ]} V  qd S rV   rM   )r   _)orig_idxrM   rN   	<genexpr>v  s    zCGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.<genexpr>c                    s$   i | ]\}}| fd d|D qS )c              	      s,   g | ]}t |j|j |j |j|jd qS ))r   r   	start_idxtokensis_embed)r,   r   r   r   r   r   )r   prepl_orig_idxsrM   rN   r   {  s    zOGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.<dictcomp>.<listcomp>rM   )r   r   placeholdersr   rM   rN   
<dictcomp>z  s    zDGemma3nMultiModalProcessor._find_mm_placeholders.<locals>.<dictcomp>)r~   r   r   r{   list	enumerateextendrangelenr   _find_mm_placeholdersitems)
rZ   r   r   r   r   r   repl_token_idsorig_tok	repl_toksreplsr   )r   r   r   r   r   r   rN   r   \  s"   
 


"
z0Gemma3nMultiModalProcessor._find_mm_placeholders)rF   rG   rH   rz   r   ry   r
   r   r"   r   r%   r   r#   r   r.   r   r   r{   r*   tupler+   r   r,   r   __classcell__rM   rM   r   rN   r      sT    


%




2$r   c                       sX   e Zd ZdZdeeB def fddZ		ddej	dB dej
dB d	ej
fd
dZ  ZS )Gemma3nMultimodalEmbedderzUEmbeds token ids or soft tokens for multimodal content into language
    model space.multimodal_configtext_configc                    s   t    |j| _|j| _|j| _|j| _|j| _t	| j| j| _
t| j| jd| _t| j| jd| _t| j| jdd| _t| j| jdd| _d S )N)epsF)bias)r   
has_weight)r   __init__hidden_sizemultimodal_hidden_sizerms_norm_epsr   vocab_offset
vocab_sizetext_hidden_sizer   	embeddingr   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)rZ   r   r   r   rM   rN   r     s8   
z"Gemma3nMultimodalEmbedder.__init__N	input_idsinputs_embedsr^   c                 C   s^   |du |duA rt d|dur| |}n| || j }| |}| |\}}| |S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nz:You must specify exactly one of input_ids or inputs_embeds)
ValueErrorr   r   r   r   r   r   )rZ   r   r   emb_normhard_embemb_norm_projr   rM   rM   rN   forward  s   

z!Gemma3nMultimodalEmbedder.forwardNN)rF   rG   rH   rI   r   r   r   r   rK   
LongTensorrL   r  r   rM   rM   r   rN   r     s     *r   )r~   dummy_inputsc                       s  e Zd ZeZg dddgdZeddddd	d
ddddZdddede	f fddZ
dededB fddZdededB fddZdedefddZdedeej fdd Zd!edeej fd"d#Zdedefd$d%Z	dKdd&d'd(ejd)edB d*ejdB d+edejf
 fd,d-Z		dLd(ejdB d.ejd/edB d0ejdB dedefd1d2Zd3ejdejdB fd4d5Zd6eee	ejf  de e	 fd7d8Z!de"fd9d:Z#e$d;e	d<e%de	dB fd=d>Z&e$d?e'j(d@e)dAe*dBe	dB dCe+dD dEe	dFe	dB de,fdGdHZ-e$dAe*dCe	de)fdIdJZ.  Z/S )MGemma3nForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzembed_audio.zembed_vision.zlanguage_model.model.zvision_tower.zaudio_tower.zmulti_modal_projector.zlanguage_model.lm_head.zlanguage_model.model)zmodel.embed_audio.zmodel.embed_vision.zmodel.language_model.zmodel.vision_tower.zmodel.audio_tower.zmodel.multi_modal_projector.zlm_head.model)orig_to_new_prefix )prefixvllm_configr  c                   sZ  t    |jj}|j}|jj}|| _|| _|| _|jj| _| 	|d t
j|jd| _t|j|j| _W d    n1 s?w   Y  | 	|d t
j|jd| _t|j|j| _W d    n1 sew   Y  | |4 t||jt|ddgd| _tj|jj| jjj| jjj| jjjjj | jjjjj!d| _"W d    d S 1 sw   Y  d S )Nrh   )configrP   language_modelr   )r  	hf_configr  architectures)devicedtype)#r   r   model_configr  quant_configr   r  r   r   _mark_tower_modelr	   from_configvision_configvision_towerr   embed_visionaudio_configaudio_towerembed_audio_mark_language_modelr:   r;   r  rK   zerosscheduler_configmax_num_batched_tokensnum_hidden_layershidden_size_per_layer_inputr  embed_tokensweightr  r  per_layer_embeddings)rZ   r  r  r  r  r   r   rM   rN   r     sF   

	
"z(Gemma3nForConditionalGeneration.__init__r[   r^   Nc                 K   s>   | dd }| dd }|d u sJ d|d u rd S t|dS )Nr?   image_embedsz&Gemma3n does not support image_embeds.)r?   )r   r>   )rZ   r[   r?   r+  rM   rM   rN   _parse_and_validate_image_input  s   
z?Gemma3nForConditionalGeneration._parse_and_validate_image_inputc                 K   s<   | dd }|d u rd S | dd }|d u rd S t||dS )NrS   rT   )rS   rT   )r   rO   )rZ   r[   rS   rT   rM   rM   rN   _parse_and_validate_audio_input+  s   z?Gemma3nForConditionalGeneration._parse_and_validate_audio_inputc                 K   sZ   i }|D ]&}|dv rd|vr| j di ||d< |dkr*d|vr*| jdi ||d< q|S )N)r?   r+  rh   rS   rP   rM   )r,  r-  )rZ   r[   mm_input_by_modality	input_keyrM   rM   rN   %_parse_and_validate_multimodal_inputs;  s   

zEGemma3nForConditionalGeneration._parse_and_validate_multimodal_inputsimage_inputc                 C   sj   |d }| j |dddj}||jd | jjj| jjddd	 }|| jjjd 9 }| j
|d	dS )
Nr?   FT)r?   
do_poolingreturn_dictr      r4   g      ?r   )r  last_hidden_statereshapeshaper  r  r   vision_soft_tokens_per_imagepermute
contiguousr  unbind)rZ   r1  r?   vision_outputsrM   rM   rN   _process_image_inputQ  s    

z4Gemma3nForConditionalGeneration._process_image_inputaudio_inputc                 C   s   |d  d}|d  d}| || }t|tr|\}}n|j}|j}| j|d}tj| j	d ggtj
|jd}| j|d}	t|d|	|}| jj}
t||
|	\}}|dkratd	||
 |dS )
NrS   r4   rT   r5  )r  r  )r   r   z\Gemma3n audio encoder produced %d extra tokens. Truncating to match placeholder count of %d.)squeezer   
isinstancer   r6  audio_mel_maskr!  rK   tensorr   longr  where	unsqueezer  audio_soft_tokens_per_imager   loggerwarningr<  )rZ   r?  r   rT   audio_outputsaudio_encodings
audio_maskaudio_featuresaudio_padding_toksaudio_padding_embsexpected_tokenstokens_truncatedrM   rM   rN   _process_audio_inputi  s4   


z4Gemma3nForConditionalGeneration._process_audio_inputc                 K   sn   | j di |}|d u rg S g }|D ]"}|| }|dkr&| |}|| |dkr4| |}|| q|S )Nrh   rP   rM   )r0  r>  r   rS  )rZ   r[   r.  multimodal_embeddingsr   multimodal_inputvision_embeddingsaudio_embeddingsrM   rM   rN   embed_multimodal  s   



z0Gemma3nForConditionalGeneration.embed_multimodalF)is_multimodalhandle_oov_mm_tokenr   rT  rY  rZ  c                   sz   |d ur%| j j|}|d| jjj| jjj}| jd |j	d  
| |d u s-|d u r3t |S t j||||dS )Nr@  r   )rT  rY  rZ  )r  r  get_per_layer_input_embeddingsr7  r  r   r&  r'  r*  r8  copy_r   embed_input_ids)rZ   r   rT  rY  rZ  per_layer_inputsr   rM   rN   r]    s(   
z/Gemma3nForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsr   c                 K   sB   |d urd }| j d |jd  }| jj||f|||d|}|S )Nr   )r^  r`  r   )r*  r8  r  r  )rZ   r   r_  r`  r   r[   r^  hidden_statesrM   rM   rN   r    s   	z'Gemma3nForConditionalGeneration.forwardra  c                 C   s   | j |S rV   )r  compute_logits)rZ   ra  rM   rM   rN   rb    s   z.Gemma3nForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r8   load_weightshf_to_vllm_mapper)rZ   rc  loaderrM   rM   rN   re    s   z,Gemma3nForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        r  multi_modal_projectorr  )r  	connectortower_model)r   from_string_fieldrY   rM   rM   rN   get_mm_mapping  s
   z.Gemma3nForConditionalGeneration.get_mm_mappingr   ic                 C   s&   |dkrdS |dkrdS t d| )Nrh   z<image_soft_token>rP   z<audio_soft_token>zUnsupported modality: )r   )clsr   rm  rM   rM   rN   get_placeholder_str  s
   z3Gemma3nForConditionalGeneration.get_placeholder_strrP   
stt_configr  language	task_type)
transcribe	translaterequest_promptto_languagec                 C   s   d}||dkr	dnd7 }|d7 }| j |d}	| j |d}
|dkr,|	r,|d|	 7 }n|dkrB|	r9|d	|	 7 }|
rB|d|
 7 }|d
7 }t|d||jfidS )z
        Gemma3n supports "free-form" transcription.
        We fix its prompt here to standardize transcriptions/translations
        requests.
        z<start_of_turn>user
rs  
Transcribe	Translatez this audior  z into rt  z from z7: <audio_soft_token><end_of_turn>
<start_of_turn>model
rP   )r   multi_modal_data)supported_languagesr}   r   sample_rate)rn  rP   rp  r  rq  rr  ru  rv  r   full_lang_namefull_lang_name_torM   rM   rN   get_generation_prompt
  s"   z5Gemma3nForConditionalGeneration.get_generation_promptc                 C   s   t ddd dS )N   i>  )max_audio_clip_sr{  min_energy_split_window_size)r   )rn  r  rr  rM   rM   rN   get_speech_to_text_config4  s
   z9Gemma3nForConditionalGeneration.get_speech_to_text_configrV   r  )0rF   rG   rH   r   rz  packed_modules_mappingr9   rf  r   rz   r   ry   Gemma3nImageInputsr,  rO   r-  r   r0  r   rK   rL   r>  rS  r5   rX  boolr]  r1   r  rb  r   r   setre  r   rl  classmethodr{   ro  npndarrayr   r   r   r   r~  r  r   rM   rM   r   rN   r    s    (



2&

$
	)r  )ccollections.abcr   r   r   typingr   r   r   numpyr  rK   r   transformersr	   r
   transformers.models.gemma3nr   r   r   r   r   r   transformers.models.siglipr   vllm.configr   r   r   vllm.config.multimodalr   vllm.inputs.datar   r   vllm.loggerr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   3vllm.model_executor.layers.vocab_parallel_embeddingr   "vllm.model_executor.models.gemma3nr   .vllm.model_executor.models.gemma3n_audio_utilsr   )vllm.model_executor.models.module_mappingr   "vllm.model_executor.models.whisperr   vllm.multimodalr    vllm.multimodal.inputsr!   r"   r#   vllm.multimodal.parser$   r%   r&   vllm.multimodal.processingr'   $vllm.multimodal.processing.processorr(   r)   r*   r+   r,   r-   r.   r/   r0   vllm.sequencer1   vllm.utils.tensor_schemar2   r3   
interfacesr5   r6   r7   utilsr8   r9   r:   r;   rF   rI  rl   rm   r>   rO   r  rU   r|   r   Moduler   register_processorr  rM   rM   rM   rN   <module>   s^    ,B, 5J

