o
    پiMO                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 e 4e5Z6ee3Z7G dd de
Z8G dd de
Z9G dd dej:Z;G dd deZ<e<Z=dS )    N)	lru_cache)IterableListOptionalSetTuple	TypedDictUnion)nn)Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigPreTrainedModel)	AutoModel)RowParallelLinear)LogitsProcessor)QuantizationConfig)VocabParallelEmbedding)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputsflatten_nested_list)ForwardBatch)default_weight_loadermaybe_remap_kv_scale_name)Gemma3nAudioEncoder)Gemma3nRMSNormGemma3nTextModel)
add_prefix)get_processorc                   @   s   e Zd ZU ejed< dS )Gemma3nImagePixelInputspixel_valuesN__name__
__module____qualname__torchTensor__annotations__ r,   r,   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/gemma3n_mm.pyr#   .   s   
 
r#   c                   @   s$   e Zd ZU ejed< 	 ejed< dS )Gemma3nAudioInputsinput_featuresinput_features_maskNr%   r,   r,   r,   r-   r.   3   s
   
 

r.   c                	       sn   e Zd ZdZ		ddeeef dedee	 de
f fdd	Z		dd
eej deej dejfddZ  ZS )Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.N multimodal_configtext_configquant_configprefixc                    s   t    |j| _|j| _|j| _|j| _|j| _t	| j| j|t
d|d| _t| j| jd| _t| j| jd| _t| j| jd|t
d|d| _t| j| jdd| _d S )N	embeddingr5   r6   )epsFembedding_projection)biasr5   r6   )r9   
with_scale)super__init__hidden_sizemultimodal_hidden_sizerms_norm_epsr9   vocab_offset
vocab_sizetext_hidden_sizer   r!   r7   r   hard_embedding_normsoft_embedding_normr   r:   embedding_post_projection_norm)selfr3   r4   r5   r6   	__class__r,   r-   r>   =   s@   
z"Gemma3nMultimodalEmbedder.__init__	input_idsinputs_embedsreturnc           	      C   s   |du |duA rt d|dur| |}n'| jd }|| j }t|dk ||}t|| jk||}| |}| |}| |\}}| 	|S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        N:You must specify exactly one of input_ids or inputs_embeds   r   )

ValueErrorrF   rC   rB   r)   wherer7   rE   r:   rG   )	rH   rK   rL   emb_normout_of_vocab_idadjusted_idshard_embemb_norm_proj_r,   r,   r-   forwardk   s    




z!Gemma3nMultimodalEmbedder.forwardNr2   )NN)r&   r'   r(   __doc__r	   r   r   r   r   r   strr>   r)   
LongTensorr*   rX   __classcell__r,   r,   rI   r-   r1   :   s,    
0r1   c                       s  e Zd ZeZ	 g dZdddddddZg d	d
dgdZg dZi Z	g Z
dZ		d;dedee deddf fddZdee dedee fddZdejfddZdd Zdee fd d!Zdee dejfd"d#Zdejdeej fd$d%Z	d<d&ejd'eej dejfd(d)Ze  	d<dejd*ejd+e!d,ejd-e"de#fd.d/Z$d0d1 Z%d2e&e'eejf  fd3d4Z(e)*d5Z+d6ede,fd7d8Z-d9d: Z.  Z/S )=Gemma3nForConditionalGeneration)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.z
.out_proj.)qkv_projr   )r_   rO   )r_      )gate_up_projr   )ra   rO   )projr   )q_projk_projv_proj	gate_projup_projout_proj)rc   rd   re   rf   rg   )r_   ra   )r_   o_projra   	down_projTNr2   configr5   r6   rM   c                    s   t  j|d || _|| _td|}tj|jd| _t	|j|j
|td|d| _t	|j|j
|td|d| _t|j|td|d| _|j
j| _|j
j| _t|j
|td|d| _t|j
| _|   d S )	N)rk   modelembed_visionr8   embed_audioaudio_towerlanguage_model)r6   )r=   r>   rk   r5   r!   r   from_configvision_configvision_towerr1   r4   rm   audio_configrn   r   ro   rC   vocab_size_per_layer_inputr    rp   r   logits_processor	post_init)rH   rk   r5   r6   rI   r,   r-   r>      s>   


z(Gemma3nForConditionalGeneration.__init__rK   	mm_inputsc                 C   s   t  }|||S )z*Pad input IDs with image and audio tokens.)r   pad_input_tokens)rH   rK   rx   patternr,   r,   r-   pad_input_ids   s   z-Gemma3nForConditionalGeneration.pad_input_idsc                 C   
   | j  S N)rp   get_input_embeddingsrH   r,   r,   r-   r~         
z4Gemma3nForConditionalGeneration.get_input_embeddingsc                 C   s   | j jjd S )NrO   )rk   r4   sliding_windowr   r,   r,   r-   !get_attention_sliding_window_size   s   zAGemma3nForConditionalGeneration.get_attention_sliding_window_sizeitemsc           	      C   s  t dd |D }g }|D ]V}| dkr|d}n| dkr'|d}n| dkr5td|j |jd }t|D ]$}|||d  }|j| jj	| j
 d	}| j|d
ddj}|| q>qtj|dd}||jd | jjj| jjddd}|| jjjd 9 }| j|dS )z
        Projects the last hidden state from the vision model into language model space.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        c                 S      g | ]}|j qS r,   feature.0itemr,   r,   r-   
<listcomp>      zEGemma3nForConditionalGeneration.get_image_feature.<locals>.<listcomp>   r         zUnexpected pixel_values shape: rO   devicedtypeFT)r$   
do_poolingreturn_dictdimr`   g      ?rL   )r   r   squeeze	unsqueezerP   shaperangetors   r   rp   r   last_hidden_stateappendr)   catreshaperk   rr   r?   vision_soft_tokens_per_imagepermuterm   )	rH   r   all_pixel_valuesvision_outputs_listpixel_values_batch
batch_sizeipixel_valuevision_outputsr,   r,   r-   get_image_feature  sD   


z1Gemma3nForConditionalGeneration.get_image_featurec                 C   sr  t dd |D }t dd |D }g }t||D ]D\}}| dkr(|d}| dkr3|d}|jt| j j| j	
 d}|j|jd}| ||\}}| j|d	}	||	 q|rtj|dd
}
tj| jd ggtj|
jd}| j|d}t|d||
}
|
j\}}}| jj| }||||}tj|
|fdd
}
|
S tjdd| j	jjt|  j| j	
 dS )a@  
        Projects the last hidden state from the audio encoder into language model space.

        Args:
            items: List of multimodal data items containing audio data.

        Returns:
            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_audios, audio_length, embed_dim)`).
        c                 S   r   r,   r   r   r,   r,   r-   r   >  r   zEGemma3nForConditionalGeneration.get_audio_feature.<locals>.<listcomp>c                 S   s   g | ]}|j  qS r,   )r0   r   r,   r,   r-   r   @  s    r`   r   rO   r   )r   r   r   )r   r   )rK   )r   zipr   r   r   nextro   
parametersr   rp   r   rn   r   r)   r   tensorrC   longrQ   r   rk   audio_soft_tokens_per_imageexpandemptyr?   )rH   r   all_input_featuresall_input_features_maskaudio_features_listr/   r0   audio_outputs
audio_maskaudio_embedsaudio_featuresaudio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr,   r,   r-   get_audio_feature3  s\   


z1Gemma3nForConditionalGeneration.get_audio_featurec                 C   s   | j |S r}   )rp   get_per_layer_inputs)rH   rK   r,   r,   r-   r     s   z4Gemma3nForConditionalGeneration.get_per_layer_inputsrL   per_layer_inputsc                 C   s   | j ||S r}   )rp   project_per_layer_inputs)rH   rL   r   r,   r,   r-   r     s   z8Gemma3nForConditionalGeneration.project_per_layer_inputs	positionsforward_batchinput_embedskwargsc           
      K   s   |du |duA rt d|d7 }|dur/t|dk|| jk }t||t|}| j|}t||| jt	j
| jt	j| ji||d}	| ||	| jj|S )z$Forward pass for multimodal Gemma3n.NrN   rO   r   )rK   r   rp   data_embedding_funcsr   r   )rP   r)   logical_andru   rQ   
zeros_likerp   r   r   r   IMAGEr   AUDIOr   rv   embed_tokens)
rH   rK   r   r   r   r   per_layer_inputs_maskper_layer_inputs_tokensr   hidden_statesr,   r,   r-   rX     s8   
z'Gemma3nForConditionalGeneration.forwardc                 C   r|   r}   )rp   tie_weightsr   r,   r,   r-   r     r   z+Gemma3nForConditionalGeneration.tie_weightsweightsc                 C   s   g d}	 t |  }t }|D ]h\}}tdd|}|D ](\}}}	||vr'q|||}|dr7||vr7q|| }
|
j}||
||	  n-d|v rP|dd}|drZ||vrZqt||}|d u rdq|| }
t	|
dt
}||
| || q|S )	N))	.qkv_projz.q_projq)r   z.k_projk)r   z.v_projv).gate_up_projz.up_projrO   )r   z
.gate_projr   z^model\.r2   z.biasvision_modelz.self_attn.out_projz.self_attn.projweight_loader)dictnamed_parameterssetresubreplaceendswithr   r   getattrr   add)rH   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   r,   r,   r-   load_weights  s8   

z,Gemma3nForConditionalGeneration.load_weightsz]^language_model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)module_namec                 C   s   t | j|S r}   )boollora_patternmatch)rH   r   r,   r,   r-   should_apply_lora  s   z1Gemma3nForConditionalGeneration.should_apply_lorac                 C   s   |dkr| j j| j j| j j| j jd   fS |dkr&| j j| j j | j jfS |dkrDtt| j jdks8J d| j j| j jd d fS |dkr`tt| j jdksVJ d| j jd | j jfS t )	Nr_   r`   ri   ra   rO   zCurrently SGLang requires uniform intermediate size for all layers. Please file an issue if you need support for non-uniform intermediate sizes.r   rj   )	rk   r?   head_dimnum_attention_headsnum_key_value_headslenr   intermediate_sizeNotImplementedError)rH   r   	layer_idxr,   r,   r-   get_hidden_dim  s.   
z.Gemma3nForConditionalGeneration.get_hidden_dimrY   r}   )0r&   r'   r(   r   config_class#default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingpacked_modules_mappingsupported_lora_modulesembedding_modulesembedding_padding_modulessupports_lorar   r   r[   r>   r   intr   r{   r
   	Embeddingr~   r   r   r   r)   r*   r   r\   r   r   no_gradr   objectr   rX   r   r   r   r   r   compiler   r   r   r   r]   r,   r,   rI   r-   r^      s    
4
	0P

	.+r^   )>loggingr   	functoolsr   typingr   r   r   r   r   r   r	   r)   r
   transformersr   r   r   r   r   &transformers.models.auto.modeling_autor   sglang.srt.layers.linearr   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   *sglang.srt.layers.vocab_parallel_embeddingr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.models.gemma3n_audior    sglang.srt.models.gemma3n_causalr   r    sglang.srt.utilsr!   &sglang.srt.utils.hf_transformers_utilsr"   	getLoggerr&   loggercached_get_processorr#   r.   Moduler1   r^   
EntryClassr,   r,   r,   r-   <module>   s<    $
V   