o
    پiM                     @   sP  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZmZ d dlm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, e -e.Z/ee,Z0G dd de
Z1G dd dej2Z3G dd deZ4e4Z5dS )    N)	lru_cache)IterableListOptionalSetTuple	TypedDict)nn)Gemma3ConfigPreTrainedModel)TritonAttnBackend)Gemma3RMSNorm)LogitsProcessor)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputsflatten_nested_list)ForwardBatchForwardMode)default_weight_loadermaybe_remap_kv_scale_name)Gemma3ForCausalLM)SiglipVisionModel)
add_prefix)get_processorc                   @   s   e Zd ZU ejed< dS )Gemma3ImagePixelInputspixel_valuesN)__name__
__module____qualname__torchTensor__annotations__ r%   r%   O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/gemma3_mm.pyr   7   s   
 
r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	Gemma3MultiModalProjectorz Projector for Gemma3 multimodal.configc                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )N)epsg      ?)kernel_sizestride)super__init__r	   	Parameterr"   zerosvision_confighidden_sizetext_configmm_input_projection_weightr   layer_norm_epsmm_soft_emb_normint
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider*   	AvgPool2davg_pool)selfr(   	__class__r%   r&   r-   ?   s"   
z"Gemma3MultiModalProjector.__init__vision_outputsreturnc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )N      )shape	transposereshaper9   
contiguousr=   flattenr5   r"   matmulr3   type_as)	r>   rA   
batch_size
seq_lengthr1   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsr%   r%   r&   forwardU   s   



z!Gemma3MultiModalProjector.forward)
r   r    r!   __doc__r
   r-   r"   r#   rR   __classcell__r%   r%   r?   r&   r'   <   s    r'   c                       sT  e Zd ZeZ	 g dZdddddddZg d	d
dgdZg dZi Z	g Z
dZedZ		d4dedee deddf fddZdee dedee fddZdedejdejfddZdejfd d!Zd"d# Zd$ee  fd%d&Z!e" 	d5dej#d'ejded(ejd)e$de%fd*d+Z&d,ede'fd-d.Z(d/d0 Z)d1e*e+eejf  fd2d3Z,  Z-S )6Gemma3ForConditionalGeneration)z.gate_proj.z.down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.z
.out_proj.)qkv_projr   )rV   rC   )rV   rD   )gate_up_projr   )rW   rC   )projr   )q_projk_projv_proj	gate_projup_projout_proj)rY   rZ   r[   r\   r]   )rV   rW   )rV   o_projrW   	down_projTzd^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)N r(   quant_configprefixrB   c                    s   t  j|d || _|| _t|ds|jj|_t|ds!|jj|_t|j	|t
d|d| _t|| _|jj| _t|j|t
d|d| _| jjjrWt|dd	}| jj j|9  _|   d S )
N)r(   num_hidden_layersr1   vision_tower)r(   rb   rc   language_model)rc   logit_scaleg      ?)r,   r-   r(   rb   hasattrr2   rd   r1   r   r0   r   re   r'   multi_modal_projector
vocab_sizer   rf   logits_processorrg   getattr	post_init)r>   r(   rb   rc   rg   r?   r%   r&   r-      s.   






z'Gemma3ForConditionalGeneration.__init__	input_idsimage_inputsc                 C   s.   |j }|j}||fg}t|}|||}|S )z Pad input IDs with image tokens.)im_start_id	im_end_idr   pad_input_tokens)r>   rn   ro   rp   rq   media_token_pairspatternidsr%   r%   r&   pad_input_ids   s   
z,Gemma3ForConditionalGeneration.pad_input_idsforward_batch
mask_dtypec              	   C   sT  t |jtr|jtjksJ g }tj|jd tj	|j
d}t|jD ]n}tj|j| |j| |j|  ||j
d}|d |j|j| d}|j| }|jD ]-}	|	 r||	jD ]#\}
}|
|j| kr{d||
|j|  |d |j|  |
|d f< qXqO||  || |  ||d < q"|rtj|dd}||jj_||jj_dS dS dS )z.Prepare attention masks for multimodal inputs.rC   )dtypedevice)diagonalr   dimN)
isinstanceattn_backendr   forward_moder   EXTENDr"   r/   rL   int32rz   rangeemptyextend_seq_lensextend_prefix_lensfill_tril	mm_inputsmm_itemsis_imageoffsetsappendrI   nelementcatforward_metadatamask_indptrcustom_mask)r>   rw   rn   rx   bidirectional_attn_masks_listbidirectional_attn_mask_indptribidirectional_attn_maskr   mm_itemim_beginim_endbidirectional_attn_masksr%   r%   r&   prepare_attn_masks   sl   



)z1Gemma3ForConditionalGeneration.prepare_attn_masksc                 C   
   | j  S N)rf   get_input_embeddingsr>   r%   r%   r&   r        
z3Gemma3ForConditionalGeneration.get_input_embeddingsc                 C   r   )zX
        This value is used to initialize attention backends in `ForwardBatch`.
        )rf   !get_attention_sliding_window_sizer   r%   r%   r&   r     s   
z@Gemma3ForConditionalGeneration.get_attention_sliding_window_sizeitemsc                 C   sF  t dd |D }g }|D ]}| dkr*|jd | jjjkr*||| jj	 q| dkr6|
d}n| dkrB|d}n| dkrPtd|j g }|jd }t|D ]!}|||d	  }|j| jj	| j d
}| j|d}	||	 q[|rtj|dd}
| |
}|| q|rtj|ddS tjg | jj	dS )a1  
        Projects the last hidden state from the vision model into language model space.
        Supports both raw image pixel values and precomputed embeddings.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        c                 S   s   g | ]}|j qS r%   )feature).0itemr%   r%   r&   
<listcomp>#  s    zDGemma3ForConditionalGeneration.get_image_feature.<locals>.<listcomp>      r      zUnexpected pixel_values shape: rC   )rz   ry   )r   r|   )rz   )r   r}   rE   r(   r2   r1   r   torf   rz   squeeze	unsqueeze
ValueErrorr   re   ry   r"   r   ri   tensor)r>   r   all_pixel_valuesfinal_features_listpixel_values_batchbatch_vision_outputsrL   r   pixel_valuevision_outputvision_outputs_catprojected_featuresr%   r%   r&   get_image_feature  sD   	



z0Gemma3ForConditionalGeneration.get_image_feature	positionsinput_embedskwargsc           	      K   s~   |d7 }|dur| j j| jkr|| j jk}| }d||< n|}|jtjkr3| r3| j||t	j
d t||| j| |d}|S )a#  
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
        >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")

        >>> prompt = "answer en Where is the cow standing?"
        >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_length=30)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "answer en Where is the cow standing?\nbeach"
        ```rC   Nr   )rx   )rn   rw   rf   multimodal_modelr   )r(   image_token_indexrj   cloner   r   r   contains_image_inputsr   r"   boolr   rf   )	r>   rn   r   rw   r   r   special_image_maskllm_input_idshsr%   r%   r&   rR   S  s0   0
z&Gemma3ForConditionalGeneration.forwardmodule_namec                 C   s   t | j|S )z5Skip vision tower and multi_modal_projector for LoRA.)r   lora_patternmatch)r>   r   r%   r%   r&   should_apply_lora  s   z0Gemma3ForConditionalGeneration.should_apply_lorac                 C   r   r   )rf   tie_weightsr   r%   r%   r&   r     r   z*Gemma3ForConditionalGeneration.tie_weightsweightsc                 C   s   g d}	 t |  }t }|D ]t\}}d|v r't| ||fg}|| q|D ](\}}	}
|	|vr3q)||	|}|drC||vrCq)|| }|j}||||
  n-d|v r\|dd}|drf||vrfqt	||}|d u rpq|| }t
|dt}||| || q| | }|r	 |S )N))	.qkv_projz.q_projq)r   z.k_projk)r   z.v_projv)rW   r]   rC   )rW   r\   r   rf   z.biasvision_modelz.self_attn.out_projz.self_attn.projweight_loader)dictnamed_parameterssetr   load_weightsupdatereplaceendswithr   r   rl   r   addkeys)r>   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weightcausal_loaded_params
param_nameweight_nameshard_idparamr   unloaded_paramsr%   r%   r&   r     sL   



z+Gemma3ForConditionalGeneration.load_weights)Nra   r   ).r   r    r!   r
   config_class#default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingpacked_modules_mappingsupported_lora_modulesembedding_modulesembedding_padding_modulessupports_lorarecompiler   r   r   strr-   r   r6   r   rv   r   r"   r#   ry   r   r	   	Embeddingr   r   r   r   no_grad
LongTensorobjectr   rR   r   r   r   r   r   r   rT   r%   r%   r?   r&   rU   o   s    %

;9O$rU   )6loggingr   	functoolsr   typingr   r   r   r   r   r   r"   r	   transformersr
   r   *sglang.srt.layers.attention.triton_backendr   sglang.srt.layers.layernormr   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.models.gemma3_causalr   sglang.srt.models.siglipr   sglang.srt.utilsr   &sglang.srt.utils.hf_transformers_utilsr   	getLoggerr   loggercached_get_processorr   Moduler'   rU   
EntryClassr%   r%   r%   r&   <module>   s6    
3  w