o
    wik                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlZddlmZmZ ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; e&<e=Z>G dd de*eZ?G dd deZ@G dd de9ZAG dd de6ZBG dd de
jCZDG dd  d e.ZEG d!d" d"e1ZFG d#d$ d$e2ZGG d%d& d&e,ZHG d'd( d(eZIdZJG d)d* d*e0ZKG d+d, d,e/ZLG d-d. d.e-ZMG d/d0 d0e
jNZOd1eejP d2eQd3ee fd4d5ZRG d6d7 d7e8ZSG d8d9 d9e7ZTg d:ZUdS );    N)Callable)AnyOptionalUnion   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                   @   sl   e Zd ZdZdZ									
																			dddZedd Zejdd ZdS )Gemma3TextConfigaN   
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_text@   	   $              gelu_pytorch_tanh   {Gz?ư>Tr      r       .AF           N     @c                    s   t jd||||d| | _|	 _| _| _| _| _| _| _	|
 _
| _| _| _| _| _| _| _| _| _| _| _| _| _t  |dd _ jd u rl fddt jD  _t j d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern   c                    s&   g | ]}t |d   j rdndqS )r6   sliding_attentionfull_attention)bool_sliding_window_pattern).0iself f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/gemma3/modular_gemma3.py
<listcomp>   s    z-Gemma3TextConfig.__init__.<locals>.<listcomp>rI   )r	   __init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappinglayer_typesrope_local_base_freqrope_scalingr   getrD   ranger
   )rH   rM   rO   rP   rQ   rR   rT   rS   r[   rN   rU   rV   rW   r;   r=   r<   r>   rX   rY   rZ   r\   r]   r`   r^   r_   rb   ra   kwargsrI   rG   rJ   rL      sJ   

zGemma3TextConfig.__init__c                 C   s   t dt | jS )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)warningswarnFutureWarningrD   rG   rI   rI   rJ   r?      s
   z'Gemma3TextConfig.sliding_window_patternc                 C   s
   || _ d S N)rD   )rH   valuerI   rI   rJ   r?     s   
)r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   Tr   r6   r   Tr7   Fr8   r1   r9   NNNNr:   )	__name__
__module____qualname____doc__
model_typerL   propertyr?   setterrI   rI   rI   rJ   r)   <   sD    t
H
r)   c                       s   e Zd ZdZdZddddZeedZ					
			dde	e
eeeef f  de	e
eeeef f  dededededef fddZ  ZS )Gemma3Configa  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNr1         r4   rz   r{   mm_tokens_per_imagerU   c           	         s   |d u rt  }td nt|trt di |}t|tr&tdi |}n|d u r2t }td || _|| _|| _|| _	|| _
|| _|| _t jdi | d S )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rI   )r)   loggerinfo
isinstancedictr(   rz   r{   r   ru   rv   rt   rU   superrL   )	rH   rz   r{   r   ru   rv   rt   rU   re   	__class__rI   rJ   rL   D  s$   


zGemma3Config.__init__)NNr1   r|   r}   r~   r4   )rk   rl   rm   rn   ro   attribute_mapr)   r(   sub_configsr   r   r   strr   intfloatrL   __classcell__rI   rI   r   rJ   rr     s@    0rr   c                   @      e Zd ZdS )Gemma3ModelOutputWithPastNrk   rl   rm   rI   rI   rI   rJ   r   f      r   c                   @   r   )Gemma3CausalLMOutputWithPastNr   rI   rI   rI   rJ   r   j  r   r   c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr   F)
persistent)r   rL   register_buffertorchtensor)rH   r   r   r   r   r   rI   rJ   rL   s  s   z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S ri   )r   forwardr   toweightdtype)rH   r   r   rI   rJ   r   w  s   z%Gemma3TextScaledWordEmbedding.forward)r   )rk   rl   rm   rn   r   r   rL   r   Tensorr   r   rI   rI   r   rJ   r   n  s     r   c                       s"   e Zd Zdef fddZ  ZS )	Gemma3MLPconfigc                       t  | d S ri   r   rL   rH   r   r   rI   rJ   rL   |     zGemma3MLP.__init__rk   rl   rm   r)   rL   r   rI   rI   r   rJ   r   {  s    r   c                       s(   e Zd Zddedef fddZ  ZS )Gemma3RMSNormr5   dimepsc                    s   t    d S ri   r   )rH   r   r   r   rI   rJ   rL     s   zGemma3RMSNorm.__init__)r5   )rk   rl   rm   r   r   rL   r   rI   rI   r   rJ   r     s     r   c                       s$   e Zd Zddef fddZ  ZS )Gemma3RotaryEmbeddingNr   c                    r   ri   r   )rH   r   devicer   rI   rJ   rL     r   zGemma3RotaryEmbedding.__init__ri   r   rI   rI   r   rJ   r     s    r   c                       s   e Zd Zdedef fddZ		ddejdejdeej d	ee	 d
eej
 dee deejeej eeej  f fddZ  ZS )Gemma3Attentionr   	layer_idxc                    sT   |j | dk| _t   | jr|jnd | _t|j|jd| _t|j|jd| _	d S )NrA   )r   r   )
r`   
is_slidingr   rL   r]   r   rS   rV   q_normk_normrH   r   r   r   rI   rJ   rL     s
   
zGemma3Attention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionre   returnc                 K   s<  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ura|||d}|
|
|| j|\}
}t}| jjdkrot| jj }|| |	|
||f| jr|| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nr6   r   )sincosr   eagerr8   )dropoutscalingr]   )shaperS   q_projview	transposek_projv_projr   r   r"   updater   r#   r   _attn_implementationr   trainingrZ   r   r]   reshape
contiguouso_proj)rH   r   r   r   r   r   re   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightsrI   rI   rJ   r     s>   	

	

zGemma3Attention.forward)NN)rk   rl   rm   r)   r   rL   r   r   r   r   
LongTensorr   r   tupler   r   rI   rI   r   rJ   r     s&    r   c                       s   e Zd Zdedef fddZeddd								dd
ejdejdejde	ej de	ej
 de	e de	e de	e de	ej
 deeje	eejejf  f fddZ  ZS )Gemma3DecoderLayerr   r   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )N)r   r   r   )r   rL   r   rO   r   r`   attention_typer   	self_attnr   mlpr   rV   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   rI   rJ   rL     s   

zGemma3DecoderLayer.__init__last_cache_positionz4.53.0)versionNFr   position_embeddings_globalposition_embeddings_localr   position_idsr   output_attentionsrW   r   r   c
                 K   s   |}|  |}| jjr|}n|}| jd||||||||	d|
\}}| |}|| }|}| |}| |}| |}|| }|f}|rK||f7 }|S )N)r   r   r   r   r   r   rW   r   rI   )r   r   r   r   r   r   r   )rH   r   r   r   r   r   r   r   rW   r   re   residualr   self_attn_weightsoutputsrI   rI   rJ   r     s8   
	





zGemma3DecoderLayer.forward)NNNFFN)rk   rl   rm   r)   r   rL   r   r   r   r   r   r   rC   r   FloatTensorr   r   rI   rI   r   rJ   r     s<    
	
r   c                   @   s    e Zd ZdZg dZdd ZdS )Gemma3PreTrainedModel )r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                 C   s   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|trT|jjd d S t|tra|jj
  d S d S )Nr8   )meanstdr   )r   rU   r   nnLinearConv2dr   datanormal_biaszero_	Embeddingr   r   fill_Gemma3MultiModalProjectormm_input_projection_weight)rH   moduler   rI   rI   rJ   _init_weights  s    



z#Gemma3PreTrainedModel._init_weightsN)rk   rl   rm   base_model_prefix_no_split_modulesr   rI   rI   rI   rJ   r   
  s    r   c                       s   e Zd ZeZdef fddZ									ddeej deej	 deej dee
 d	eej d
ee dee dee deej dee defddZ  ZS )Gemma3TextModelr   c                    sX   t  | t|j|j| j| jjd d| _t	|}|j
|_ddi|_t|d| _d S )N      ?)r   	rope_typedefault)r   )r   rL   r   rM   rO   r   r   embed_tokenscopydeepcopyra   rX   rb   r   rotary_emb_localr   r   rI   rJ   rL   '  s   

zGemma3TextModel.__init__Nr   r   r   past_key_valuesinputs_embedsrW   r   output_hidden_statesr   flash_attn_kwargsr   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rN|d u rN| jsNt
 }|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| }ts| j |||	||d}td	i |td	i |d}|}| ||}| ||}|rd	nd }|rd	nd }| jd | j j D ]*}|r||f7 }||f||||j |||||	d
|
}|d }|r||d f7 }q| |}|r||f7 }t||||dS )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r6   r   r   input_embedsr   r   r  r   rB   rA   rI   )r   r   r   r   r   r   rW   r   )last_hidden_stater  r   
attentions)r   r   r
  rW   
ValueErrorgradient_checkpointingr   r   warning_oncer  r   get_seq_lengthr   aranger   r   	unsqueezer   r   r   r   
rotary_embr  layersrQ   r   normr   )rH   r   r   r   r  r	  rW   r   r
  r   r  past_seen_tokenscausal_mask_mappingmask_kwargsr   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsrI   rI   rJ   r   6  s   






zGemma3TextModel.forward)	NNNNNNNNN)rk   rl   rm   r)   config_classrL   r   r   r   r   r   r   rC   r   r   r   r   r   rI   rI   r   rJ   r   $  sF    	
r   c                       s*   e Zd ZeZdZdef fddZ  ZS )Gemma3ForCausalLMlanguage_modelr   c                    s   t  | t|| _d S ri   )r   rL   r   modelr   r   rI   rJ   rL     s   zGemma3ForCausalLM.__init__)rk   rl   rm   r)   r#  r   rL   r   rI   rI   r   rJ   r$    s    r$  c                       s2   e Zd Zdef fddZdejfddZ  ZS )r   r   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r  )kernel_sizestride)r   rL   r   	Parameterr   zerosr{   rO   rz   r   r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager   tokens_per_sider'  	AvgPool2davg_poolr   r   rI   rJ   rL     s   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr6   r   )r   r   r   r/  r   r2  flattenr,  r   matmulr   type_as)	rH   r3  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsrI   rI   rJ   r     s   



z!Gemma3MultiModalProjector.forward)	rk   rl   rm   rr   rL   r   r   r   r   rI   rI   r   rJ   r     s    r   token_type_idstokens_per_imager   c              
      s4    du rdS dt dt dt dt dtf
 fdd}|S )	z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxr   c                    s:   t || k} | |f dk | |f dk@ }||@ S )Nr6   )r   abs)r@  rA  rB  rC  same_image_blockis_image_blockr>  r?  rI   rJ   
inner_mask  s    z0token_type_ids_mask_function.<locals>.inner_mask)r   rC   )r>  r?  rH  rI   rG  rJ   token_type_ids_mask_function  s   $	rI  c                !   @   s   e Zd ZdZdejdejfddZdd Zee														dd	ej
dejd
eej deej
 deeeej ef  deej
 deej
 deej deej
 dee dee dee dee deeef fddZdS )Gemma3ModelFpixel_valuesr   c                 C   s   | j |dj}| |}|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )rK  )vision_towerr  multi_modal_projector)rH   rK  r3  image_featuresrI   rI   rJ   get_image_features  s   

zGemma3Model.get_image_featuresc                 K      t dNzWe don't want to inherit itAttributeErrorrH   super_kwargsrI   rI   rJ   _update_causal_mask     zGemma3Model._update_causal_maskNr   r   r   r  r>  r   r	  labelsrW   r   r
  return_dictc                 K   s  |d u |d uA rt d|d ur|n| jj}|d ur|n| jj}|d ur&|n| jj}|d urD| jj| jkrD|| jjk}| }d||< n|}|d u rP|  |}|d u rl|d ur\|	 nd}t
j|||jd  |jd}|d ur| |}|d u r||  t
j| jjt
j|jdk}n|| jjkd}|||j}t s||  | kr|jddjddd }t d| d	|jd |jd   d
||j|j}|||}t| }ts| j |||||d}|d ur|jd dkrt||j| jj|d< tdi |tdi |d}| j d|||||
||d|d	|}t!|j"|
r0|j#nd |j$|j%|d ur>|dS d dS )Nr  r   r6   r  )r   r   r   )r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.r  or_mask_functionr  T)	r   r   r  r	  rW   r   r
  rY  r   )r  r  r   r  image_hidden_statesrI   )&r  r   r   r
  use_return_dictrw   rM   cloneget_input_embeddingsr  r   r  r   r   rO  r   longr  	expand_asr   r   numelsumr   masked_scatterr   r   get_text_configrI  r   r   r   r%  r   r  r  r   r  )rH   r   rK  r   r   r  r>  r   r	  rX  rW   r   r
  rY  	lm_kwargsspecial_image_maskllm_input_idsr  rN  image_tokens_in_textr  r  r   rI   rI   rJ   r     s   


zGemma3Model.forward)NNNNNNNNNNNNN)rk   rl   rm   accepts_loss_kwargsr   r   rO  rV  r   r   r   r   r   r   listr   rC   r   r   r   rI   rI   rI   rJ   rJ    s`    	

rJ  c                "       sJ  e Zd Ze														ddejdejdeej deej dee	e
ej ef  deej d	eej d
eej deej dee dee dee dee de	eejf de	eef fddZ										d fdd	Zdd Ze	ddedejdeej d	ejdee deej deej defddZ  ZS ) Gemma3ForConditionalGenerationNr   r   rK  r   r   r  r>  r   r	  rX  rW   r   r
  rY  logits_to_keepr   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||||d|}|d }t|trCt| dn|}| |dd|ddf }d}|	dur|	 }|dddddf }|	dddf }|dur|dd|j
d  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)r   rK  r>  r   r   r  r	  rW   rX  r   r
  rY  r   r   .r   r6   )losslogitsr  r   r  r[  rI   )r   r   r
  r\  r&  r   r   slicelm_headr   r   r   r   r   r   CrossEntropyLossr   rz   rM   r   r  r   r  r[  )rH   r   rK  r   r   r  r>  r   r	  rX  rW   r   r
  rY  rl  re  r   r   slice_indicesrn  rm  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsoutputrI   rI   rJ   r   j  sd   @$
z&Gemma3ForConditionalGeneration.forwardTc                    s>   t  j|f||||||	|
|d|}|d dkr||d< |S )N)r  r	  r   r   r   rW   rl  r>  r   rK  )r   prepare_inputs_for_generation)rH   r   r  r	  r   r   rK  r   r>  rW   rl  rX  re   model_inputsr   rI   rJ   rz    s"   
z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationc                 K   rP  rQ  rR  rT  rI   rI   rJ   5_prepare_4d_causal_attention_mask_with_cache_position  rW  zTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_positionr   r  c           	      K   sR   |   |||||d}|d ur"|jd dkr"t||j| j|d< tdi |S )Nr  r6   rZ  rI   )rd  r   rI  r   r   r   r   )	r   r  r   r   r  r   r>  re   r  rI   rI   rJ   r     s   	z8Gemma3ForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNNNNr   )
NNNNNNNTNNri   )rk   rl   rm   r   r   r   r   r   r   r   rj  r   rC   r   r   r   r   rz  r|  staticmethodr	   r   r   r   rI   rI   r   rJ   rk  i  s    	

 $	rk  )rr   r)   r   r   r$  rk  rJ  )Vr  rf   collections.abcr   typingr   r   r   r   torch.nnr   torch.utils.checkpointcache_utilsr   r   configuration_utilsr	   r
   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r   r   r   r   r    r!   r"   r#   paligemma.modeling_paligemmar$   r%   r&   r'   siglipr(   
get_loggerrk   r   r)   rr   r   r   r   r   r   r   r   r   r   GEMMA3_START_DOCSTRINGr   r   r$  Moduler   r   r   rI  rJ  rk  __all__rI   rI   rI   rJ   <module>   sZ   ,
 M^:B~	 $  F