o
    eiu                     @   s.  d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZ ddlmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB e*CeDZEG dd de0eZFG dd deZGG dd de?ZHG dd de<ZIG dd dejJZKG d d! d!e4ZLG d"d# d#e7ZMG d$d% d%e8ZNG d&d' d'e2ZOG d(d) d)eZPdZQG d*d+ d+e6ZRd,eSd-eeSeSeSeSgeTf fd.d/ZUG d0d1 d1e5ZVG d2d3 d3e3ZWG d4d5 d5ejXZYe,d6d7d8d9			:	dOd;ed8ejZd<ejZdB d=ejZd>edB d?ejZdB d@ejZdB dAej[dB dBeTdCeTdB d-e\fdDdEZ]G dFdG dGe>Z^G dHdI dIe=Z_G dJdK dKeRZ`G dLdM dMeeRZag dNZbdS )P    )Callable)AnyLiteralOptionalN   )initialization)CacheDynamicCache)PreTrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSRopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)maybe_autocast   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaliGemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPasttoken_type_ids_mask_function)SiglipVisionConfigc                4   @   s>  e Zd ZdZdZdddZ						
																				d7dedB dedB dedB dedB dedB dedB dedB d edB d!edB d"edB d#edB d$e	dB d%edB d&edB d'edB d(e	dB d)edB d*edB d+edB d,e
e dB d-edB d.edB d/eed0 ef dB d1e	dB d2e	dB f2d3d4Zd8d5d6ZdS )9Gemma3TextConfigay  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
            If True, the model will attend to all text tokens instead of using a causal mask. This does not change
            behavior for vision tokens.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_textg    .Ag     @)globallocal@   	   $              gelu_pytorch_tanh   {Gz?ư>Tr      r    F           N
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersfull_attentionsliding_attentionuse_bidirectional_attentiontie_word_embeddingsc                    s   | _ | _| _| _| _|	 _| _| _| _| _	| _
| _|
 _| _| _| _| _| _| _| _| _| _| _| _|rR jd d  _|dd _ jd u rk fddt jD  _t j j | _tjdi | d S )Nr    r@   sliding_window_pattern   c                    s&   g | ]}t |d   j rdndqS )r@   r\   r[   )bool_sliding_window_pattern).0iself g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/gemma3/modular_gemma3.py
<listcomp>   s    z-Gemma3TextConfig.__init__.<locals>.<listcomp>rg   )rO   rQ   rP   r^   rC   rK   rD   rE   rF   rG   rI   rH   rL   rM   rN   rR   rS   rJ   rT   rU   rW   rX   rV   r]   getrb   ranger   rY   r
   __init__)rf   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   r]   r^   kwargsrg   re   rh   rl      sD   

zGemma3TextConfig.__init__c                 K   s   | dd }ddiddid}| jd ur| jn|| _|d ur&| jd | | jdd u r5ddi| jd< | jd d| d| jd  | jdd u rTddi| jd< | jd d| d	| jd
  |   | j|d |S )Nrope_scaling	rope_typedefault)r\   r[   r[   
rope_thetar3   r\   rope_local_base_freqr4   )ignore_keys)poprY   updaterj   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)rf   ignore_keys_at_rope_validationrm   rn   default_rope_paramsrg   rg   rh   convert_rope_params_to_dict   s(   

z,Gemma3TextConfig.convert_rope_params_to_dict)r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   Tr   r@   r    FrA   r;   rB   NNNNFTN)__name__
__module____qualname____doc__
model_typerw   intstrfloatra   listdictr   r   rl   r|   rg   rg   rg   rh   r1   >   s    R
	


Fr1   c                       s   e Zd ZdZdZddddZeedZ					
				ddee	e
ef B dB dee	e
ef B dB dedB dedB dedB dedB dedB dedB f fddZ  ZS )Gemma3Configa	  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNr;         r>   Tr   r   mm_tokens_per_imagerL   r^   c	           
         s   |d u rt  }td nt|trt di |}t|tr&tdi |}n|d u r2t }td || _|| _|| _|| _	|| _
|| _|| _|| _t jdi |	 d S )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rg   )r1   loggerinfo
isinstancer   r0   r   r   r   r   r   r   rL   r^   superrl   )
rf   r   r   r   r   r   r   rL   r^   rm   	__class__rg   rh   rl   6  s&   


zGemma3Config.__init__)NNr;   r   r   r   r>   T)r~   r   r   r   r   attribute_mapr1   r0   sub_configsr   r   r   r   r   ra   rl   __classcell__rg   rg   r   rh   r      sF    1	r   c                   @      e Zd ZdS )Gemma3ModelOutputWithPastNr~   r   r   rg   rg   rg   rh   r   Z      r   c                   @   r   )Gemma3CausalLMOutputWithPastNr   rg   rg   rg   rh   r   ^  r   r   c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s0   t  ||| || _| jdt|dd d S )Nr   F
persistent)r   rl   scalar_embed_scaleregister_buffertorchtensor)rf   r   r   r   r   r   rg   rh   rl   g  s   z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S r}   )r   forwardr   toweightdtype)rf   r   r   rg   rh   r   l  s   z%Gemma3TextScaledWordEmbedding.forward)r   )r~   r   r   r   r   r   rl   r   Tensorr   r   rg   rg   r   rh   r   b  s     r   c                       s"   e Zd Zdef fddZ  ZS )	Gemma3MLPconfigc                    s   t  | d S r}   r   rl   rf   r   r   rg   rh   rl   q     zGemma3MLP.__init__)r~   r   r   r1   rl   r   rg   rg   r   rh   r   p  s    r   c                       s(   e Zd Zddedef fddZ  ZS )Gemma3RMSNormr?   dimepsc                    s   t  j||d d S )Nr   r   r   )rf   r   r   r   rg   rh   rl   v  s   zGemma3RMSNorm.__init__)r?   )r~   r   r   r   r   rl   r   rg   rg   r   rh   r   u  s     r   c                   @   sv   e Zd ZddefddZe				ddedB ded dedB dedB d	e	d
e
f f
ddZe edddZdS )Gemma3RotaryEmbeddingNr   c                 C   s   t j  |j| _|j| _|| _tt|j	| _	i | _
| j	D ]P}| jj| }|d u r+q|d | j
|< | j}| j
| dkrCt| j
|  }|| j||d\}}| j| d|dd | j| d| dd t| | d| qd S )	Nro   rp   
layer_type	_inv_freqFr   _original_inv_freq_attention_scaling)nnModulerl   rK   max_seq_len_cachedoriginal_max_seq_lenr   r   setrV   ro   rY   compute_default_rope_parametersr   r   clonesetattr)rf   r   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingrg   rg   rh   rl   {  s&   

zGemma3RotaryEmbedding.__init__r   ztorch.deviceseq_lenr   returnztorch.Tensorc                 C   s^   | j | d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        rq   rI   Nr   r   r    r   r   r   )	rY   getattrrD   rG   r   arangeint64r   r   )r   r   r   r   baser   attention_factorinv_freqrg   rg   rh   r     s   &z5Gemma3RotaryEmbedding.compute_default_rope_parametersc                 C   s  t | | d}t | | d}|d d d d f  |jd dd|j}|d d d d d f  }t|jjtrE|jjdkrE|jjnd}t	|dd	) | |  
dd
}	tj|	|	fdd}
|
 | }|
 | }W d    n1 syw   Y  |j|jd|j|jdfS )Nr   r   r   r@   mpscpuF)device_typeenabledr    r   r   )r   r   expandshaper   r   r   typer   r   	transposer   catcossinr   )rf   xposition_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   rg   rg   rh   r     s   .&zGemma3RotaryEmbedding.forward)NNNNNNr}   )r~   r   r   r1   rl   staticmethodr   r   r   tupler   r   r   no_gradr   r   rg   rg   rg   rh   r   z  s*    
#r   c                       s   e Zd Zdedef fddZ				ddejdejdejdB d	edB d
ej	dB de
e deejejdB eej dB f fddZ  ZS )Gemma3Attentionr   	layer_idxc                    sd   t  || | jdkr|jnd | _| jdk| _| jj | _t|j	|j
d| _t|j	|j
d| _d S )Nr\   r   )r   rl   r   rU   
is_slidingr   r]   	is_causalr   rI   rM   q_normk_normrf   r   r   r   rg   rh   rl     s   zGemma3Attention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionrm   r   c                 K   s0  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ura|||d}|
|
|| j|\}
}t| jjt}|| |	|
||f| jrv| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nr   r@   r    )r   r   r   rA   )dropoutscalingrU   )r   rI   q_projviewr   k_projv_projr   r   r)   ru   r   r   get_interfacer   _attn_implementationr*   trainingrS   r  rU   reshape
contiguouso_proj)rf   r   r   r   r   r   rm   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightsrg   rg   rh   r     s>   	

	

zGemma3Attention.forwardr   )r~   r   r   r1   r   rl   r   r   r   
LongTensorr   r   r   r   r   rg   rg   r   rh   r     s*    r   c                       s   e Zd Zdedef fddZ					ddejdejdejdB d	ejdB d
e	dB dejdB de
e deejeejejf dB f fddZ  ZS )Gemma3DecoderLayerr   r   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )N)r   r   r   )r   rl   r   rD   r   rV   attention_typer   	self_attnr   mlpr   rM   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   rg   rh   rl     s   

zGemma3DecoderLayer.__init__Nr   r   r   r   r   r   rm   r   c           
   	   K   sp   |}|  |}| jd||||||d|\}}	| |}|| }|}| |}| |}| |}|| }|S )N)r   r   r   r   r   r   rg   )r  r  r  r  r  r  )
rf   r   r   r   r   r   r   rm   residual_rg   rg   rh   r     s(   



	


zGemma3DecoderLayer.forward)NNNNN)r~   r   r   r1   r   rl   r   r   r  r   r   r   r   FloatTensorr   r   rg   rg   r   rh   r    s0    	r  c                   @   s,   e Zd ZdZdZg dZe dd ZdS )Gemma3PreTrainedModelmodel)imagetext)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                 C   s   t | | t|trt|j d S d|jjv r!t|j	 d S t|t
r0t|j|j d S t|trm|jD ]6}|j}|j| dkrKt|j|  }||j|d\}}tt|| d| tt|| d| q8d S d S )NRMSNormrp   r   r   r   )r   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightr   r~   r   r   	constant_r   r   r   rV   r   ro   r   r   copy_r   )rf   moduler   r   r   r   rg   rg   rh   r*  B  s"   



z#Gemma3PreTrainedModel._init_weightsN)	r~   r   r   base_model_prefixinput_modalities_no_split_modulesr   r   r*  rg   rg   rg   rh   r"  8  s    r"  rU   r   c              
      s&   dt dt dt dt dtf
 fdd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                    s   t ||  k S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)r5  r6  r7  r8  rU   rg   rh   
inner_mask[  s   z1_bidirectional_window_overlay.<locals>.inner_mask)r   ra   )rU   r;  rg   r:  rh   _bidirectional_window_overlayV  s   "r<  c                       s   e Zd ZU eed< dZdef fddZ							ddejdB dej	dB dejdB d	e
dB d
ejdB dedB dejdB dee defddZ  ZS )Gemma3TextModelr   r%  c                    s0   t  | t|j|j| j| jjd d| _d S )N      ?)r   )r   rl   r   rC   rD   r   r   embed_tokensr   r   rg   rh   rl   g  s   zGemma3TextModel.__init__Nr   r   r   r   inputs_embedsrN   r   rm   r   c              	   K   sv  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
ts|| j|||||d}| }| jjrmdd |d	< t| jj|d	< tdi |tdi |d
}
|}i }| jjD ]}| |||||< q| jd | jj D ]}||f|
|j ||j |||d|}q| |}t||dS )N:You must specify exactly one of input_ids or inputs_embeds)r   r   r@   r   r   rA  r   r   r   r   c                  W   s   t jdt jdS )NTr   )r   r   ra   )argsrg   rg   rh   <lambda>  s    z)Gemma3TextModel.forward.<locals>.<lambda>or_mask_functionrZ   )r   r   r   r   r   )last_hidden_stater   rg   )
ValueErrorr@  r	   r   get_seq_lengthr   r   r   r   	unsqueezer   r   copyr]   r<  rU   r   r   rV   
rotary_emblayersrF   r  normr   )rf   r   r   r   r   rA  rN   r   rm   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr   r   r   decoder_layerrg   rg   rh   r   o  sb   



zGemma3TextModel.forward)NNNNNNN)r~   r   r   r1   __annotations__r3  rl   r   r  r   r   r!  ra   r   r   r   r   r   rg   rg   r   rh   r=  c  s<   
 
	
r=  c                       s,   e Zd ZU eed< def fddZ  ZS )Gemma3ForCausalLMr   c                    s   t  | t|| _d S r}   )r   rl   r=  r#  r   r   rg   rh   rl     s   zGemma3ForCausalLM.__init__)r~   r   r   r1   rU  rl   r   rg   rg   r   rh   rV    s   
 rV  c                       s2   e Zd Zdef fddZdejfddZ  ZS )r+  r   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr  r?  )kernel_sizestride)r   rl   r   	Parameterr   zerosr   rD   r   r.  r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager   tokens_per_siderW  	AvgPool2davg_poolr   r   rg   rh   rl     s   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr@   r    )r   r   r	  r_  r
  rb  flattenr\  r   matmulr.  type_as)	rf   rc  
batch_sizer   rD   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsrg   rg   rh   r     s   



z!Gemma3MultiModalProjector.forward)	r~   r   r   r   rl   r   r   r   r   rg   rg   r   rh   r+    s    r+  input_embedsz5.6.0rA  )versionnew_nameFr   r   r   r   r   token_type_idspixel_valuesis_trainingis_first_iterationc
                 K   s   |r
|du r
t d|  |||||d}|	dur|	n|du p&|j p&|du}	|durh|	rh|dk|j}tjj|dddddddf }|| @ }tj	|
 dd	d }t||d}t||j||d
< tdi |S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingrD  r@   )r@   r   r   )valuer   r   rG  rg   )rI  get_text_configis_initializedr   r   r   
functionalpadr   cumsumr   wherer/   r   )r   rA  r   r   r   r   ro  rp  rq  rr  rm   rR  is_imageis_previous_imagenew_image_startimage_group_idsrg   rg   rh   create_causal_mask_mapping  s.   "
r~  c                       s   e Zd ZdZdef fddZeedddej	de
e d	eeB fd
dZee										ddejdB dej	dB dejdB dejdB dedB dejdB dejdB dej	dB dejdB dedB de
e d	eeB fddZ  ZS )Gemma3ModelFr   c                    s   t  | | `d S r}   )r   rl   text_config_dtyper   r   rg   rh   rl   %  s   zGemma3Model.__init__zOProjects the last hidden state from the vision model into language model space.)custom_introrp  rm   r   c                 K   s,   | j d|dd|}|j}| ||_|S )NT)rp  return_dictrg   )vision_towerrH  multi_modal_projectorpooler_output)rf   rp  rm   rc  rH  rg   rg   rh   get_image_features)  s   zGemma3Model.get_image_featuresNr   r   r   r   ro  r   rA  labelsrN   	lm_kwargsc                 K   sV  |d u |d uA rt d|d ur&| jj| jkr&|| jjk}| }d||< n|}|d u r2|  |}|d u rN|d ur>| nd}tj|||j	d  |j
d}|d urp| j|ddj}||j
|j}| j|||d}|||}t| }tst| j|||||||| jd	}| jd|||||
d|d	|}t|j|j|j|j|d ur|d
S d d
S )NrB  r   r@   rC  T)r  )rA  image_features)rq  )r   r   r   rA  rN   r  r   )rH  r   r   
attentionsimage_hidden_statesrg   )rI  r   r   rC   r   get_input_embeddingsrJ  r   r   r   r   r  r  r   r   get_placeholder_maskmasked_scatterr   r   r~  r  language_modelr   rH  r   r   r  )rf   r   rp  r   r   r   ro  r   rA  r  rN   r  special_image_maskllm_input_idsrP  r  rQ  outputsrg   rg   rh   r   4  sj   

zGemma3Model.forward)
NNNNNNNNNN)r~   r   r   accepts_loss_kwargsr   rl   r   r   r   r!  r   r   r   r   r  r  r   r   ra   r   r   r   rg   rg   r   rh   r  !  sb    		
r  c                       s   e Zd ZdZee											ddejdB dejdB dej	dB dejdB de
dB d	ejdB d
ejdB dejdB dejdB dedB deej	B dee deeB fddZ											d fdd	Z  ZS )Gemma3ForConditionalGenerationFNr   r   rp  r   r   r   ro  r   rA  r  rN   logits_to_keepr  r   c                 K   s^  | j d||||||||
|	|d
|}|d }t|tr"t| dn|}| |dd|ddf }d}|	dur| }|dddddf }|	dddf }|dur~|dd|jd  df |j}|||jdk 	 }|||jdk 	 }n|	 }|	 }t
 }|d| jjj}|d|j}|||}t|||j|j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )
r   rp  ro  r   r   r   rA  rN   r  r   r   N.r   r@   )losslogitsr   r   r  r  rg   )r#  r   r   slicelm_headr   r   r   r   r
  r   CrossEntropyLossr  r   r   rC   r   r   r   r  r  )rf   r   rp  r   r   r   ro  r   rA  r  rN   r  r  r  r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsrg   rg   rh   r     sP   >$
z&Gemma3ForConditionalGeneration.forwardTc                    s<   t  j|f||||||	|
||d	|}|s|	s||d< |S )N)	r   rA  r   r   r   rN   r  ro  rr  rp  )r   prepare_inputs_for_generation)rf   r   r   rA  r   r   rp  r   ro  rN   r  r  rr  rm   model_inputsr   rg   rh   r    s$   z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNNNTNNF)r~   r   r   r  r   r   r   r  r!  r   r   ra   r   r   r   r   r   r   r  r   rg   rg   r   rh   r    sl    	
qr  c                       s   e Zd ZddddZ fddZdd Zd	d
 Zee									dde	j
dB de	jdB de	jdB de	j
dB dedB de	jdB de	j
dB de	j
dB dedB dee defddZ  ZS )Gemma3ForSequenceClassificationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projector)z^language_model.modelz^vision_towerz^multi_modal_projectorc                    sB   t  | |j| _t|| _tj|jj| jdd| _	| 
  d S )NF)bias)r   rl   
num_labelsr  r#  r   Linearr   rD   score	post_initr   r   rg   rh   rl   (  s
   
z(Gemma3ForSequenceClassification.__init__c                 C   s
   | j  S r}   )r#  r  re   rg   rg   rh   r  1  s   
z4Gemma3ForSequenceClassification.get_input_embeddingsc                 C   s   | j | d S r}   )r#  set_input_embeddings)rf   rs  rg   rg   rh   r  4  r   z4Gemma3ForSequenceClassification.set_input_embeddingsNr   rp  r   r   r   rA  ro  r  rN   rm   r   c
              
   K   s6  | j |f|||||||	d|
}|j}| |}|dur#|jd }n|jd }| jjjdu r7|dkr7td| jjjdu rAd}n2|durg|| jjjk|j	t
j}t
j|jd |j	t
jd}|| d}nd}t| jj d |t
j||j	d	|f }d}|dur| j|||| jd
}t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   rp  r   r   rA  ro  rN   Nr   r@   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rC  )r  r  pooled_logitsr   )r  r  r   r   r  )r#  rH  r  r   r   r   rO   rI  r   r   r   int32r   argmaxr   warning_oncer   r~   loss_functionr   r   r   r  )rf   r   rp  r   r   r   rA  ro  r  rN   rm   transformer_outputsr   r  rg  last_non_pad_tokennon_pad_masktoken_indicesr  r  rg   rg   rh   r   7  sR   	

z'Gemma3ForSequenceClassification.forward)	NNNNNNNNN)r~   r   r   _checkpoint_conversion_mappingrl   r  r  r   r   r   r  r!  r   r   ra   r   r   r   r   r   rg   rg   r   rh   r  !  sT    		
r  c                   @   s   e Zd ZU dZeed< dZdS )#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    r   r>  N)r~   r   r   r   r1   rU  r3  rg   rg   rg   rh   r    s   
 r  )	r   r1   r"  r=  rV  r  r  r  r  )NNFN)ccollections.abcr   typingr   r   r   r   torch.nnr    r   r,  cache_utilsr   r	   configuration_utilsr
   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   gemma2.configuration_gemma2r!   gemma2.modeling_gemma2r"   r#   r$   r%   r&   r'   r(   r)   r*   paligemma.modeling_paligemmar+   r,   r-   r.   r/   siglipr0   
get_loggerr~   r   r1   r   r   r   	Embeddingr   r   r   r   r   r  GEMMA3_START_DOCSTRINGr"  r   ra   r<  r=  rV  r   r+  r   r!  r   r~  r  r  r  r  __all__rg   rg   rg   rh   <module>   s   ,
 <aP:1"Y$	
8b ^
