o
    ei                    @   s@  d dl Z d dlmZ d dlmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZM e/NeOZPG dd de:eZQG dd de9ZRG dd de:eZSG dd deZTG d d! d!e@ZUG d"d# d#e=ZVG d$d% d%eAZWG d&d' d'e<ZXG d(d) d)e<ZYdLd+eZd,efd-d.Z[G d/d0 d0eKZ\G d1d2 d2eKZ]G d3d4 d4eLZ^G d5d6 d6eJZ_G d7d8 d8e>Z`G d9d: d:eBZae-G d;d< d<e?ZbG d=d> d>ebZcG d?d@ d@ebZdG dAdB dBebZee-G dCdD dDebZfG dEdF dFebeZge-G dGdH dHebZhe-G dIdJ dJebZig dKZjdS )M    N)Callable)AnyOptional   )initialization)DynamicCacheEncoderDecoderCacheStaticCache)PreTrainedConfiglayer_type_validation)GenerationConfigGenerationMixinGenerationMode)create_bidirectional_mask)FlashAttentionKwargs)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPoolingSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSRopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel)Gemma3ConfigGemma3TextConfig)Gemma3Attention	Gemma3MLPGemma3MultiModalProjectorGemma3PreTrainedModelGemma3RMSNormGemma3RotaryEmbeddingGemma3TextScaledWordEmbeddingapply_rotary_pos_embcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forward)SiglipVisionConfig)T5GemmaClassificationHeadT5GemmaEncoderLayerT5GemmaLMHeadbidirectional_mask_functionc                2   @     e Zd ZdZdZ									
																d/dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB d edB d!edB d"edB d#edB d$edB d%edB d&edB d'edB d(edB d)e	e dB d*edB d+edB d,e
eee
f B dB f.d-d.ZdS )0T5Gemma2TextConfiga  
    This is the configuration class to store the configuration of a [`T5Gemma2TextModel`]. It is used to instantiate the encoder's
    text model portion of the T5Gemma2 Model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the T5Gemma2Text-7B.
    e.g. [google/t5gemma2_text-7b](https://huggingface.co/google/t5gemma2_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the T5Gemma2Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Gemma2TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5Gemma2Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
    t5gemma2_text@   	   $              gelu_pytorch_tanh   {Gz?ư>Tr      r%   F           N
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersc                       | _ | _| _| _|	 _| _| _| _| _| _	| _
|
 _| _| _| _| _| _| _| _| _| _| _|dd _ jd u r[ fddt jD  _t j j | _tjdi | d S )Nsliding_window_pattern   c                    &   g | ]}t |d   j rdndqS rG   sliding_attentionfull_attentionbool_sliding_window_pattern.0iself k/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/t5gemma2/modular_t5gemma2.py
<listcomp>       z/T5Gemma2TextConfig.__init__.<locals>.<listcomp>rp   rV   rX   rW   rJ   rR   rK   rL   rM   rN   rP   rO   rS   rT   rU   rY   rZ   rQ   r[   r\   r^   r_   r]   getrj   ranger   r`   r
   __init__ro   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   kwargsrp   rn   rq   rw      <   

zT5Gemma2TextConfig.__init__r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   Tr   rG   r%   FrH   rB   rI   NNNN__name__
__module____qualname____doc__
model_typeintstrfloatri   listr   dictrw   rp   rp   rp   rq   r:   L       D	

r:   c                   @   s   e Zd ZdZeedZdS )T5Gemma2EncoderConfigt5gemma2_encoder)text_configvision_configN)r}   r~   r   r   r:   r4   sub_configsrp   rp   rp   rq   r      s
    
r   c                2   @   r9   )0T5Gemma2DecoderConfiga
  
    This is the configuration class to store the configuration of a [`T5Gemma2DecoderModel`]. It is used to instantiate the decoder
    text model portion of the T5Gemma2 Model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the T5Gemma2Decoder-7B.
    e.g. [google/t5gemma2_text-7b](https://huggingface.co/google/t5gemma2_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the T5Gemma2Decoder model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Gemma2DecoderModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5Gemma2Decoder, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
    t5gemma2_decoderr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   Tr   rG   r%   FrH   rI   NrJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   c                    ra   )Nrb   rc   c                    rd   re   rh   rk   rn   rp   rq   rr   Y  rs   z2T5Gemma2DecoderConfig.__init__.<locals>.<listcomp>rp   rt   rx   rp   rn   rq   rw   #  rz   zT5Gemma2DecoderConfig.__init__r{   r|   rp   rp   rp   rq   r      r   r   c                       s   e Zd ZdZdZdgZeedZdddZ						
	
	
				ddee
eef B dB dee
eef B dB dedededededededB f fddZ  ZS )T5Gemma2ConfigaV  
    This is the configuration class to store the configuration of a [`T5Gemma2Model`]. It is used to instantiate an T5Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma3 encoder-decoder model.
    e.g. [google/t5gemma-2-270m-270m](https://huggingface.co/google/t5gemma-2-270m-270m)
    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PreTrainedConfig] for more information.

    Args:
        encoder (`Union[T5Gemma2EncoderConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5Gemma2DecoderConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        image_token_index (`int`, *optional*, defaults to 256001):
            The image token index to encode the image prompt. Defaults to 256001, which is right after the eoi_token_index.
            Note this is different from Gemma 3.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import T5Gemma2Config, T5Gemma2Model
    >>> t5gemma2_config = T5Gemma2Config.from_pretrained("google/t5gemma-270m-270m")
    >>> model = T5Gemma2Model(t5gemma2_config)
    ```
    t5gemma2past_key_values)encoderdecoderimage_token_indexeoi_token_index)image_token_ideoi_token_idNTrH   rE    r   r   is_encoder_decoderdropout_raterZ   classifier_dropout_raterS   tie_word_embeddingsc
                    s  t |trtdi |}n|d u rt }td nt |ts(tt| dt |tr5tdi |}n|d u rBt }td nt |tsPtt| d|jj	|j	kretd|jj	 d|j	 d|sktd|jj
|j
krtd|jj
 d|j
 d||j_||j_||j_||_|| _||_||_|| _d	D ]}||
vrt|||
|< q|| _|| _|j| _|| _|	| _t jdd
|i|
 d S )NzDencoder is None, using default T5Gemma2EncoderConfig encoder config.z is not supported.zDdecoder is None, using default T5Gemma2DecoderConfig decoder config.zBImbalanced encoder-decoder is not supported in T5Gemma2: encoder (z) vs decoder (z).z4T5Gemma2Model only support encoder-decoder modeling.zRImbalanced encoder-decoder vocabulary size is not supported in T5Gemma2: encoder ()rX   rV   rW   rJ   r   rp   )
isinstancer   r   loggerinfo
ValueErrortyper   r   rK   rJ   r   rZ   r   r   r   r   getattrr   rS   r   r   superrw   )ro   r   r   r   r   rZ   r   rS   r   r   ry   special_token_key	__class__rp   rq   rw     sd   



zT5Gemma2Config.__init__)	NNTrH   rH   rH   rE   r   T)r}   r~   r   r   r   keys_to_ignore_at_inferencer   r   r   attribute_mapr   r   r   ri   r   r   rw   __classcell__rp   rp   r   rq   r   c  sL    $	
r   c                   @      e Zd ZdS )T5Gemma2RMSNormNr}   r~   r   rp   rp   rp   rq   r         r   c                       s*   e Zd Zdef fddZdd Z  ZS )T5Gemma2MLPconfigc                    s   t  | t|j| _d S N)r   rw   nnDropoutr   dropoutro   r   r   rp   rq   rw     s   zT5Gemma2MLP.__init__c                 C   s2   |  | || | }| |}| |}|S r   )act_fn	gate_projup_projr   	down_proj)ro   xhidden_statesr   rp   rp   rq   forward  s   

zT5Gemma2MLP.forward)r}   r~   r   r:   rw   r   r   rp   rp   r   rq   r     s    r   c                       sl   e Zd Zddef fddZe				ddedB ded dedB dedB d	e	d
e
f f
 fddZ  ZS )T5Gemma2RotaryEmbeddingNr   c                    s   t  || d S r   r   rw   )ro   r   devicer   rp   rq   rw     s   z T5Gemma2RotaryEmbedding.__init__r   ztorch.deviceseq_len
layer_typereturnztorch.Tensorc                    s   t  | |||S r   )r   compute_default_rope_parameters)r   r   r   r   r   rp   rq   r     s   z7T5Gemma2RotaryEmbedding.compute_default_rope_parametersr   )NNNN)r}   r~   r   r:   rw   staticmethodr   r   r   tupler   r   r   rp   rp   r   rq   r     s$    
r   c                       s&   e Zd Zdedef fddZ  ZS )T5Gemma2SelfAttentionr   	layer_idxc                       t  || d| _d S NFr   rw   	is_causalro   r   r   r   rp   rq   rw        
zT5Gemma2SelfAttention.__init__)r}   r~   r   r:   r   rw   r   rp   rp   r   rq   r     s    r   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	ejdB d
ejde
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )T5Gemma2MergedAttentionz6Merged self-attention and cross-attention for decoder.r   r   c                    r   r   r   r   r   rp   rq   rw     r   z T5Gemma2MergedAttention.__init__Nr   position_embeddingsmerged_attention_maskencoder_hidden_statesr   cache_positionry   r   c                 K   s^  |j d d }g |d| jR }	|j d d }
g |
d| jR }| ||	dd}| ||	dd}| ||	dd}| |}| |}|\}}t	||||\}}|d ur|||d}|j
}|||| j|\}}|j| j}|j}|d u s|s| ||dd}| ||dd}| |}|d ur|||| j\}}d|j| j< n|j| j j}|j| j j}|}|
d }tj||gdd}tj||gdd}t| jjt}|| ||||f| jr| jnd| jd|\}}|jg |dR   }| |}|d ur&|d	d | f }|d	| d f }nd
\}}|||fS )NrG   r%   )sincosr   TdimrH   )r   scaling.NN) shaperP   q_projview	transposek_projv_projq_normk_normr0   self_attention_cacheupdater   
is_updatedru   cross_attention_cachelayerskeysvaluestorchcatr   get_interfacer   _attn_implementationr3   trainingrZ   r   reshape
contiguouso_proj)ro   r   r   r   r   r   r   ry   input_shapehidden_shapecross_input_shapecross_hidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   r   r   cross_key_statescross_value_statescross_key_sizeattention_interfaceattn_outputattn_weightsself_attn_weightscross_attn_weightsrp   rp   rq   r     sr   







zT5Gemma2MergedAttention.forwardr   )r}   r~   r   r   r:   r   rw   r   Tensorr   r   
LongTensorr   r   r   r   rp   rp   r   rq   r      s,    	
r   Tr\   r   c              
      s(   dt dt dt dt dtf
 fdd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c           	         s\    rd}}nd d d d }}|| }|dk||k @ }|dk | |k @ }||B S )Nr   rG   r%   rp   )	r  r  r  r  left_window_sizeright_window_sizedist	left_mask
right_maskr   r\   rp   rq   
inner_maskh  s   z0sliding_window_mask_function.<locals>.inner_mask)r   ri   )r\   r   r  rp   r  rq   sliding_window_mask_functionc  s   $r  c                   @   r   )T5Gemma2EncoderLayerNr   rp   rp   rp   rq   r  v  r   r  c                       s   e Zd ZdZdef fddZ						ddejdeejejf d	ejdB d
ej	dB de
dB dedB dej	dB dejdB dejfddZ  ZS )T5Gemma2DecoderLayerzFDecoder sub-layer: merged attention instead of vanilla self-attention.r   c                    s    t  || t||d| _d S )N)r   r   )r   rw   r   	self_attnr   r   rp   rq   rw   }  s
   zT5Gemma2DecoderLayer.__init__NFr   r   r   position_idsr   rU   r   r   r   c	                 K   s   |}
|  |}| jd||||||||d|	\}}}| |}|
| | }|}
| |}| |}| |}|
| | }|S )N)r   r   r   r  r   rU   r   r   rp   )pre_self_attn_layernormr  post_self_attn_layernormr   pre_feedforward_layernormmlppost_feedforward_layernorm)ro   r   r   r   r  r   rU   r   r   ry   residual_rp   rp   rq   r     s,   
	



zT5Gemma2DecoderLayer.forward)NNNFNN)r}   r~   r   r   r   rw   r   r   r   r  r   ri   FloatTensorr   r   rp   rp   r   rq   r  z  s8    	r  c                   @   r   )T5Gemma2LMHeadNr   rp   rp   rp   rq   r    r   r  c                   @   r   )T5Gemma2ClassificationHeadNr   rp   rp   rp   rq   r    r   r  c                       s"   e Zd Zdef fddZ  ZS )T5Gemma2MultiModalProjectorr   c                    s   t  | d S r   r   r   r   rp   rq   rw        z$T5Gemma2MultiModalProjector.__init__)r}   r~   r   r   rw   r   rp   rp   r   rq   r    s    r  c                       sP   e Zd ZdZ		ddededededef
 fd	d
Zdejf fddZ	  Z
S )T5Gemma2TextScaledWordEmbeddingzCT5Gemma2 Embedding: override to add eoi token embedding separately.      ?  num_embeddingsembedding_dimpadding_idxembed_scaler   c                    s0   t  |||| || _tt| j| _d S r   )	r   rw   r   r   	Parameterr   zerosr"  eoi_embedding)ro   r!  r"  r#  r$  r   r   rp   rq   rw     s   z(T5Gemma2TextScaledWordEmbedding.__init__	input_idsc                    s8   t  || j| jj }| j|j||| jk< |S r   )r   r   r$  toweightdtyper'  r   )ro   r(  input_embeddingsr   rp   rq   r     s   z'T5Gemma2TextScaledWordEmbedding.forward)r  r   )r}   r~   r   r   r   r   rw   r   r   r   r   rp   rp   r   rq   r    s     r  c                   @   sp   e Zd ZU eed< dZdZdZdZg dZ	e
egeedddeedddeed	d
dgdZdd Zdd ZdS )T5Gemma2PreTrainedModelr   modelTF)r  r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadrG   r  )index
layer_namer%   
cross_attn)r   
attentionsc                 C   sX  t | | t|trt|j d S t|tr(t|j t	|j
|j d S t|tr]|jjjd d }tj|jjd| jj| d t|jdrY|jjd ur[t|jj d S d S d S d|jjv rkt|j d S t|tr|jD ]6}|j}|j| dkrt|j|  }||j|d\}}tt|| d	| tt|| d
| qsd S d S )Nr   g      rH   )meanstdbiasRMSNormdefault)r   	_inv_freq_original_inv_freq)r   _init_weightsr   r  initzeros_mm_input_projection_weightr  r'  	constant_r$  scalar_embed_scaler  out_projr*  r   normal_r   rS   hasattrr8  r   r}   r   r]   r   	rope_typer   copy_r   )ro   modulescaler   rope_init_fncurr_inv_freqr  rp   rp   rq   r=    s0   




z%T5Gemma2PreTrainedModel._init_weightsc                 C   s|   | j j}|j}|j}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   rG   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r   r   rX   rV   r   	new_zerosr   clonemasked_fill_)ro   r(  decoder_configdecoder_start_token_idrV   shifted_input_idsrp   rp   rq   %prepare_decoder_input_ids_from_labels  s    z=T5Gemma2PreTrainedModel.prepare_decoder_input_ids_from_labelsN)r}   r~   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_flex_attn_no_split_modulesr  r  r#   r   r   _can_record_outputsr=  rR  rp   rp   rp   rq   r-    s   
 	r-  c                       s   e Zd ZU eed< eedZ	ddedef fddZ	e
ee					ddejdB d	ejdB d
ejdB dejdB dejdB dee defddZ  ZS )T5Gemma2TextEncoderr   )r5  r   r   r   c                    s   t     j| _ j| _t j j| j jd |d| _t j j	d| _
d| _t fddt jD | _t j| _t | _|   d S )N      ?r$  r   epsFc                       g | ]}t  |qS rp   )r  rl   r   r   rp   rq   rr   8      z0T5Gemma2TextEncoder.__init__.<locals>.<listcomp>r   rw   rV   r#  rJ   r  rK   embed_tokensr   rT   normgradient_checkpointingr   
ModuleListrv   rM   r   r   r   r   r   
rotary_emb	post_initro   r   r   r   ra  rq   rw   $  s$   
zT5Gemma2TextEncoder.__init__Nr(  attention_maskr  inputs_embedstoken_type_idsry   r   c              	   K   s4  |d u |d uA rt d|dd  |d u r| |}|d u r.tjd|jd |jdd}t| }t	sU| j
||d}tdi |tdi |dt| j
jdd	id
}|}	i }
| j
jD ]}| |	|||
|< q]| |	}	| jd | j
j D ]}||	|
|j ||j |fi |}	qw| |	}	| |	}	t|	dS )N:You must specify exactly one of input_ids or inputs_embedsr   r   rG   r   )r   rl  rk  and_mask_functionF)r   rg   rf   )last_hidden_staterp   )r   poprd  r   aranger   r   	unsqueezer   r   r   r   r  r\   r]   rh  r   r   rM   attention_typere  r   )ro   r(  rk  r  rl  rm  ry   self_attn_mask_mappingmask_kwargsr   r   r   layer_modulerp   rp   rq   r   @  sL   
	


zT5Gemma2TextEncoder.forwardr   )NNNNN)r}   r~   r   r:   rS  r   r  rY  r   rw   r"   r$   r   r   r  r   r  r   r   r   r   r   rp   rp   r   rq   rZ    sD   
 	rZ  c                       s   e Zd ZU eed< 	ddedef fddZdd Zdd	 Ze	e
d
ejdee deeB fddZdejdB dejdB dejfddZe
						ddejdB dejdB dejdB dejdB d
ejdB dejdB dee defddZ  ZS )T5Gemma2Encoderr   r   r   c                    sD   t  | tj|j|d| _tj|jd| _	t
|| _|   d S )N)r   ra  )r   rw   rZ  _from_configr   
text_modelr&   from_configr   vision_towerr  multi_modal_projectorri  rj  r   rp   rq   rw     s
   
zT5Gemma2Encoder.__init__c                 C   
   | j  S r   )r}  get_input_embeddingsrn   rp   rp   rq   r       
z$T5Gemma2Encoder.get_input_embeddingsc                 C      | j |S r   )r}  set_input_embeddingsro   new_embeddingsrp   rp   rq   r       z$T5Gemma2Encoder.set_input_embeddingspixel_valuesry   r   c                 K   s0   | j d|dd|}|j}| |}||_|S )NT)r  return_dictrp   )r  rr  r  pooler_output)ro   r  ry   vision_outputsrr  image_featuresrp   rp   rq   get_image_features  s
   
z"T5Gemma2Encoder.get_image_featuresr(  Nrl  r  c                 C   s   | j j}|du r&|du rtd||  tj|tj|jdk}|d}n||k}|	 }|
d||j}|jd |jd  }t||  | kd| d|  |S )	z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nz9Either `input_ids` or `inputs_embeds` has to be provided.)r+  r   r   r   rG   z6Image features and image tokens do not match: tokens: z, features )r   r   r   r  r   tensorlongr   allsumru  	expand_asr)  r   r!   numel)ro   r(  rl  r  r   special_image_maskn_image_tokensn_image_featuresrp   rp   rq   get_image_placeholder_mask  s"   
z*T5Gemma2Encoder.get_image_placeholder_maskrk  r  rm  c                 K   s   |d u |d uA rt d|d u r| j|}|d ur8| j|ddj}||j|j}| j|||d}	|	|	|}| jd|||d|}
|
S )Nrn  T)r  )rl  r  )rl  rk  r  rp   )
r   r}  rd  r  r  r)  r   r+  r  masked_scatter)ro   r(  rk  r  rl  r  rm  ry   r  
image_maskoutputsrp   rp   rq   r     s&   zT5Gemma2Encoder.forwardrz  )NNNNNN)r}   r~   r   r   rS  r   rw   r  r  r   r   r   r   r   r   r   r   r  r  r  r  r   r   r   rp   rp   r   rq   r{    sd   
 
	
r{  c                       s   e Zd ZU eed< eeddeeddedZddede	f fdd	Z
eee	
	
	
	
	
	
	
	
	
ddejd
B dejd
B dejd
B ded
B dejd
B ded
B dejd
B dejd
B dejd
B dee defddZ  ZS )T5Gemma2Decoderr   rG   )r2  r%   )r5  cross_attentionsr   r   r   c                    s   t     j| _ j| _t j j j jd |d| _t j j	d| _
d| _t fddt jD | _t j| _t | _|   d S )Nr[  r\  r]  Fc                    r_  rp   )r  r`  ra  rp   rq   rr     rb  z,T5Gemma2Decoder.__init__.<locals>.<listcomp>rc  rj  r   ra  rq   rw     s$   
zT5Gemma2Decoder.__init__Nr(  rk  r  r   rl  rU   r   r   encoder_attention_maskry   r   c
              
   K   s  |d u |d uA rt d|d u rt d|d u r| |}| js0|r0|d u r0tt| jdt }|d u rL|d ur<| nd}tj|||j	d  |j
d}|d u rU|d}t| }ts| j||||d urh|jnd |d}dd	 |d
< tdi |tdi |d}t|	 }ts| j||	|d d d}dtdi |dt|	ii}tj|d |d gddtj|d |d gddd}|}i }| jjD ]}| |||||< q| |}| jd | jj D ]}||||j ||j |||||fi |
}q| |}| |}t||dS )Nrn  z0`encoder_hidden_states` must be given in decoderra  r   rG   ro  )r   rl  rk  r   r   r  c                  W   s   t jdt jdS )NT)r+  )r   r  ri   )argsrp   rp   rq   <lambda>7  s    z)T5Gemma2Decoder.forward.<locals>.<lambda>rp  rq  rg   or_mask_functionr   r   rf   )rr  r   rp   )r   rd  r   r   r   r   get_seq_lengthr   rt  r   r   ru  r   r   r   r1   r2   r8   r   r]   rh  r   r   rM   rv  re  r   )ro   r(  rk  r  r   rl  rU   r   r   r  ry   past_seen_tokensrw  rx  cross_attn_mask_mappingmerged_attn_mask_mappingr   r   r   ry  rp   rp   rq   r     s   


	

	

zT5Gemma2Decoder.forwardrz  )	NNNNNNNNN)r}   r~   r   r   rS  r#   r   r  rY  r   rw   r"   r$   r   r   r  r   r   r  ri   r   r   r   r   r   rp   rp   r   rq   r    sT   
 

	
r  c                !       s  e Zd ZdddZdef fddZdd Zd	d
 Zdd Zdd Z	e
e													d!dejdB dejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dejdB dejdB dedB dejdB dee defdd Z  ZS )"T5Gemma2Modelz&encoder.text_model.embed_tokens.weightz-encoder.text_model.embed_tokens.eoi_embedding)zdecoder.embed_tokens.weightz"decoder.embed_tokens.eoi_embeddingr   c                    s8   t  | t|j|j| _t|j|j| _|   d S r   )r   rw   r{  r   r   r  r   ri  r   r   rp   rq   rw   }  s   zT5Gemma2Model.__init__c                 C      | j S r   )r   rn   rp   rp   rq   get_encoder     zT5Gemma2Model.get_encoderc                 C   r  r   r   rn   rp   rp   rq   get_decoder  r  zT5Gemma2Model.get_decoderc                 C   r  r   )r   r  rn   rp   rp   rq   r    r  z"T5Gemma2Model.get_input_embeddingsc                 C   r  r   )r   r  r  rp   rp   rq   r    r  z"T5Gemma2Model.set_input_embeddingsNr(  r  rk  r  decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   rl  decoder_inputs_embedsrU   r   ry   r   c                 K   sz   |du r| j d||||
|dd|}|j}| jd|||||	||||dd
|}t|j|j|j|j|j|j|j|jdS )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        NT)r(  rk  r  rl  r  r  )
r(  rk  r  rl  r   r   r  rU   r   r  )rr  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsrp   )r   rr  r   r   r   r   r5  r  )ro   r(  r  rk  r  r  r  r  r  r   rl  r  rU   r   ry   r   decoder_outputsrp   rp   rq   r     sH   
zT5Gemma2Model.forward)NNNNNNNNNNNNN)r}   r~   r   _tied_weights_keysr   rw   r  r  r  r  r   r   r   r  r  
BoolTensorr   r   r   ri   r   r   r   r   r   rp   rp   r   rq   r  v  sn    		
r  c                &       s  e Zd ZddiZddiZddgdgfiZdef fdd	Zd
d Zdd Z	dd Z
dd Zdd Zdd Zeedejdee deeB fddZedd Zee															d6dejdB dejdB d ejdB d!ejdB d"ejdB d#ejdB d$ejdB d%edB d&edB d'ejdB d(ejdB d)ejdB d*edB d+ejdB d,eejB dee deej e B f"d-d.Z!d/e"d0e#d1e$d2ed3edef fd4d5Z%  Z&S )7 T5Gemma2ForConditionalGenerationzlm_head.out_proj.weightz,model.encoder.text_model.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr   logitsr   c                    sD   t  | t|| _|jj| _t|jj| j| _d| _	| 
  d S )NForMaskedLM)r   rw   r  r.  r   rJ   r  rK   lm_head	loss_typeri  r   r   rp   rq   rw     s   

z)T5Gemma2ForConditionalGeneration.__init__c                 C   s   || j _d S r   r  rC  r  rp   rp   rq   set_output_embeddings  r  z6T5Gemma2ForConditionalGeneration.set_output_embeddingsc                 C   s   | j jS r   r  rn   rp   rp   rq   get_output_embeddings  s   z6T5Gemma2ForConditionalGeneration.get_output_embeddingsc                 C   r  r   r.  r  rn   rp   rp   rq   r    r  z5T5Gemma2ForConditionalGeneration.get_input_embeddingsc                 C      | j | d S r   r.  r  ro   valuerp   rp   rq   r    r  z5T5Gemma2ForConditionalGeneration.set_input_embeddingsc                 C   r  r   )r.  r  rn   rp   rp   rq   r    r  z,T5Gemma2ForConditionalGeneration.get_encoderc                 C   r  r   )r.  r  rn   rp   rp   rq   r    r  z,T5Gemma2ForConditionalGeneration.get_decoderr  ry   r   c                 K   s   |   j|fi |S r   )r  r  )ro   r  ry   rp   rp   rq   r    s   z3T5Gemma2ForConditionalGeneration.get_image_featuresc                 C   s
   |   jS r   )r  r  rn   rp   rp   rq   r     s   
z-T5Gemma2ForConditionalGeneration.vision_towerNr   r(  rk  r  r  r  r  r  r   rl  r  labelsrU   r   logits_to_keepc                 K   s  |dur|du r|du r|  |}| jd|||||||||	|
|||d|}|j}t|tr5t| dn|}| |dd|ddf }| jj}|j	dur]||j	 }t
|}||j	 }d}|duro| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r(  r  rk  r  r  r  r  r  r   rl  r  rU   r   )	lossr  r   r  r  r  r  r   r  rp   )rR  r.  rr  r   r   slicer  r   r   r^   r   tanhloss_functionrJ   r   r   r  r  r  r  r   r  )ro   r(  r  rk  r  r  r  r  r  r   rl  r  r  rU   r   r  ry   r  r   slice_indicesr  rO  r  rp   rp   rq   r     sR   "




z(T5Gemma2ForConditionalGeneration.forwardgeneration_configmodel_kwargsgeneration_mode
batch_sizemax_cache_lengthc                    sJ  t  ||||| |jdu rdS |j}|du rd}nd|jv }t| jjdd}|`|`	||d}	|
d}
|
durqt|
tsEtdt|
jd	krT|
j
d	rTdS t|
j}|tkrh|d
 d	 jd |	d< |di |	|
_nttdi | jjdd|dt |d< t| dr| jdurt| jtstd|d | _dS dS dS )zMOverride cache preparation to support T5Gemma2-specific EncoderDecoder Cache.FN	offloadedTr  )r   
offloadingr   zaThe `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model.r   r  rG   max_cache_len_cachezLThe internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.rp   )r   _prepare_cache_for_generationrU   cache_implementationcopydeepcopyr   get_text_configr\   r]   ru   r   r   r   lenr   r   r   r	   r   r   rE  r  )ro   r  r  r  r  r  r  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr   rp   rq   r  W  sX   
	




z>T5Gemma2ForConditionalGeneration._prepare_cache_for_generation)NNNNNNNNNNNNNNr   )'r}   r~   r   r  _tp_plan_pp_planr   rw   r  r  r  r  r  r  r   r   r   r   r   r   r   r   r  propertyr  r  r  r  r   r   ri   r   r   r   r   r   r   r  r   rp   rp   r   rq   r    s    

	
Qr  c                          e Zd Zdef fddZdd Zdd Zee											dd	e	j
dB d
e	jdB de	jdB de	j
dB de	j
dB de	jdB de	j
dB dedB de	jdB de	jdB de	j
dB dee defddZ  ZS )!T5Gemma2ForSequenceClassificationr   c                    R   t  | |j| _|jj| _t|| _t|dd}t| j| j|| _	| 
  d S Nr   g?r   rw   
num_labelsr   rK   r  r.  r   r  scoreri  ro   r   classifier_dropoutr   rp   rq   rw     s   

z*T5Gemma2ForSequenceClassification.__init__c                 C   r  r   r  rn   rp   rp   rq   r    r  z6T5Gemma2ForSequenceClassification.get_input_embeddingsc                 C   r  r   r  r  rp   rp   rq   r    r  z6T5Gemma2ForSequenceClassification.set_input_embeddingsNr(  r  rk  r  r  r  r  r  rl  r  r  ry   r   c                 K   s4  |	dus|
durt d| jj d|du rtd|du r#| |}| j|f||||||||	|
dd
|}|j}|j}|j}| 	|}|j
d }|| jjk|jtj}tj|j
d |jtjd	}|| d}tj||j
d d
 d}|tj||jd|f }d}|dur| j|||| jd}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for .You have to specify input_idsF
r  rk  r  r  r  r  r  rl  r  rU   r   r   )r   r+  rG   )maxro  )r  r  pooled_logitsr   r  r  r   r5  )NotImplementedErrorr   r}   r   rR  r.  rr  r  r  r  r   r   rV   r)  r   r   int32rt  argmaxclampr  r   )ro   r(  r  rk  r  r  r  r  r  rl  r  r  ry   r  rr  r   r5  r  r  non_pad_masktoken_indiceslast_non_pad_tokenr  r  rp   rp   rq   r     sV   


z)T5Gemma2ForSequenceClassification.forwardNNNNNNNNNNN)r}   r~   r   r   rw   r  r  r   r   r   r  r  r   r   r   r   r   r   r   rp   rp   r   rq   r    sX    	
r  c                       r  )T5Gemma2ForTokenClassificationr   c                    r  r  r  r  r   rp   rq   rw     s   

z'T5Gemma2ForTokenClassification.__init__c                 C   r  r   r  rn   rp   rp   rq   r    r  z3T5Gemma2ForTokenClassification.get_input_embeddingsc                 C   r  r   r  r  rp   rp   rq   r    r  z3T5Gemma2ForTokenClassification.set_input_embeddingsNr(  r  rk  r  r  r  r  r  rl  r  r  ry   r   c                 K   s   |	dus|
durt d| jj d|du rtd|du r#| |}| j|f||||||||	|
dd
|}|j}|j}|j}| 	|}d}|durS| 
||| j}t||||dS )r  Nr  r  r  Fr  r  )r  r   r}   r   rR  r.  rr  r  r  r  r  r   r   )ro   r(  r  rk  r  r  r  r  r  rl  r  r  ry   r  rr  r   r5  r  r  rp   rp   rq   r     sJ   

z&T5Gemma2ForTokenClassification.forwardr  )r}   r~   r   r   rw   r  r  r   r   r   r  r  r   r   r   r   r   r   r   rp   rp   r   rq   r    sX    	
r  )
r   r:   r   r   r  r  r{  r-  r  r  )T)kr  collections.abcr   typingr   r   r   torch.nnr    r   r>  cache_utilsr   r   r	   configuration_utilsr
   r   
generationr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   utils.genericr"   utils.output_capturingr#   r$   autor&   gemma3.configuration_gemma3r'   r(   gemma3.modeling_gemma3r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   siglipr4   t5gemma.modeling_t5gemmar5   r6   r7   r8   
get_loggerr}   r   r:   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r-  rZ  r{  r  r  r  r  r  __all__rp   rp   rp   rq   <module>   sr   $	4
 	 yc1Oeh _ NaX