o
    ei                     @   s  d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z> e1?e@ZAee/ddG dd de ZBee/ddG dd  d e-ZCG d!d" d"ejDZEG d#d$ d$ejFZGG d%d& d&ejFZHG d'd( d(ejFZId)d* ZJed+did,d-ZKd.ejLd/eMd0ejLfd1d2ZN	3		djd4ejFd5ejLd6ejLd7ejLd8ejLdB d9eOd:eOdB d;eOdB d0ePejLejLf fd<d=ZQeeKG d>d? d?ejFZRG d@dA dAeZSe/G dBdC dCe)ZTdDeMd0eeMeMeMeMgeUf fdEdFZVe/G dGdH dHeTZWe/G dIdJ dJeTeZXG dKdL dLejFZYdMejLdB dNejLdB d0edB fdOdPZZe4dQdRdSdT			U	dkdVedSejLd8ejLdB dWejLdXedB dYejLdB dMejLdB dZej[dB d[eUd\eUdB d0e\fd]d^Z]e/d_dG d`da daeTZ^e/d_dG dbdc dceTeZ_G ddde deeTZ`G dfdg dgeeTZag dhZbdS )l    )Callable)	dataclass)OptionalN   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    custom_introc                   @   s$   e Zd ZU dZdZejdB ed< dS )Gemma3ModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r.   torchFloatTensor__annotations__ r6   r6   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.pyr-   8   s   
 r-   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	Gemma3CausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr.   )r/   r0   r1   r2   r9   r3   r4   r5   r:   r;   r   r<   tupler=   r.   r6   r6   r6   r7   r8   H   s   
 r8   c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s0   t  ||| || _| jdt|dd d S )NrD   F
persistent)super__init__scalar_embed_scaleregister_bufferr3   tensor)selfrA   rB   rC   rD   	__class__r6   r7   rH   k   s   z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S N)rG   forwardrD   toweightdtype)rL   rO   rM   r6   r7   rQ   p   s   z%Gemma3TextScaledWordEmbedding.forward)r@   )r/   r0   r1   r2   intfloatrH   r3   TensorrQ   __classcell__r6   r6   rM   r7   r?   f   s     r?   c                       s*   e Zd Zdef fddZdd Z  ZS )	Gemma3MLPconfigc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)rG   rH   rZ   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrL   rZ   rM   r6   r7   rH   u   s   
zGemma3MLP.__init__c                 C   s$   |  | | || | }|S rP   )rd   rf   rb   rc   )rL   xrd   r6   r6   r7   rQ      s    zGemma3MLP.forward)r/   r0   r1   r*   rH   rQ   rX   r6   r6   rM   r7   rY   t   s    
rY   c                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )Gemma3RMSNormư>dimepsc                    s&   t    || _tt|| _d S rP   )rG   rH   rl   r`   	Parameterr3   zerosrS   )rL   rk   rl   rM   r6   r7   rH      s   
zGemma3RMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr&   T)keepdim)r3   rsqrtpowmeanrl   )rL   rh   r6   r6   r7   _norm   s   $zGemma3RMSNorm._normc                 C   s*   |  | }|d| j   }||S )Nr@   )rt   rV   rS   type_as)rL   rh   outputr6   r6   r7   rQ      s   
zGemma3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r>   rS   shaperl   rL   r6   r6   r7   
extra_repr   s   zGemma3RMSNorm.extra_repr)rj   )
r/   r0   r1   rU   rV   rH   rt   rQ   ry   rX   r6   r6   rM   r7   ri      s
    ri   c                       s   e Zd ZU ejed< ddef fddZe				ddedB de	d de
dB d	edB d
edef f
ddZe edddZ  ZS )Gemma3RotaryEmbeddinginv_freqNrZ   c                    s   t    |j| _|j| _|| _tt|j| _i | _	| jD ]P}| jj
| }|d u r+q|d | j	|< | j}| j	| dkrCt| j	|  }|| j||d\}}| j| d|dd | j| d| dd t| | d| qd S )	N	rope_typedefault
layer_type	_inv_freqFrE   _original_inv_freq_attention_scaling)rG   rH   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrZ   listsetlayer_typesr|   rope_parameterscompute_default_rope_parametersr   rJ   clonesetattr)rL   rZ   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingrM   r6   r7   rH      s&   

zGemma3RotaryEmbedding.__init__r   ztorch.deviceseq_lenr   returnztorch.Tensorc                 C   s^   | j | d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNr@   r   r&   rT   r   rT   )	r   getattrr^   num_attention_headsr3   arangeint64rR   rV   )rZ   r   r   r   baserk   attention_factorr{   r6   r6   r7   r      s   &z5Gemma3RotaryEmbedding.compute_default_rope_parametersc                 C   s  t | | d}t | | d}|d d d d f  |jd dd|j}|d d d d d f  }t|jjtrE|jjdkrE|jjnd}t	|dd	) | |  
dd
}	tj|	|	fdd}
|
 | }|
 | }W d    n1 syw   Y  |j|jd|j|jdfS )Nr   r   r   ro   r(   mpscpuF)device_typeenabledr&   rk   r   )r   rV   expandrw   rR   r   
isinstancetypestrr#   	transposer3   catcossinrT   )rL   rh   position_idsr   r{   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   r6   r6   r7   rQ      s   .&zGemma3RotaryEmbedding.forward)NNNNNNrP   )r/   r0   r1   r3   rW   r5   r*   rH   staticmethodr   rU   r   r>   rV   r   no_gradr   rQ   rX   r6   r6   rM   r7   rz      s,   
 

#rz   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nro   r&   r   )rw   r3   r   )rh   x1x2r6   r6   r7   rotate_half   s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embedr6   r6   r7   apply_rotary_pos_emb   s
   

r   r<   n_repr   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r(   N)rw   r   reshape)r<   r   batchnum_key_value_headsslenr   r6   r6   r7   	repeat_kv
  s
   0r           modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d ur:|| }tjj	|dtj
d|j}tjj||| jd}t||
}|dd }||fS )N      r&   r   ro   )rk   rT   )ptrainingr(   )r   r   num_key_value_groupsr3   matmulr   tanhr`   
functionalsoftmaxfloat32rR   rT   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputr6   r6   r7   eager_attention_forward  s    

r   c                       s   e Zd ZdZdedef fddZ				ddejdejd	ejdB d
e	dB dej
dB dee deejejdB eej dB f fddZ  ZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrZ   	layer_idxc                    sB  t    t|dr|j| nd | _|| _|| _t|d|j|j	 | _
|j	|j | _|jd | _| jj| _| jj | _tj|j|j	| j
 |jd| _tj|j|j| j
 |jd| _tj|j|j| j
 |jd| _tj|j	| j
 |j|jd| _| jj| _| jdkr|jnd | _| jdk| _t|j
|jd| _t|j
|jd| _d S )Nr   r   r   r\   sliding_attention)rk   rl   ) rG   rH   hasattrr   r   rZ   r   r   r^   r   r   r   r   query_pre_attn_scalarr   attention_dropoutuse_bidirectional_attention	is_causalr`   ra   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingri   rms_norm_epsq_normk_normrL   rZ   r   rM   r6   r7   rH   <  s4   


zGemma3Attention.__init__Nr<   position_embeddingsr   r;   cache_positionr   r   c                 K   s0  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ura|||d}|
|
|| j|\}
}t| jjt}|| |	|
||f| jrv| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nro   r(   r&   )r   r   r   r   )r   r   r   )rw   r   r   viewr   r   r   r   r   r   updater   r   get_interfacerZ   _attn_implementationr   r   r   r   r   r   r   r   )rL   r<   r   r   r;   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r6   r6   r7   rQ   Z  s>   	

	

zGemma3Attention.forwardr   )r/   r0   r1   r2   r*   rU   rH   r3   rW   r   
LongTensorr   r   r>   rQ   rX   r6   r6   rM   r7   r   8  s,    !r   c                       s   e Zd Zdedef fddZ					ddejdejdejdB d	ejdB d
e	dB dejdB de
e deejeejejf dB f fddZ  ZS )Gemma3DecoderLayerrZ   r   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )N)rZ   r   rl   )rG   rH   rZ   r^   r   r   attention_typer   	self_attnrY   mlpri   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   rM   r6   r7   rH     s   

zGemma3DecoderLayer.__init__Nr<   r   r   r   r;   r   r   r   c           
   	   K   sp   |}|  |}| jd||||||d|\}}	| |}|| }|}| |}| |}| |}|| }|S )N)r<   r   r   r   r;   r   r6   )r  r  r  r  r  r  )
rL   r<   r   r   r   r;   r   r   residual_r6   r6   r7   rQ     s(   



	


zGemma3DecoderLayer.forward)NNNNN)r/   r0   r1   r*   rU   rH   r3   rW   r   r   r   r   r>   r4   rQ   rX   r6   r6   rM   r7   r     s0    	r   c                       sf   e Zd ZU eed< dZdZg dZdgZdZ	dZ
dZdZdZeedZdZe  fdd	Z  ZS )
Gemma3PreTrainedModelrZ   modelT)r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr;   )r<   r=   )imagetextc                    s   t  | t|trt|j d S d|jjv r!t|j	 d S t|t
r0t|j|j d S t|trm|jD ]6}|j}|j| dkrKt|j|  }||j|d\}}tt|| d| tt|| d| q8d S d S )NRMSNormr}   r~   r   r   )rG   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightrN   r/   rS   r?   	constant_rD   rI   rz   r   r   r|   r   rZ   copy_r   )rL   r   r   r   r   r  rM   r6   r7   r    s"   



z#Gemma3PreTrainedModel._init_weights)r/   r0   r1   r)   r5   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsinput_modalitiesr3   r   r  rX   r6   r6   rM   r7   r	    s"   
 r	  r   c              
      s&   dt dt dt dt dtf
 fdd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                    s   t ||  k S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)r#  r$  r%  r&  r   r6   r7   
inner_mask  s   z1_bidirectional_window_overlay.<locals>.inner_maskrU   bool)r   r)  r6   r(  r7   _bidirectional_window_overlay  s   "r,  c                       s   e Zd ZU eed< dZdef fddZeee								dde
jdB de
jdB de
jdB d	edB d
e
jdB dedB de
jdB dee defddZ  ZS )Gemma3TextModelrZ   r  c                    s   t     j| _ j| _t j j| j| jjd d| _t	
 fddt jD | _t j jd| _t | _d| _|   d S )N      ?)rD   c                    s   g | ]}t  |qS r6   )r   ).0r   rZ   r6   r7   
<listcomp>  s    z,Gemma3TextModel.__init__.<locals>.<listcomp>r   F)rG   rH   pad_token_idrC   
vocab_sizer?   r^   rZ   embed_tokensr`   
ModuleListrangenum_hidden_layerslayersri   r   normrz   
rotary_embgradient_checkpointing	post_initrg   rM   r1  r7   rH     s   
zGemma3TextModel.__init__NrO   r   r   r;   inputs_embeds	use_cacher   r   r   c              	   K   sv  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
ts|| j|||||d}| }| jjrmdd |d	< t| jj|d	< tdi |tdi |d
}
|}i }| jjD ]}| |||||< q| jd | jj D ]}||f|
|j ||j |||d|}q| |}t||dS )N:You must specify exactly one of input_ids or inputs_embedsr1  r   r(   r   rZ   r>  r   r   r;   r   c                  W   s   t jdt jdS )NTr   )r3   rK   r+  )argsr6   r6   r7   <lambda>:  s    z)Gemma3TextModel.forward.<locals>.<lambda>or_mask_function)full_attentionr   )r   r   r   r;   r   )last_hidden_stater;   r6   )
ValueErrorr5  r	   rZ   get_seq_lengthr3   r   rw   r   r   r   dictcopyr   r,  r   r   r   r   r;  r9  r8  r   r:  r   )rL   rO   r   r   r;   r>  r?  r   r   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr<   r   r   decoder_layerr6   r6   r7   rQ     sb   



zGemma3TextModel.forward)NNNNNNN)r/   r0   r1   r*   r5   r"  rH   r$   r%   r   r3   r   rW   r   r4   r+  r   r   r   rQ   rX   r6   r6   rM   r7   r-    sB   
 	
r-  c                       s   e Zd ZU ddiZddiZddgdgfiZeed< def fdd	Ze	e
	
	
	
	
	
	
	
	
	ddejd
B dejd
B dejd
B ded
B dejd
B dejd
B ded
B dejd
B deejB dee defddZ  ZS )Gemma3ForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr<   r:   rZ   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r[   )
rG   rH   r-  r
  r4  r`   ra   r^   rS  r=  rg   rM   r6   r7   rH   c  s
   
zGemma3ForCausalLM.__init__Nr   rO   r   r   r;   r>  labelsr?  r   logits_to_keepr   r   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }| jjdurE|| jj }t	|}|| jj }d}|durW| j
||| jfi |
}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)rO   r   r   r;   r>  r?  r   Nr9   r:   r;   r<   r=   r6   )r
  rG  r   rU   slicerS  rZ   final_logit_softcappingr3   r   loss_functionr4  r   r;   r<   r=   )rL   rO   r   r   r;   r>  rU  r?  r   rV  r   outputsr<   slice_indicesr:   r9   r6   r6   r7   rQ   l  s8   !
zGemma3ForCausalLM.forward)	NNNNNNNNr   )r/   r0   r1   _tied_weights_keys_tp_plan_pp_planr*   r5   rH   r   r   r3   r   rW   r   r4   r+  rU   r   r   r   rQ   rX   r6   r6   rM   r7   rQ  \  sP   
 		
rQ  c                       s2   e Zd Zdef fddZdejfddZ  ZS )r  rZ   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r/  )kernel_sizestride)rG   rH   r`   rm   r3   rn   vision_configr^   text_configr  ri   layer_norm_epsmm_soft_emb_normrU   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider`  	AvgPool2davg_poolrg   rM   r6   r7   rH     s   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr(   r&   )rw   r   r   rh  r   rl  flattenre  r3   r   r  ru   )	rL   rm  
batch_sizer  r^   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsr6   r6   r7   rQ     s   



z!Gemma3MultiModalProjector.forward)	r/   r0   r1   r)   rH   r3   rW   rQ   rX   r6   r6   rM   r7   r    s    r  token_type_idsimage_group_idsc              
      s4   du rdS dt dt dt dt dtf
 fdd}|S )	z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Nr#  r$  r%  r&  r   c                    s   t |jd k |d}t |jd k |d}| |f }t |jd k |d}| |f }t |jd k |d} | |f }t | jd k |d} | |f }	t | jd k |	d}	|dk|dk@ }
||	k}|
|@ S )Nr(   r   ro   )r3   whererw   )r#  r$  r%  r&  
safe_q_idxsafe_kv_idxtoken_type_ids_at_q_idxtoken_type_ids_at_kv_idximage_group_ids_at_q_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockru  rt  r6   r7   r)    s   z0token_type_ids_mask_function.<locals>.inner_maskr*  )rt  ru  r)  r6   r  r7   token_type_ids_mask_function  s   	$r  input_embeds5.6.0r>  versionnew_nameFrZ   r   r;   r   pixel_valuesis_trainingis_first_iterationc
                 K   s   |r
|du r
t d|  |||||d}|	dur|	n|du p&|j p&|du}	|durh|	rh|dk|j}tjj|dddddddf }|| @ }tj	|
 dd	d }t||d}t||j||d
< tdi |S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingrB  r(   )r(   r   r   )r   ro   r   rE  r6   )rH  get_text_configis_initializedrR   r   r`   r   padr3   cumsumrU   rv  r  r   )rZ   r>  r   r   r;   r   rt  r  r  r  r   rN  is_imageis_previous_imagenew_image_startru  r6   r6   r7   create_causal_mask_mapping  s.   "
r  zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                       s   e Zd ZddiZdZdef fddZdd Zd	d
 Ze	e
dddejdee deeB fddZdejdejdejfddZe	e
										d"dejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dejdB dedB dee deeB fd d!Z  ZS )#Gemma3Modelzlanguage_model.modellanguage_modelFrZ   c                    sP   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
|   d S )Nr1  )rG   rH   r'   from_configrb  vision_towerr  multi_modal_projectorrc  r4  r  r=  )rL   rZ   r  rM   r6   r7   rH   =  s   

zGemma3Model.__init__c                 C   
   | j  S rP   )r  get_input_embeddingsrx   r6   r6   r7   r  G     
z Gemma3Model.get_input_embeddingsc                 C      | j | d S rP   )r  set_input_embeddingsrL   r   r6   r6   r7   r  J     z Gemma3Model.set_input_embeddingszOProjects the last hidden state from the vision model into language model space.r+   r  r   r   c                 K   s,   | j d|dd|}|j}| ||_|S )NT)r  return_dictr6   )r  rG  r  pooler_output)rL   r  r   rm  rG  r6   r6   r7   get_image_featuresM  s   zGemma3Model.get_image_featuresrO   r>  image_featuresc                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)rT   r   ro   r   r(   z6Image features and image tokens do not match, tokens: z, features: )r  r3   rK   rZ   image_token_idlongr   allsumrw   r   	expand_asrR   r!   numel)rL   rO   r>  r  special_image_maskn_image_tokensn_image_featuresr6   r6   r7   get_placeholder_maskX  s   z Gemma3Model.get_placeholder_maskNr   r   r;   rt  r   rU  r?  	lm_kwargsc                 K   sV  |du |duA rt d|dur&| jj| jkr&|| jjk}| }d||< n|}|du r2|  |}|du rN|dur>| nd}tj|||j	d  |j
d}|durp| j|ddj}||j
|j}| j|||d}|||}t| }tst| j|||||||| jd		}| jd|||||
d|d
|}t|j|j|j|j|dur|dS ddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nr@  r   r(   rA  T)r  )r>  r  )r  )r   r   r;   r>  r?  r  r   )rG  r;   r<   r=   r.   r6   )rH  rZ   r  r4  r   r  rI  r3   r   rw   r   r  r  rR   rT   r  masked_scatterr   rJ  r  r   r  r-   rG  r;   r<   r=   )rL   rO   r  r   r   r;   rt  r   r>  rU  r?  r  r  llm_input_idsrL  r  rM  r[  r6   r6   r7   rQ   p  sj   -

zGemma3Model.forward)
NNNNNNNNNN)r/   r0   r1   _checkpoint_conversion_mappingaccepts_loss_kwargsr)   rH   r  r  r   r   r3   r4   r   r   r>   r   r  r   r  rW   r   r+  r-   rQ   rX   r6   r6   rM   r7   r  3  sv    
	
	
r  c                       s  e Zd ZdddddZddiZdZd	ef fd
dZdd Zdd Z	e
dejdee fddZee
											d-dejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dejdB dedB deejB d ee d!eeB fd"d#Z								$			d. fd%d&	Zeed'd(dd)		d/d	edejdejdB dejdedB dejdB dejdB d*edB d!efd+d,Z  ZS )0Gemma3ForConditionalGenerationmodel.language_modelmodel.vision_towermodel.multi_modal_projectorrS  )^language_model.model^vision_tower^multi_modal_projectorz^language_model.lm_headrR  z(model.language_model.embed_tokens.weightFrZ   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S r[   )rG   rH   r  r
  r`   ra   rc  r^   r4  rS  r=  rg   rM   r6   r7   rH     s   
z'Gemma3ForConditionalGeneration.__init__c                 C   r  rP   r
  r  rx   r6   r6   r7   r    r  z3Gemma3ForConditionalGeneration.get_input_embeddingsc                 C   r  rP   r
  r  r  r6   r6   r7   r    r  z3Gemma3ForConditionalGeneration.set_input_embeddingsr  r   c                 K   s   | j j|fi |S rP   )r
  r  )rL   r  r   r6   r6   r7   r    s   z1Gemma3ForConditionalGeneration.get_image_featuresNr   rO   r   r   r;   rt  r   r>  rU  r?  rV  r  r   c                 K   s^  | j d||||||||
|	|d
|}|d }t|tr"t| dn|}| |dd|ddf }d}|	dur| }|dddddf }|	dddf }|dur~|dd|jd  df |j}|||jdk 	 }|||jdk 	 }n|	 }|	 }t
 }|d| jjj}|d|j}|||}t|||j|j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )
rO   r  rt  r   r   r;   r>  r?  rU  r   r   N.ro   r(   )r9   r:   r;   r<   r=   r.   r6   )r
  r   rU   rX  rS  rV   rw   rR   r   r   r`   CrossEntropyLossr   rZ   rc  r4  r8   r;   r<   r=   r.   )rL   rO   r  r   r   r;   rt  r   r>  rU  r?  rV  r  r[  r<   r\  r:   r9   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsr6   r6   r7   rQ     sP   >$
z&Gemma3ForConditionalGeneration.forwardTc                    s<   t  j|f||||||	|
||d	|}|s|	s||d< |S )N)	r;   r>  r   r   r   r?  rV  rt  r  r  )rG   prepare_inputs_for_generation)rL   rO   r;   r>  r   r   r  r   rt  r?  rV  rU  r  r   model_inputsrM   r6   r7   r  m  s$   z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationr  r  r  r  c           	      K   s.   t | ||||||fd|idd | D S )Nr  c                 S   s   i | ]\}}|d kr||qS )r  r6   )r0  r   vr6   r6   r7   
<dictcomp>  s    zLGemma3ForConditionalGeneration.create_masks_for_generate.<locals>.<dictcomp>)r  items)	rZ   r>  r   r   r;   r   rt  r  r   r6   r6   r7   r     s   	z8Gemma3ForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNr   )NNNNNNNTNNF)NF) r/   r0   r1   r  r]  r  r)   rH   r  r  r   r3   r4   r   r   r  r   r   rW   r   r+  rU   r>   r8   rQ   r  r   r"   r
   rJ  r   rX   r6   r6   rM   r7   r    s    	
q(
r  c                       s   e Zd ZddddZ fddZdd Zd	d
 Zee									dde	j
dB de	jdB de	jdB de	j
dB dedB de	jdB de	j
dB de	j
dB dedB dee defddZ  ZS )Gemma3ForSequenceClassificationr  r  r  )r  r  r  c                    sB   t  | |j| _t|| _tj|jj| jdd| _	| 
  d S r[   )rG   rH   
num_labelsr  r
  r`   ra   rc  r^   scorer=  rg   rM   r6   r7   rH     s
   
z(Gemma3ForSequenceClassification.__init__c                 C   r  rP   r  rx   r6   r6   r7   r    r  z4Gemma3ForSequenceClassification.get_input_embeddingsc                 C   r  rP   r  r  r6   r6   r7   r    r  z4Gemma3ForSequenceClassification.set_input_embeddingsNrO   r  r   r   r;   r>  rt  rU  r?  r   r   c
              
   K   s6  | j |f|||||||	d|
}|j}| |}|dur#|jd }n|jd }| jjjdu r7|dkr7td| jjjdu rAd}n2|durg|| jjjk|j	t
j}t
j|jd |j	t
jd}|| d}nd}t| jj d |t
j||j	d	|f }d}|dur| j|||| jd
}t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r  r   r;   r>  rt  r?  Nr   r(   z=Cannot handle batch sizes > 1 if no padding token is defined.ro   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rA  )r:   rU  pooled_logitsrZ   rW  )r
  rG  r  rw   rZ   rc  r3  rH  rR   r   r3   int32r   argmaxloggerwarning_oncerN   r/   rZ  r   r;   r<   r=   )rL   rO   r  r   r   r;   r>  rt  rU  r?  r   transformer_outputsr<   r:   ro  last_non_pad_tokennon_pad_masktoken_indicesr  r9   r6   r6   r7   rQ     sR   	

z'Gemma3ForSequenceClassification.forward)	NNNNNNNNN)r/   r0   r1   r  rH   r  r  r   r   r3   r   r4   rW   r   r+  r   r   r   rQ   rX   r6   r6   rM   r7   r    sT    		
r  c                   @   s   e Zd ZU dZeed< dZdS )#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    rZ   r.  N)r/   r0   r1   r2   r*   r5   r"  r6   r6   r6   r7   r    s   
 r  )r	  r-  rQ  r  r  r  r  )r(   )r   NN)NNFN)ccollections.abcr   dataclassesr   typingr   r3   torch.nnr`    r   r  activationsr   cache_utilsr   r	   configuration_utilsr
   
generationr   integrationsr   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r    r!   utils.deprecationr"   utils.genericr#   r$   utils.output_capturingr%   autor'   configuration_gemma3r)   r*   
get_loggerr/   r  r-   r8   	Embeddingr?   ModulerY   ri   rz   r   r   rW   rU   r   rV   r>   r   r   r   r	  r+  r,  r-  rQ  r  r  r4   rJ  r  r  r  r  r  __all__r6   r6   r6   r7   <module>   s    

Q

"Q1"+gQ$
(	
8 % P^
