o
    iM                     @   sl  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 e.8e9Z:ee,ddG dd deZ;ee,ddG dd de*Z<G dd  d e
j=Z>G d!d" d"e
j?Z@G d#d$ d$e
j?ZAG d%d& d&e
j?ZBd'd( ZCdYd)d*ZDd+ejEd,eFd-ejEfd.d/ZG	0		dZd1e
j?d2ejEd3ejEd4ejEd5eejE d6eHd7eeH d8eeH d-eIejEejEf fd9d:ZJG d;d< d<e
j?ZKG d=d> d>eZLe,G d?d@ d@e&ZMdAeFd-eeFeFeFeFgeNf fdBdCZOe,G dDdE dEeMZPe,G dFdG dGeMeZQG dHdI dIe
j?ZRdJeejE dKeejE dLeFd-ee fdMdNZSe,dOdG dPdQ dQeMZTe,dOdG dRdS dSeMeZUG dTdU dUeMZVG dVdW dWeeMZWg dXZXdS )[    N)Callable)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)PretrainedConfig)GenerationMixin)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    )custom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )Gemma3ModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r(   r   torchFloatTensor__annotations__ r0   r0   c/home/ubuntu/LTX-2/.venv/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.pyr'   3   s   
 r'   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	Gemma3CausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr(   )r)   r*   r+   r,   r3   r   r-   r.   r/   r4   r5   r   r6   tupler7   r(   r0   r0   r0   r1   r2   C   s   
 r2   c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr>   F
persistent)super__init__register_bufferr-   tensor)selfr;   r<   r=   r>   	__class__r0   r1   rB   f   s   z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S N)rA   forwardr>   toweightdtype)rE   rH   rF   r0   r1   rJ   j   s   z%Gemma3TextScaledWordEmbedding.forward)r:   )r)   r*   r+   r,   intfloatrB   r-   TensorrJ   __classcell__r0   r0   rF   r1   r9   a   s     r9   c                       s*   e Zd Zdef fddZdd Z  ZS )	Gemma3MLPconfigc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)rA   rB   rS   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrE   rS   rF   r0   r1   rB   o   s   
zGemma3MLP.__init__c                 C   s$   |  | | || | }|S rI   )r]   r_   r[   r\   )rE   xr]   r0   r0   r1   rJ   y   s    zGemma3MLP.forward)r)   r*   r+   r%   rB   rJ   rQ   r0   r0   rF   r1   rR   n   s    
rR   c                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )Gemma3RMSNormư>dimepsc                    s&   t    || _tt|| _d S rI   )rA   rB   re   rY   	Parameterr-   zerosrL   )rE   rd   re   rF   r0   r1   rB      s   
zGemma3RMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr!   T)keepdim)r-   rsqrtpowmeanre   )rE   ra   r0   r0   r1   _norm   s   $zGemma3RMSNorm._normc                 C   s*   |  | }|d| j   }||S )Nr:   )rm   rO   rL   type_as)rE   ra   outputr0   r0   r1   rJ      s   
zGemma3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r8   rL   shapere   rE   r0   r0   r1   
extra_repr   s   zGemma3RMSNorm.extra_repr)rc   )
r)   r*   r+   rN   rO   rB   rm   rJ   rr   rQ   r0   r0   rF   r1   rb   ~   s
    rb   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	Gemma3RotaryEmbeddinginv_freqNrS   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrt   Fr?   )rA   rB   hasattr
isinstanceru   dictgetrv   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrS   r   rope_init_fnattention_scalingrC   rt   original_inv_freq)rE   rS   devicert   rF   r0   r1   rB      s   
zGemma3RotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   rh   r#   mpscpuF)device_typeenabledr!   rd   rM   )rt   rO   expandrp   rK   r   rz   rw   strr-   autocast	transposecatcosr   sinrM   )
rE   ra   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r0   r0   r1   rJ      s   0&zGemma3RotaryEmbedding.forwardrI   )r)   r*   r+   r-   rP   r/   r%   rB   no_gradr   rJ   rQ   r0   r0   rF   r1   rs      s   
 
rs   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nrh   r!   r   )rp   r-   r   )ra   x1x2r0   r0   r1   rotate_half   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embedr0   r0   r1   apply_rotary_pos_emb   s
   

r   r6   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)rp   r   reshape)r6   r   batchnum_key_value_headsslenhead_dimr0   r0   r1   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d urM|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r!   r   rh   )rd   rM   )ptrainingr#   )r   r   num_key_value_groupsr-   matmulr   tanhrp   rY   
functionalsoftmaxfloat32rK   rM   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr0   r0   r1   eager_attention_forward   s"   

&r   c                       s   e Zd ZdZdedef fddZedddd		
	
ddej	dej	de
ej	 de
e de
ej dee deej	e
ej	 e
eej	  f fddZ  ZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrS   	layer_idxc                    s(  t    |j| dk| _|| _|| _t|d|j|j | _	|j|j
 | _|jd | _| jj| _| jj | _tj|j|j| j	 |jd| _tj|j|j
| j	 |jd| _tj|j|j
| j	 |jd| _tj|j| j	 |j|jd| _| jj| _| jr}|jnd | _t|j	|jd| _t|j	|jd| _d S )Nsliding_attentionr   r   rU   )rd   re   )rA   rB   layer_types
is_slidingrS   r   getattrrW   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropoutuse_bidirectional_attention	is_causalrY   rZ   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowrb   rms_norm_epsq_normk_normrE   rS   r   rF   r0   r1   rB   
  s2   


zGemma3Attention.__init__past_key_valuer5   4.58new_nameversionNr6   position_embeddingsr   cache_positionr   r   c                 K   s<  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ura|||d}|
|
|| j|\}
}t}| jjdkrot| jj }|| |	|
||f| jr|| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nrh   r#   r!   )r   r   r   eagerr   )r   r   r   )rp   r   r   viewr   r   r   r   r   r   updater   r   rS   _attn_implementationr   r   r   r   r   r   r   r   )rE   r6   r   r   r5   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r0   r0   r1   rJ   '  s>   


	

zGemma3Attention.forward)NN)r)   r*   r+   r,   r%   rN   rB   r   r-   rP   r   r   
LongTensorr   r   r8   rJ   rQ   r0   r0   rF   r1   r     s*    r   c                       s   e Zd Zdedef fddZedddd							
	
		ddejdejdejde	ej de	ej
 de	e de	e de	e de	ej
 deeje	eejejf  f fddZ  ZS )Gemma3DecoderLayerrS   r   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )N)rS   r   re   )rA   rB   rS   rW   r   r   attention_typer   	self_attnrR   mlprb   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   rF   r0   r1   rB   Y  s   

zGemma3DecoderLayer.__init__r   r5   r   r   NFr6   position_embeddings_globalposition_embeddings_localr   r   output_attentions	use_cacher   r   c
                 K   s   |}|  |}| jjr|}n|}| jd||||||||	d|
\}}| |}|| }|}| |}| |}| |}|| }|f}|rK||f7 }|S )N)r6   r   r   r   r5   r   r   r   r0   )r   r   r   r   r   r   r   )rE   r6   r   r   r   r   r5   r   r   r   r   residualr   self_attn_weightsoutputsr0   r0   r1   rJ   f  s8   
	





zGemma3DecoderLayer.forward)NNNFFN)r)   r*   r+   r%   rN   rB   r   r-   rP   r   r   r   boolr8   r.   rJ   rQ   r0   r0   rF   r1   r   X  s<    	
r   c                       sZ   e Zd ZU eed< dZdZg dZdgZdZ	dZ
dZdZdZeedZ fddZ  ZS )	Gemma3PreTrainedModelrS    T)r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr5   )r6   r7   c                    sF   t  | t|tr|jj  d S d|jjv r!|j	j  d S d S )NRMSNorm)
rA   _init_weightsrz   Gemma3MultiModalProjectormm_input_projection_weightdatazero_rG   r)   rL   )rE   r   rF   r0   r1   r    s   
z#Gemma3PreTrainedModel._init_weights)r)   r*   r+   r$   r/   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr  rQ   r0   r0   rF   r1   r     s   
 r   r   c              
      s&   dt dt dt dt dtf
 fdd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                    s   t ||  k S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)r  r  r  r  r   r0   r1   
inner_mask  s   z1_bidirectional_window_overlay.<locals>.inner_maskrN   r   )r   r  r0   r  r1   _bidirectional_window_overlay  s   "r  c                       s   e Zd ZU eed< def fddZee									ddee	j
 dee	j dee	j
 dee d	ee	j d
ee dee dee dee	j
 dee defddZ  ZS )Gemma3TextModelrS   c                    s   t     j| _ j| _t j j| j| jjd d| _t	
 fddt jD | _t j jd| _t d| _d| _t   j _dd	i _t d| _|   d S )
N      ?)r>   c                    s   g | ]}t  |qS r0   )r   ).0r   rS   r0   r1   
<listcomp>  s    z,Gemma3TextModel.__init__.<locals>.<listcomp>r   r  Frv   rx   )rA   rB   pad_token_idr=   
vocab_sizer9   rW   rS   embed_tokensrY   
ModuleListrangenum_hidden_layerslayersrb   r   normrs   
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetaru   rotary_emb_local	post_initr`   rF   r  r1   rB     s"   

zGemma3TextModel.__init__NrH   r   r   r5   inputs_embedsr   r   output_hidden_statesr   r   r   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rQ|d u rQ| jsQt
| j d}|	d u rm|d ur]| nd}tj|||jd  |jd}	|d u rv|	d}t| }ts| j |||	||d}| }| j jrd	d
 |d< t| j j|d< tdi |tdi |d}|}| ||}| ||}|rdnd }|rdnd }| jd | j j D ]*}|r||f7 }||f||||j |||||	d|
}|d }|r||d f7 }q| |}|r||f7 }t||||dS )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr  r   r#   r   rS   input_embedsr   r   r5   r   c                  W   s   t jdt jdS )NTr   )r-   rD   r   )argsr0   r0   r1   <lambda>"  s    z)Gemma3TextModel.forward.<locals>.<lambda>or_mask_functionfull_attentionr   r0   )r   r   r   r   r5   r   r   r   )last_hidden_stater5   r6   r7   ) rS   r   r0  r   
ValueErrorr(  r   loggerwarning_oncer!  r	   get_seq_lengthr-   arangerp   r   r   rz   r{   r)  r   r  r   r   r   r'  r-  r%  r$  r   r&  r   )rE   rH   r   r   r5   r/  r   r   r0  r   r   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr6   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr0   r0   r1   rJ     s   





zGemma3TextModel.forward	NNNNNNNNN)r)   r*   r+   r%   r/   rB   r    r   r   r-   r   rP   r   r.   r   r   r   r   rJ   rQ   r0   r0   rF   r1   r    sJ   
 	
r  c                       s   e Zd ZU dgZddiZddgdgfiZeed< dZdef fdd	Z	e
e	
	
	
	
	
	
	
	
	
	
	ddeej deej deej dee deej deej dee dee dee deej deeejf defddZ  ZS )Gemma3ForCausalLMlm_head.weightlm_headcolwise_repr6   r4   rS   language_modelc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rT   )
rA   rB   r  modelr   rY   rZ   rW   rK  r.  r`   rF   r0   r1   rB   a  s
   
zGemma3ForCausalLM.__init__Nr   rH   r   r   r5   r/  labelsr   r   r0  r   logits_to_keepr   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }| j j	dur[|| j j	 }t
|}|| j j	 }d}|durm| j||| jfi |}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```N)	rH   r   r   r5   r/  r   r   r0  r   r3   r4   r5   r6   r7   r0   )rS   r   r0  rN  r:  rz   rN   slicerK  final_logit_softcappingr-   r   loss_functionr   r   r5   r6   r7   )rE   rH   r   r   r5   r/  rO  r   r   r0  r   rP  r   r   r6   slice_indicesr4   r3   r0   r0   r1   rJ   j  sB   #

zGemma3ForCausalLM.forward)NNNNNNNNNNr   )r)   r*   r+   _tied_weights_keys_tp_plan_pp_planr%   r/   r  rB   r   r   r   r-   r   rP   r   r.   r   r   rN   r   rJ   rQ   r0   r0   rF   r1   rI  Y  sZ   
 		
rI  c                       s2   e Zd Zdef fddZdejfddZ  ZS )r  rS   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r  )kernel_sizestride)rA   rB   rY   rf   r-   rg   vision_configrW   text_configr  rb   layer_norm_epsmm_soft_emb_normrN   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_siderY  	AvgPool2davg_poolr`   rF   r0   r1   rB     s   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr#   r!   )rp   r   r   ra  r   re  flattenr^  r-   r   r  rn   )	rE   rf  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsr0   r0   r1   rJ     s   



z!Gemma3MultiModalProjector.forward)	r)   r*   r+   r$   rB   r-   rP   rJ   rQ   r0   r0   rF   r1   r    s    r  token_type_idsimage_group_idstokens_per_imagec              
      s4   du rdS dt dt dt dt dtf
 fdd}|S )	z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Nr  r  r  r  r   c           	         s   t |jd k |d}| |f }t |jd k |d} | |f }t | jd k |d}| |f dk|dk@ } | |f |k}||@ S )Nr#   r   rh   )r-   whererp   )	r  r  r  r  safe_idxtoken_type_ids_at_kv_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockrp  ro  r0   r1   r    s   z0token_type_ids_mask_function.<locals>.inner_maskr  )ro  rp  rq  r  r0   rx  r1   token_type_ids_mask_function  s   
$ry  zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                !       s2  e Zd ZddiZdZdef fddZdd Zd	d
 Zdd Z	dd Z
dejdejfddZdejdejdejfddZee													d%deej deej deej deej dee deej deej deej deej dee d ee d!ee d"ee deeef fd#d$Z  ZS )&Gemma3Modelzlanguage_model.modelrM  FrS   c                    sj   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| jjd ur,| jjnd| _|   d S )Nr  rh   )rA   rB   r"   from_configr[  vision_towerr  multi_modal_projectorr\  r   rM  rS   r  r.  )rE   rS   rM  rF   r0   r1   rB     s   

zGemma3Model.__init__c                 C   
   | j  S rI   )rM  get_input_embeddingsrq   r0   r0   r1   r       
z Gemma3Model.get_input_embeddingsc                 C      | j | d S rI   )rM  set_input_embeddingsrE   r   r0   r0   r1   r       z Gemma3Model.set_input_embeddingsc                 C   s
   || _ d S rI   rM  rE   decoderr0   r0   r1   set_decoder  r  zGemma3Model.set_decoderc                 C   s   | j S rI   r  rq   r0   r0   r1   get_decoder  s   zGemma3Model.get_decoderpixel_valuesr   c                 C   s   | j |dj}| |}|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r  )r|  r:  r}  )rE   r  rf  image_featuresr0   r0   r1   get_image_features  s   

zGemma3Model.get_image_featuresrH   r/  r  c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}|jd |jd  }||  | krPtd| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)rM   r   rh   r   r#   z6Image features and image tokens do not match: tokens: z, features )r  r-   rD   rS   image_token_idlongr   allsumr   	expand_asrK   rp   numelr;  )rE   rH   r/  r  special_image_maskn_image_tokensn_image_featuresr0   r0   r1   get_placeholder_mask*  s   z Gemma3Model.get_placeholder_maskNr   r   r5   ro  r   rO  r   r   r0  return_dictc                 K   sf  |du |duA rt d|dur|n| jj}|dur|n| jj}|dur&|n| jj}|durD| jj| jkrD|| jjk}| }d||< n|}|du rP|  |}|du rl|dur\|	 nd}t
j|||jd  |jd}|dur| |}||j|j}| j|||d}|||}t| }ts| j |||||d}|
 p|du p|j p|du}|dur|r|dk|j}|tjj|ddd	dddd
f  @ }t
j| ddd }t
||t
j|d
|jd}t||j|| jj|d< t di |t!di |d}| j"d|||||
||d|d	|}t#|j$|
r!|j%nd|j&|j'|dur/|dS ddS )a]  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nr1  r   r#   r2  )r/  r  r3  r#   r   r   rh   r   r7  r8  T)	r   r   r5   r/  r   r   r0  r  r   )r:  r5   r6   r7   r(   r0   )(r;  rS   r   r0  use_return_dictr  r   cloner  r>  r-   r?  rp   r   r  rK   rM   r  masked_scatterrz   r{   get_text_configis_initializedrY   r   padcumsumrN   rr  	full_likery  rb  r   r   rM  r'   r:  r5   r6   r7   )rE   rH   r  r   r   r5   ro  r   r/  rO  r   r   r0  r  	lm_kwargsr  llm_input_idsr@  r  rA  rB  
is_prefillis_imagenew_image_startrp  r   r0   r0   r1   rJ   B  s   .

(
zGemma3Model.forward)NNNNNNNNNNNNN)r)   r*   r+   _checkpoint_conversion_mappingaccepts_loss_kwargsr$   rB   r  r  r  r  r-   rP   r  r   r.   r  r   r   r   r   r   r   r8   r'   rJ   rQ   r0   r0   rF   r1   rz    sx    
	

rz  c                "       s  e Zd ZdddddZdgZdZdef fd	d
Zdd Zdd Z	dd Z
dd Zdd Zedd Zedd Zedd Ze														d4deej deej deej d eej d!ee d"eej d#eej d$eej d%eej d&ee d'ee d(ee d)ee d*eeejf d+eeef fd,d-Z								.		d5 fd/d0	Ze	d6de d1ejdeej d#ejd!ee d eej d"eej d+e!fd2d3Z"  Z#S )7Gemma3ForConditionalGenerationmodel.language_modelmodel.vision_towermodel.multi_modal_projectorrK  )^language_model.model^vision_tower^multi_modal_projectorz^language_model.lm_headrJ  FrS   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S rT   )rA   rB   rz  rN  rY   rZ   r\  rW   r   rK  r.  r`   rF   r0   r1   rB     s   
z'Gemma3ForConditionalGeneration.__init__c                 C   r~  rI   rN  r  rq   r0   r0   r1   r    r  z3Gemma3ForConditionalGeneration.get_input_embeddingsc                 C   r  rI   rN  r  r  r0   r0   r1   r    r  z3Gemma3ForConditionalGeneration.set_input_embeddingsc                 C   r  rI   )rN  r  r  r0   r0   r1   r    r  z*Gemma3ForConditionalGeneration.set_decoderc                 C   r~  rI   )rN  r  rq   r0   r0   r1   r    r  z*Gemma3ForConditionalGeneration.get_decoderc                 C   s   | j |S rI   )rN  r  )rE   r  r0   r0   r1   r    s   z1Gemma3ForConditionalGeneration.get_image_featuresc                 C      | j jS rI   )rN  rM  rq   r0   r0   r1   rM       z-Gemma3ForConditionalGeneration.language_modelc                 C   r  rI   )rN  r|  rq   r0   r0   r1   r|    r  z+Gemma3ForConditionalGeneration.vision_towerc                 C   r  rI   )rN  r}  rq   r0   r0   r1   r}    r  z4Gemma3ForConditionalGeneration.multi_modal_projectorNr   rH   r  r   r   r5   ro  r   r/  rO  r   r   r0  r  rP  r   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||||d|}|d }t|trCt| dn|}| |dd|ddf }d}|	dur|	 }|dddddf }|	dddf }|dur|dd|j
d  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)rH   r  ro  r   r   r5   r/  r   rO  r   r0  r  r   r   .rh   r#   )r3   r4   r5   r6   r7   r(   r0   )rS   r   r0  r  rN  rz   rN   rR  rK  rO   rp   rK   r   r   rY   CrossEntropyLossr   r\  r   r2   r5   r6   r7   r(   )rE   rH   r  r   r   r5   ro  r   r/  rO  r   r   r0  r  rP  r  r   r6   rU  r4   r3   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsro   r0   r0   r1   rJ     sd   @$
z&Gemma3ForConditionalGeneration.forwardTc                    s>   t  j|f||||||	|
|d|}|d dkr||d< |S )N)r5   r/  r   r   r   r   rP  ro  r   r  )rA   prepare_inputs_for_generation)rE   rH   r5   r/  r   r   r  r   ro  r   rP  rO  r   model_inputsrF   r0   r1   r    s"   
z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationr4  c                 K   s   |   |||||d}|d urU|jd dkrU|dk|j}	|	tjj|	dddd d d df  @ }
tj|
	 ddd }t
|	|t|d}t||j|| j|d< td	i |S )
Nr3  r#   r  r   r  rh   r   r7  r0   )r  rp   rK   r   rY   r   r  r-   r  rN   rr  r  ry  rb  r   )rS   r4  r   r   r5   r   ro  r   rB  r  r  rp  r0   r0   r1   r     s    	(z8Gemma3ForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNNNNr   )
NNNNNNNTNNrI   )$r)   r*   r+   r  rV  r  r$   rB   r  r  r  r  r  propertyrM  r|  r}  r   r   r-   r   r.   rP   r   r   r   rN   r8   r2   rJ   r  staticmethodr
   r{   r   rQ   r0   r0   rF   r1   r    s    


	

 $	r  c                       s   e Zd ZddddZ fddZdd Zd	d
 Zee									dde	e
j de	e
j de	e
j de	e
j de	e de	e
j de	e
j de	e
j de	e dee defddZ  ZS )Gemma3ForSequenceClassificationr  r  r  )r  r  r  c                    sB   t  | |j| _t|| _tj|jj| jdd| _	| 
  d S rT   )rA   rB   
num_labelsrz  rN  rY   rZ   r\  rW   scorer.  r`   rF   r0   r1   rB     s
   
z(Gemma3ForSequenceClassification.__init__c                 C   r~  rI   r  rq   r0   r0   r1   r    r  z4Gemma3ForSequenceClassification.get_input_embeddingsc                 C   r  rI   r  r  r0   r0   r1   r    r  z4Gemma3ForSequenceClassification.set_input_embeddingsNrH   r  r   r   r5   r/  ro  rO  r   r   r   c
              
   K   s6  | j |f|||||||	d|
}|j}| |}|dur#|jd }n|jd }| jjjdu r7|dkr7td| jjjdu rAd}n2|durg|| jjjk|j	t
j}t
j|jd |j	t
jd}|| d}nd}t| jj d |t
j||j	d	|f }d}|dur| j|||| jd
}t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r  r   r5   r/  ro  r   Nr   r#   z=Cannot handle batch sizes > 1 if no padding token is defined.rh   )r   rM   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r2  )r4   rO  pooled_logitsrS   rQ  )rN  r:  r  rp   rS   r\  r  r;  rK   r   r-   int32r?  argmaxr<  r=  rG   r)   rT  r   r5   r6   r7   )rE   rH   r  r   r   r5   r/  ro  rO  r   r   transformer_outputsr6   r4   rh  last_non_pad_tokennon_pad_masktoken_indicesr  r3   r0   r0   r1   rJ     sR   	

z'Gemma3ForSequenceClassification.forwardrH  )r)   r*   r+   r  rB   r  r  r   r   r   r-   r   r.   rP   r   r   r   r   r   rJ   rQ   r0   r0   rF   r1   r    sT    		
r  c                   @   s   e Zd ZU dZeed< dS )#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    rS   N)r)   r*   r+   r,   r%   r/   r0   r0   r0   r1   r  ,  s   
 r  )r   r  rI  r  rz  r  r  )Nr#   )r   NN)Yr)  collections.abcr   dataclassesr   typingr   r   r-   torch.nnrY   activationsr   cache_utilsr   r	   configuration_utilsr
   
generationr   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   utils.genericr    autor"   configuration_gemma3r$   r%   
get_loggerr)   r<  r'   r2   	Embeddingr9   ModulerR   rb   rs   r   r   rP   rN   r   rO   r8   r   r   r   r   r   r  r  rI  r  ry  rz  r  r  r  __all__r0   r0   r0   r1   <module>   s   

$


#QB" [$
! U w^	