o
    wip                     @   s
  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 e,4e5Z6ee)ddG dd deZ7ee)ddG dd de(Z8G dd de
j9Z:G d d! d!e
j;Z<G d"d# d#e
j;Z=G d$d% d%e
j;Z>d&d' Z?dQd(d)Z@d*ejAd+eBd,ejAfd-d.ZC	/		dRd0e
j;d1ejAd2ejAd3ejAd4eejA d5eDd6eeD d7eeD d,eEejAejAf fd8d9ZFG d:d; d;e
j;ZGG d<d= d=eZHe)G d>d? d?e$ZIe)G d@dA dAeIZJe)G dBdC dCeIeZKG dDdE dEe
j;ZLdFeejA dGeBd,ee fdHdIZMe)dJdG dKdL dLeIZNe)dMdG dNdO dOeIeZOg dPZPdS )S    N)Callable)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)PretrainedConfig)GenerationMixin)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)deprecate_kwarg   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    )custom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )Gemma3ModelOutputWithPasta  
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r%   r   torchFloatTensor__annotations__ r-   r-   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/gemma3/modeling_gemma3.pyr$   2   s   
 r$   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeeej ef  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	Gemma3CausalLMOutputWithPasta{  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr%   )r&   r'   r(   r)   r0   r   r*   r+   r,   r1   r2   r   listr   r3   tupler4   r%   r-   r-   r-   r.   r/   H   s   
 r/   c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr<   F
persistent)super__init__register_bufferr*   tensor)selfr9   r:   r;   r<   	__class__r-   r.   r@   l   s   z&Gemma3TextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S N)r?   forwardr<   toweightdtype)rC   rF   rD   r-   r.   rH   p   s   z%Gemma3TextScaledWordEmbedding.forward)r8   )r&   r'   r(   r)   intfloatr@   r*   TensorrH   __classcell__r-   r-   rD   r.   r7   g   s     r7   c                       s*   e Zd Zdef fddZdd Z  ZS )	Gemma3MLPconfigc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)r?   r@   rQ   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrC   rQ   rD   r-   r.   r@   u   s   
zGemma3MLP.__init__c                 C   s$   |  | | || | }|S rG   )r[   r]   rY   rZ   )rC   xr[   r-   r-   r.   rH      s    zGemma3MLP.forward)r&   r'   r(   r"   r@   rH   rO   r-   r-   rD   r.   rP   t   s    
rP   c                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )Gemma3RMSNormư>dimepsc                    s&   t    || _tt|| _d S rG   )r?   r@   rc   rW   	Parameterr*   zerosrJ   )rC   rb   rc   rD   r-   r.   r@      s   
zGemma3RMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr   T)keepdim)r*   rsqrtpowmeanrc   )rC   r_   r-   r-   r.   _norm   s   $zGemma3RMSNorm._normc                 C   s*   |  | }|d| j   }||S )Nr8   )rk   rM   rJ   type_as)rC   r_   outputr-   r-   r.   rH      s   
zGemma3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r6   rJ   shaperc   rC   r-   r-   r.   
extra_repr   s   zGemma3RMSNorm.extra_repr)ra   )
r&   r'   r(   rL   rM   r@   rk   rH   rp   rO   r-   r-   rD   r.   r`      s
    r`   c                       s8   e Zd Zddef fddZe edd Z  Z	S )Gemma3RotaryEmbeddingNrQ   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqFr=   )r?   r@   hasattrrr   getrs   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrQ   r   rope_init_fnattention_scalingrA   rv   original_inv_freq)rC   rQ   devicerv   rD   r-   r.   r@      s   
zGemma3RotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   rf   r    mpscpuF)device_typeenabledr   rb   )rK   )rv   rM   expandrn   rI   r   
isinstancert   strr*   autocast	transposecatcosr}   sinrK   )
rC   r_   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r-   r-   r.   rH      s   0&zGemma3RotaryEmbedding.forwardrG   )
r&   r'   r(   r"   r@   r*   no_gradr   rH   rO   r-   r-   rD   r.   rq      s
    rq   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nrf   r   r   )rn   r*   r   )r_   x1x2r-   r-   r.   rotate_half   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embedr-   r-   r.   apply_rotary_pos_emb   s
   

r   r3   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rn   r   reshape)r3   r   batchnum_key_value_headsslenhead_dimr-   r-   r.   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d urM|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r   r   rf   )rb   rK   )ptrainingr    )r   r   num_key_value_groupsr*   matmulr   tanhrn   rW   
functionalsoftmaxfloat32rI   rK   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr-   r-   r.   eager_attention_forward   s"   

&r   c                       s   e Zd ZdZdedef fddZ		ddejdejd	e	ej d
e	e
 de	ej dee deeje	ej e	eej  f fddZ  ZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrQ   	layer_idxc                    s"  t    |j| dk| _|| _|| _t|d|j|j | _	|j|j
 | _|jd | _| jj| _d| _tj|j|j| j	 |jd| _tj|j|j
| j	 |jd| _tj|j|j
| j	 |jd| _tj|j| j	 |j|jd| _| jj| _| jrz|jnd | _t|j	|jd| _t|j	|jd| _d S )Nsliding_attentionr   r   TrS   )rb   rc   )r?   r@   layer_types
is_slidingrQ   r   getattrrU   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout	is_causalrW   rX   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowr`   rms_norm_epsq_normk_normrC   rQ   r   rD   r-   r.   r@     s2   


zGemma3Attention.__init__Nr3   position_embeddingsr   past_key_valuecache_positionr   r   c                 K   s<  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ura|||d}|
|
|| j|\}
}t}| jjdkrot| jj }|| |	|
||f| jr|| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nrf   r    r   )r   r   r   eagerr   )r   r   r   )rn   r   r   viewr   r   r   r   r   r   updater   r   rQ   _attn_implementationr   r   r   r   r   r   r   r   )rC   r3   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r-   r-   r.   rH   +  s>   	

	

zGemma3Attention.forward)NN)r&   r'   r(   r)   r"   rL   r@   r*   rN   r   r   
LongTensorr   r   r6   rH   rO   r-   r-   rD   r.   r     s(    "r   c                       s   e Zd Zdedef fddZeddd								dd
ejdejdejde	ej de	ej
 de	e de	e de	e de	ej
 deeje	eejejf  f fddZ  ZS )Gemma3DecoderLayerrQ   r   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )N)rQ   r   rc   )r?   r@   rQ   rU   r   r   attention_typer   	self_attnrP   mlpr`   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   rD   r-   r.   r@   \  s   

zGemma3DecoderLayer.__init__last_cache_positionz4.53.0)versionNFr3   position_embeddings_globalposition_embeddings_localr   r   r   output_attentions	use_cacher   r   c
                 K   s   |}|  |}| jjr|}n|}| jd||||||||	d|
\}}| |}|| }|}| |}| |}| |}|| }|f}|rK||f7 }|S )N)r3   r   r   r   r   r   r   r   r-   )r   r   r   r   r   r   r   )rC   r3   r   r   r   r   r   r   r   r   r   residualr   self_attn_weightsoutputsr-   r-   r.   rH   i  s8   
	





zGemma3DecoderLayer.forward)NNNFFN)r&   r'   r(   r"   rL   r@   r   r*   rN   r   r   r   boolr6   r+   rH   rO   r-   r-   rD   r.   r   [  s<    
	
r   c                   @   sN   e Zd ZeZdZdZg dZdgZdZ	dZ
dZdZdZdZdZdZdd ZdS )Gemma3PreTrainedModel T)r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr2   c                 C   s   | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|trT|jjd d S t|tra|jj
  d S d S )Nr   )rj   stdr8   )rQ   initializer_ranger   rW   rX   Conv2drJ   datanormal_rT   zero_	Embeddingr;   r`   fill_Gemma3MultiModalProjectormm_input_projection_weight)rC   r   r   r-   r-   r.   _init_weights  s    



z#Gemma3PreTrainedModel._init_weightsN)r&   r'   r(   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_3_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendr  r-   r-   r-   r.   r     s    r   c                       s   e Zd ZeZdef fddZdd Zdd Zee										dd	e
ej d
e
ej de
ej de
e de
ej de
e de
e de
e de
ej dee defddZ  ZS )Gemma3TextModelrQ   c                    s   t     j| _ j| _t j j| j| jjd d| _t	
 fddt jD | _t j jd| _t d| _d| _t   j _dd	i _t d| _|   d S )
N      ?)r<   c                    s   g | ]}t  |qS r-   )r   ).0r   rQ   r-   r.   
<listcomp>  s    z,Gemma3TextModel.__init__.<locals>.<listcomp>r   r  Frs   ru   )r?   r@   pad_token_idr;   
vocab_sizer7   rU   rQ   embed_tokensrW   
ModuleListrangenum_hidden_layerslayersr`   r   normrq   
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetarr   rotary_emb_local	post_initr^   rD   r  r.   r@     s"   

zGemma3TextModel.__init__c                 C      | j S rG   r  ro   r-   r-   r.   get_input_embeddings     z$Gemma3TextModel.get_input_embeddingsc                 C   
   || _ d S rG   r(  rC   r   r-   r-   r.   set_input_embeddings     
z$Gemma3TextModel.set_input_embeddingsNrF   r   r   r2   inputs_embedsr   r   output_hidden_statesr   flash_attn_kwargsr   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|d u rB| 	|}|rN|d u rN| jsNt
 }|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| }ts| j |||	||d}td	i |td	i |d}|}| ||}| ||}|rd	nd }|rd	nd }| jd | j j D ]*}|r||f7 }||f||||j |||||	d
|
}|d }|r||d f7 }q| |}|r||f7 }t||||dS )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r    r   rQ   input_embedsr   r   r2   r   full_attentionr   r-   )r   r   r   r   r   r   r   r   )last_hidden_stater2   r3   r4   )rQ   r   r0  r   
ValueErrorr   r   loggerwarning_oncer  r	   get_seq_lengthr*   arangern   r   r   r   dictr   r   r  r%  r  r  r   r  r   )rC   rF   r   r   r2   r/  r   r   r0  r   r1  past_seen_tokenscausal_mask_mappingmask_kwargsr3   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr-   r-   r.   rH     s   






zGemma3TextModel.forward)	NNNNNNNNN)r&   r'   r(   r"   r  r@   r)  r-  r   r   r   r*   r   rN   r   r+   r   r   r   r   rH   rO   r-   r-   rD   r.   r    sN    	
r  c                       s  e Zd ZdgZddiZddgdgfiZeZdZdef fdd	Z	d
d Z
dd Zdd Zdd Zdd Zdd Zee											d&deej deej deej dee deej deej dee dee d ee d!eej d"eeejf d#efd$d%Z  ZS )'Gemma3ForCausalLMlm_head.weightlm_headcolwise_repr3   r1   language_modelrQ   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rR   )
r?   r@   r  modelr  rW   rX   rU   rH  r&  r^   rD   r-   r.   r@   ]  s
   
zGemma3ForCausalLM.__init__c                 C      | j jS rG   rK  r  ro   r-   r-   r.   r)  f  s   z&Gemma3ForCausalLM.get_input_embeddingsc                 C   s   || j _d S rG   rM  r,  r-   r-   r.   r-  i     z&Gemma3ForCausalLM.set_input_embeddingsc                 C   r'  rG   rH  ro   r-   r-   r.   get_output_embeddingsl  r*  z'Gemma3ForCausalLM.get_output_embeddingsc                 C   r+  rG   rO  rC   new_embeddingsr-   r-   r.   set_output_embeddingso  r.  z'Gemma3ForCausalLM.set_output_embeddingsc                 C   r+  rG   rK  rC   decoderr-   r-   r.   set_decoderr  r.  zGemma3ForCausalLM.set_decoderc                 C   r'  rG   rT  ro   r-   r-   r.   get_decoderu  r*  zGemma3ForCausalLM.get_decoderNr   rF   r   r   r2   r/  labelsr   r   r0  r   logits_to_keepr   c                 K   s  | j r| jjdkrtd| jj d |dur|n| jj}|	dur$|	n| jj}	| jd||||||||	|
d	|}|j}t	|t
rHt| dn|}| |dd|ddf }| jjduro|| jj }t|}|| jj }d}|dur| j||| jfi |}t|||j|j|jdS )a'  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```r   zhIt is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	rF   r   r   r2   r/  r   r   r0  r   )r0   r1   r2   r3   r4   r-   )r   rQ   r   r:  r;  r   r0  rK  r8  r   rL   slicerH  final_logit_softcappingr*   r   loss_functionr  r   r2   r3   r4   )rC   rF   r   r   r2   r/  rY  r   r   r0  r   rZ  loss_kwargsr   r3   slice_indicesr1   r0   r-   r-   r.   rH   x  sN   (

zGemma3ForCausalLM.forward)NNNNNNNNNNr   )r&   r'   r(   _tied_weights_keys_tp_plan_pp_planr"   r  r  r@   r)  r-  rP  rS  rW  rX  r   r   r   r*   r   rN   r   r+   r   r   rL   r   rH   rO   r-   r-   rD   r.   rF  U  sf    		
rF  c                       s2   e Zd Zdef fddZdejfddZ  ZS )r  rQ   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r  )kernel_sizestride)r?   r@   rW   rd   r*   re   vision_configrU   text_configr  r`   layer_norm_epsmm_soft_emb_normrL   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_siderc  	AvgPool2davg_poolr^   rD   r-   r.   r@     s   
z"Gemma3MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr    r   )rn   r   r   rk  r   ro  flattenrh  r*   r   r  rl   )	rC   rp  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsr-   r-   r.   rH     s   



z!Gemma3MultiModalProjector.forward)	r&   r'   r(   r!   r@   r*   rN   rH   rO   r-   r-   rD   r.   r    s    r  token_type_idstokens_per_imagec              
      s4    du rdS dt dt dt dt dtf
 fdd}|S )	z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxr   c                    s:   t || k} | |f dk | |f dk@ }||@ S Nr    )r*   abs)r{  r|  r}  r~  same_image_blockis_image_blockry  rz  r-   r.   
inner_mask  s    z0token_type_ids_mask_function.<locals>.inner_mask)rL   r   )ry  rz  r  r-   r  r.   token_type_ids_mask_function  s   $	r  zx
    The Base Gemma3 model which consists of a vision backbone and a language model withou language modeling head.,
    c                !       s  e Zd ZddiZdZdef fddZdd Zd	d
 Zdd Z	dd Z
dejdejfddZee													d"dejdejdeej deej deeeej ef  deej deej deej deej dee dee dee dee deeef fd d!Z  ZS )#Gemma3Modelzlanguage_model.modelrJ  FrQ   c                    sj   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| jjd ur,| jjnd| _|   d S )Nr  rf   )r?   r@   r   from_configre  vision_towerr  multi_modal_projectorrf  r  rJ  rQ   r  r&  )rC   rQ   rJ  rD   r-   r.   r@     s   

zGemma3Model.__init__c                 C   
   | j  S rG   )rJ  r)  ro   r-   r-   r.   r)    r.  z Gemma3Model.get_input_embeddingsc                 C      | j | d S rG   )rJ  r-  r,  r-   r-   r.   r-       z Gemma3Model.set_input_embeddingsc                 C   r+  rG   rJ  rU  r-   r-   r.   rW  "  r.  zGemma3Model.set_decoderc                 C   r'  rG   r  ro   r-   r-   r.   rX  %  r*  zGemma3Model.get_decoderpixel_valuesr   c                 C   s   | j |dj}| |}|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r  )r  r8  r  )rC   r  rp  image_featuresr-   r-   r.   get_image_features(  s   

zGemma3Model.get_image_featuresNrF   r   r   r2   ry  r   r/  rY  r   r   r0  return_dictc                 K   s  |du |duA rt d|dur|n| jj}|dur|n| jj}|dur&|n| jj}|durD| jj| jkrD|| jjk}| }d||< n|}|du rP|  |}|du rl|dur\|	 nd}t
j|||jd  |jd}|dur| |}|du r||  t
j| jjt
j|jdk}n|| jjkd}|||j}t s||  | kr|jddjddd }t d	| d
|jd |jd   d||j|j}|||}t| }ts| j |||||d}|dur|jd dkrt||j| jj|d< tdi |tdi |d}| j d|||||
||d|d	|}t!|j"|
r0|j#nd|j$|j%|dur>|dS ddS )a]  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nr2  r   r    r3  )rK   r   rf   r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.r4  or_mask_functionr6  T)	r   r   r2   r/  r   r   r0  r  r   )r8  r2   r3   r4   r%   r-   )&r9  rQ   r   r0  use_return_dictimage_token_idr  cloner)  r<  r*   r=  rn   r   r  rB   longr   	expand_asrI   r   numelsumrK   masked_scatterr   r>  get_text_configr  rl  r   r   rJ  r$   r8  r2   r3   r4   )rC   rF   r  r   r   r2   ry  r   r/  rY  r   r   r0  r  	lm_kwargsspecial_image_maskllm_input_idsr?  r  image_tokens_in_textr@  rA  r   r-   r-   r.   rH   6  s   .


zGemma3Model.forward)NNNNNNNNNNNNN)r&   r'   r(   _checkpoint_conversion_mappingaccepts_loss_kwargsr!   r@   r)  r-  rW  rX  r*   rN   r  r   r   r   r+   r   r   r5   r   r   r6   r$   rH   rO   r-   r-   rD   r.   r    sj    	

r  zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                "       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zedd Zedd Zedd Ze														d7d ejd!ejd"eej d#eej d$eeeej ef  d%eej d&eej d'eej d(eej d)ee d*ee d+ee d,ee d-eeejf d.eeef fd/d0Z								1		d8 fd2d3	Z e!	d9de"d4ejd"eej d&ejd$ee d#eej d%eej d.e#fd5d6Z$  Z%S ):Gemma3ForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorrH  )z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headrG  rQ   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S rR   )r?   r@   r  rK  rW   rX   rf  rU   r  rH  r&  r^   rD   r-   r.   r@     s   
z'Gemma3ForConditionalGeneration.__init__c                 C   r  rG   )rK  r)  ro   r-   r-   r.   r)    r.  z3Gemma3ForConditionalGeneration.get_input_embeddingsc                 C   r  rG   )rK  r-  r,  r-   r-   r.   r-    r  z3Gemma3ForConditionalGeneration.set_input_embeddingsc                 C   r'  rG   rO  ro   r-   r-   r.   rP    r*  z4Gemma3ForConditionalGeneration.get_output_embeddingsc                 C   r+  rG   rO  rQ  r-   r-   r.   rS    r.  z4Gemma3ForConditionalGeneration.set_output_embeddingsc                 C   r  rG   )rK  rW  rU  r-   r-   r.   rW    r  z*Gemma3ForConditionalGeneration.set_decoderc                 C   r  rG   )rK  rX  ro   r-   r-   r.   rX    r.  z*Gemma3ForConditionalGeneration.get_decoderc                 C   s   | j |S rG   )rK  r  )rC   r  r-   r-   r.   r    rN  z1Gemma3ForConditionalGeneration.get_image_featuresc                 C   rL  rG   )rK  rJ  ro   r-   r-   r.   rJ       z-Gemma3ForConditionalGeneration.language_modelc                 C   rL  rG   )rK  r  ro   r-   r-   r.   r    r  z+Gemma3ForConditionalGeneration.vision_towerc                 C   rL  rG   )rK  r  ro   r-   r-   r.   r    r  z4Gemma3ForConditionalGeneration.multi_modal_projectorNr   rF   r  r   r   r2   ry  r   r/  rY  r   r   r0  r  rZ  r   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||||d|}|d }t|trCt| dn|}| |dd|ddf }d}|	dur|	 }|dddddf }|	dddf }|dur|dd|j
d  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)rF   r  ry  r   r   r2   r/  r   rY  r   r0  r  r   r   .rf   r    )r0   r1   r2   r3   r4   r%   r-   )rQ   r   r0  r  rK  r   rL   r[  rH  rM   rn   rI   r   r   rW   CrossEntropyLossr   rf  r  r/   r2   r3   r4   r%   )rC   rF   r  r   r   r2   ry  r   r/  rY  r   r   r0  r  rZ  r  r   r3   r_  r1   r0   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsrm   r-   r-   r.   rH     sd   @$
z&Gemma3ForConditionalGeneration.forwardTc                    s>   t  j|f||||||	|
|d|}|d dkr||d< |S )N)r2   r/  r   r   r   r   rZ  ry  r   r  )r?   prepare_inputs_for_generation)rC   rF   r2   r/  r   r   r  r   ry  r   rZ  rY  r   model_inputsrD   r-   r.   r  v  s"   
z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationr5  c           	      K   sR   |   |||||d}|d ur"|jd dkr"t||j| j|d< tdi |S )Nr4  r    r  r-   )r  rn   r  rI   r   rl  r   )	rQ   r5  r   r   r2   r   ry  r   rA  r-   r-   r.   r     s   	z8Gemma3ForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNNNNr   )
NNNNNNNTNNrG   )&r&   r'   r(   r  r`  r!   r@   r)  r-  rP  rS  rW  rX  r  propertyrJ  r  r  r   r*   r   r+   r   rN   r   r5   r   r   rL   r6   r/   rH   r  staticmethodr
   r>  r   rO   r-   r-   rD   r.   r    s    


	

 $	r  )r   r  rF  r  r  r  )r   NN)Qr!  collections.abcr   dataclassesr   typingr   r   r*   torch.nnrW   activationsr   cache_utilsr   r	   configuration_utilsr
   
generationr   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   autor   configuration_gemma3r!   r"   
get_loggerr&   r:  r$   r/   r   r7   ModulerP   r`   rq   r   r   rN   rL   r   rM   r6   r   r   r   r   r  rF  r  r  r  r  __all__r-   r-   r-   r.   <module>   s   
"


#PB% w $ 7 s