o
    eiؾ                    @   s$  d dl Z d dlmZmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlm
  mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z< ee.G dd de!Z=ee.ddG dd de Z>ee.ddG dd  d e,Z?G d!d" d"e
j@ZAG d#d$ d$e
j@ZBG d%d& d&e
j@ZCG d'd( d(e
j@ZDG d)d* d*e
j@ZEG d+d, d,e
j@ZFG d-d. d.e
j@ZGG d/d0 d0e
j@ZHG d1d2 d2e
j@ZIG d3d4 d4e
j@ZJG d5d6 d6e(ZKG d7d8 d8e
jLZMG d9d: d:e
j@ZNG d;d< d<e
j@ZOG d=d> d>e
j@ZPd?d@ ZQdAejRdBeSdCejRfdDdEZT	F		dndGe
j@dHejRdIejRdJejRdKejRdB dLeUdMeUdB dNeUdB dCeVejRejRf fdOdPZWdodQejRdRejRdSejRdTeSfdUdVZXeeXG dWdX dXe
j@ZYG dYdZ dZeZZe.G d[d\ d\e(Z[G d]d^ d^e
j@Z\e.d_dG d`da dae[Z]e.dbdG dcdd dde[eZ^G dedf dfe
j@Z_e.dgdG dhdi die[Z`e.djdG dkdl dle[eZag dmZbdS )p    N)CallableSequence)	dataclass)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigc                   @   s$   e Zd ZU dZdZejdB ed< dS )Gemma3nAudioEncoderModelOutputzz
    audio_mel_mask (`torch.FloatTensor`, *optional*):
        A torch.BoolTensor of shape `(batch_size, num_frames)`
    Naudio_mel_mask)__name__
__module____qualname____doc__r(   torch
BoolTensor__annotations__ r0   r0   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr'   6   s   
 r'   zL
    Base class for Gemma3n outputs, with hidden states and attentions.
    custom_introc                   @   s6   e Zd ZU dZdZejdB ed< dZejdB ed< dS )Gemma3nModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)	r)   r*   r+   r,   r5   r-   FloatTensorr/   r6   r0   r0   r0   r1   r4   A   s   
 r4   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dZejdB ed	< dS )
Gemma3nCausalLMOutputWithPastaF  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr5   r6   )r)   r*   r+   r,   r9   r-   r7   r/   r:   r;   r	   r<   tupler=   r5   r6   r0   r0   r0   r1   r8   [   s   
 r8   c                       sR   e Zd Zddededef fddZdd	 Zd
ej	dej	fddZ
dd Z  ZS )Gemma3nRMSNormư>Tdimeps
with_scalec                    sL   t    || _|| _| jrtt|| _d S | j	dt
ddd d S )Nweight      ?F
persistent)super__init__rB   rC   nn	Parameterr-   onesrD   register_buffertensor)selfrA   rB   rC   	__class__r0   r1   rI      s   
zGemma3nRMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr    T)keepdim)r-   sqrtpowmeanrB   )rO   xr0   r0   r1   _norm   s   $zGemma3nRMSNorm._normrW   returnc                 C   s"   |  | | j  }||S N)rX   floatrD   type_as)rO   rW   outputr0   r0   r1   forward   s   
zGemma3nRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r>   rD   shaperB   rO   r0   r0   r1   
extra_repr   s   zGemma3nRMSNorm.extra_repr)r@   T)r)   r*   r+   intr[   boolrI   rX   r-   Tensorr^   ra   __classcell__r0   r0   rP   r1   r?   ~   s
    
r?   c                       s   e Zd Zdef fddZdejdejdejfddZd	ejd
e	de	de	de	de	de	dejfddZ
dejdejdejfddZ  ZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                    s   t    || _| jj| _| jj| _| j| j | _td| jj	d | _
| jj| _tj| j| j| j dd| _d}d}| jd }tt|t| t|d d }|tt||   }| jd| dddd	 d S )
Nr   r"   FbiasrE        @r    inv_timescalesrF   )rH   rI   rg   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrJ   Linearpos_projmathlogr[   r-   exparangerM   	unsqueeze)rO   rg   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrk   rP   r0   r1   rI      s$   




$
z.Gemma3nAudioRelativePositionEmbedding.__init__positiondtyperY   c                 C   sN   |  d}|| jj|jtjd }tjt|t	|gdd}|
|S )NrR   devicer   rA   )r[   r|   rk   tor   r-   float32catsincostype)rO   r   r   scaled_timetiming_signalr0   r0   r1   _get_timing_signal_1d_pos   s   
z?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_posterm_bd_before_shift
batch_sizerm   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                 C   sx   |d | }d|f}	t j||	}
|
|||||d  f}|ddddddd|| f }||||||f}|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        r"   r   N)rJ   
functionalpadreshape)rO   r   r   rm   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shiftedr0   r0   r1   _relative_shift   s(   
$
	z5Gemma3nAudioRelativePositionEmbedding._relative_shiftquerieskeysc              	   C   s"  |j \}}}}}|j \}}}	}}tj| j| j d d|jdd}
|
j d }| j|
|jd}| 	|}|
d|| j| jd}|ddddd}|ddddd}t||}|ddddd}|ddd}|
|||| |}t||}|
|||||}| ||||||	|}|| S )	Nr"   rR   r   r   r   r   r       )r_   r-   r{   rs   ru   r   r|   r   r   rw   r   rm   rp   squeezepermutematmulr   )rO   r   r   r   r   r   rm   rp   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   r0   r0   r1   r^      sJ   

		
z-Gemma3nAudioRelativePositionEmbedding.forward)r)   r*   r+   r#   rI   r-   rd   r   r   rb   r   r^   re   r0   r0   rP   r1   rf      s*    	
$=rf   c                       s   e Zd Zdef fddZdd Zdejdeded	ejfd
dZ	dejd	ejfddZ
dejd	ejfddZdejdejd	ejfddZ  ZS )Gemma3nAudioAttentionrg   c                    sb  t    || _| jj| _| jj| _| j| j | _| jj| _| jj	| _
td| jjd | _| jj| _| j| j | j
 | _t|| _tt| jf| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _| jd }dtjjtd }| jd||   ! dd	 | " }| jd
|dd	 | jdt| j# dd	 d S )Nr   r"   Frh         rE           q_scalerF   local_causal_valid_masksoftcap)$rH   rI   rg   rl   rm   rn   rp   conf_attention_chunk_size
chunk_sizert   max_future_horizonrq   rr   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizerf   relative_position_embeddingrJ   rK   r-   zerosper_dim_scalerv   q_projk_projv_projr   softplusrN   rM   clonedetachcreate_local_causal_valid_maskr[   )rO   rg   r   r_softplus_0r   rP   r0   r1   rI   D  s2   








zGemma3nAudioAttention.__init__c                 C   sv   t jt j| j| jft jdddj}t jt j| j| jft jd| j| j d}t j| j| jft jd}|| | }|S )Nr   r   )diagonal)	r-   trilrL   r   r   rc   Tr   r   )rO   lower_causal_maskupper_causal_maskr   r0   r0   r1   r   f  s   
z4Gemma3nAudioAttention.create_local_causal_valid_maskrW   pad_left	pad_rightrY   c           	      C   sL   |j ^}}}|||g|R }|||g|R }tj|||gdd}|S )Nr"   r   )r_   	new_zerosr-   r   )	rO   rW   r   r   batchr   
tail_shapeleftrightr0   r0   r1   	_pad_dim1s  s
   zGemma3nAudioAttention._pad_dim1r<   c                 C   sx   |j }|dd \}}|| j d | j }|| j |  }dkr'| |d|}||| jf|dd  }|| }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr    r"   r   )r_   r   r   r   
contiguous)rO   r<   r_   bt
num_blockspadding_lenpermute_dimsr0   r0   r1   _convert_to_blockz  s   z'Gemma3nAudioAttention._convert_to_blockc                 C   sl   | j }| j| j d }| |||}| j}| j}|jd||d}|jdkr2|jdkr2tj|ddd}|	 S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r"   )	dimensionsizestepr    r   rR   )sourcedestination)
r   r   r   r   r   unfoldndimr-   movedimr   )rO   r<   r   r   	frame_len
frame_step
x_unfoldedr0   r0   r1   _extract_block_context  s   z,Gemma3nAudioAttention._extract_block_contextmaskc           "   
   C   s  g |j d d | j| jR }| || }| || }| || }tj	j
| j}ddd| jf}||}	|| j |	 }|j d d \}
}| |}| |}| |}|j d }| }| |}|jdkr|j d |j d  | jkr||
|| j}|j |
|| jfkrtd|j  d|
 d| d| j d		|dd
}| jddd}t|||j}| ||}| j|j}|| }t|}|| }t||t|jj}tj	j
j |dtj!dj|jd}|j \}}}}}|j d }|"dddddd||}|"dddddd||}t#||} | |||||"ddddd}!|!|
|| j$ | j| jf}!|!d d d |f }!|!S )NrR   r"   r    r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   rA   r   r   )%r_   rm   rp   r   r   r   r   r   r-   rJ   r   r   r   viewr   r   r   r   r   
ValueErrorr|   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rO   r<   r   	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer:   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorsr0   r0   r1   r^     s    






	

 
 zGemma3nAudioAttention.forward)r)   r*   r+   r#   rI   r   r-   rd   rb   r   r   r   r.   r^   re   r0   r0   rP   r1   r   C  s    "$0r   c                       sL   e Zd ZdZ	ddedee def fddZdej	d	ej	fd
dZ
  ZS )Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    MbP?num_channelsfeature_dimsrB   c                    sT   t    || _t|| _|| _tt	|| _
ttddt| j d | _d S )Nr    r"   )rH   rI   r  r>   r  rB   rJ   rK   r-   rL   rD   rangelenreduction_axes)rO   r  r  rB   rP   r0   r1   rI   8  s   

"z(Gemma3nAudioCumulativeGroupNorm.__init__r<   rY   c                 C   sL  | j | jf }|jdd |krtd|jdd  d| |j}tj}||}tj||d}tj	|| j
dd}tj|dd	}tj	|| j
dd}	tj|	dd	}
tj|
d
d}|| }|| d}tj	|| j
dd}tj|dd	}|| }|| t|| j  }| j|}dg| d  | jg }||| }|| }||S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r    NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   TrA   rS   r"   r   rE   )r   )r  r  r_   r   r   r-   r   r   	ones_likesumr  cumsumclamprU   rsqrtrB   rD   rA   r   )rO   r<   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputr0   r0   r1   r^   J  s6   	

z'Gemma3nAudioCumulativeGroupNorm.forward)r  )r)   r*   r+   r,   rb   r   r[   rI   r-   rd   r^   re   r0   r0   rP   r1   r  '  s    r  c                       sX   e Zd ZdZ	ddedededeeeeef f fddZd	ej	d
ej	fddZ
  ZS )Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    r   r   r   r   rg   idxinput_freq_dimmanual_paddingc                    s   t    || _|| _|dkrdn| jj|d  }| jj| }| jj| \}}| jj| \}	}
tj||||f|	|
fddd| _	|| jd  | jd  }|| |
 d }t
||f| jjd| _t | _d S )Nr   r"   )r   r   F)in_channelsout_channelskernel_sizestridepaddingri   )r  r  rB   )rH   rI   rg   r8  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerJ   Conv2dconvr  sscp_conv_group_norm_epsnormReLU
activation)rO   rg   r6  r7  r8  r9  r:  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrP   r0   r1   rI     s2   
z"Gemma3nAudioSSCPConvBlock.__init__audio_encodingsrY   c                 C   sf   t j|| jddd| jjj}| |}|dddd }| 	|}|dddd }| 
|S )Nconstantr   )modevaluer   r    r   r"   )Fr   r8  r   rB  rD   r   r   r   rD  rF  )rO   rM  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normedr0   r0   r1   r^     s   


z!Gemma3nAudioSSCPConvBlock.forward)r5  )r)   r*   r+   r,   r#   rb   r>   rI   r-   rd   r^   re   r0   r0   rP   r1   r4    s    +r4  c                       8   e Zd Zdef fddZdejdejfddZ  ZS )#Gemma3nAudioSubSampleConvProjectionrg   c                    s  t    || _|j}g }g }tdD ]:}|j| \}}|j| \}}	d}
|d }d}d}|||
|f}|| || | }|| |	 d }|| |}qtd|j||d d| _	td|d ||d d| _
|jd }|d }|| | _tj| j| jjdd| _d S )Nr    r   r"   )r6  r7  rg   r8  rR   Frh   )rH   rI   rg   input_feat_sizer  r?  r@  appendr4  conv_0conv_1r>  input_proj_in_featuresrJ   rv   rn   input_proj_linear)rO   rg   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsirG  rH  rI  rJ  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplerK  f_out_after_convfinal_c_outfinal_f_outrP   r0   r1   rI     sL   




z,Gemma3nAudioSubSampleConvProjection.__init__rM  rY   c                 C   s`   | d}| |}| |}|j\}}}}|dddd }||||| }	| |	}
|
S )Nr"   r   r    r   )r|   r[  r\  r_   r   r   r   r^  )rO   rM  audio_encodings_reshapedrW   r   c_outt_outf_out
x_permutedoutput_flattenedr]   r0   r0   r1   r^     s   



z+Gemma3nAudioSubSampleConvProjection.forward	r)   r*   r+   r#   rI   r-   rd   r^   re   r0   r0   rP   r1   rX    s    9rX  c                       >   e Zd Zdef fddZdejdejdejfddZ  Z	S )	Gemma3nAudioConformerAttentionrg   c                    sv   t    || _| jj| _| jdt| jjdd t	| jj| _
t|| _tj| j| jjdd| _t	| jj| _d S )Ngradient_clippingFrF   rh   )rH   rI   rg   rn   post_in_featuresrM   r-   rN   rt  r?   pre_attn_normr   attnrJ   rv   post	post_normrO   rg   rP   r0   r1   rI   #  s   


z'Gemma3nAudioConformerAttention.__init__rM  r(   rY   c                 C   sz   |}t || j | j}| |}| ||}|j\}}}}	|||||	 }
| |
}t || j | j}|| | S rZ   )	r-   r  rt  rv  rw  r_   r   rx  ry  )rO   rM  r(   audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rm   rp   rk  r0   r0   r1   r^   -  s   

z&Gemma3nAudioConformerAttention.forward
r)   r*   r+   r#   rI   r-   rd   r.   r^   re   r0   r0   rP   r1   rs  "  s    $
rs  c                       rW  ) Gemma3nAudioConformerFeedForwardrg   c                    s   t    || _| jdt| jjdd t| jj| _	t
j| jj| jjd dd| _t
j| jjd | jjdd| _t| jj| _| jj| _d S )Nrt  FrF   r   rh   )rH   rI   rg   rM   r-   rN   rt  r?   rn   pre_layer_normrJ   rv   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scalerz  rP   r0   r1   rI   ?  s   
z)Gemma3nAudioConformerFeedForward.__init__rM  rY   c                 C   sn   |}t || j | j}| |}| |}tj|}| |}t || j | j}| 	|}||| j
  S rZ   )r-   r  rt  r  r  rJ   r   silur  r  r  )rO   rM  residualr0   r0   r1   r^   K  s   



z(Gemma3nAudioConformerFeedForward.forwardrq  r0   r0   rP   r1   r  >  s    r  c                       rW  ) Gemma3nAudioConformerLightConv1drg   c              	      s   t    || _t| jj| jjd| _tj| jj| jjd dd| _	tj
| jj| jj| jjdd| jjdd| _| jdt| jjdd	 t| jj| jjd| _tj| jj| jjdd| _| jjd | _d S )
NrB   r    Frh   r"   r   )r9  r:  r;  r<  r=  groupsri   rt  rF   )rH   rI   rg   r?   rn   rms_norm_epsr  rJ   rv   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drM   r-   rN   rt  	conv_norm
linear_endcausal_paddingrz  rP   r0   r1   rI   X  s"   
	z)Gemma3nAudioConformerLightConv1d.__init__rM  rY   c                 C   s   |}|  |}| |}tjjj|dd}|ddd}t|| j	df}| 
|}|ddd}t|| j | j}| |}tj|}| |}|| }|S )NrR   r   r   r    r"   )r  r  r-   rJ   r   glur   rQ  r   r  r  r  rt  r  r  r  )rO   rM  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr]   r0   r0   r1   r^   m  s   




z(Gemma3nAudioConformerLightConv1d.forwardrq  r0   r0   rP   r1   r  W  s    r  c                       rr  )	Gemma3nAudioConformerBlockrg   c                    sl   t    || _t| j| _t| j| _t| j| _t| j| _	| j
dt| jjdd t| jj| _d S )Nrt  FrF   )rH   rI   rg   r  ffw_layer_startrs  	attentionr  lconv1dffw_layer_endrM   r-   rN   rt  r?   rn   rD  rz  rP   r0   r1   rI     s   
z#Gemma3nAudioConformerBlock.__init__rM  r(   rY   c                 C   sh   |  |}| ||}| }||d|j }| |}| |}t|| j	 | j	}| 
|}|S )NrR   )r  r  r|   r   r   r  r  r-   r  rt  rD  )rO   rM  r(   validity_mask_for_lconvaudio_encodings_for_lconv_inputr]   r0   r0   r1   r^     s   



z"Gemma3nAudioConformerBlock.forwardr~  r0   r0   rP   r1   r    s    $r  c                       sf   e Zd ZU dZeed< dZdZdef fddZe	e
dejdejdee d	eeB fd
dZ  ZS )Gemma3nAudioEncoderzx
    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
    rg   	audio_melaudioc                    sH   t     | _t | _t fddt jD | _	| 
  d S )Nc                    s   g | ]}t  qS r0   )r  .0r   rg   r0   r1   
<listcomp>  s    z0Gemma3nAudioEncoder.__init__.<locals>.<listcomp>)rH   rI   rg   rX  subsample_conv_projectionrJ   
ModuleListr  conf_num_hidden_layers	conformer	post_initrz  rP   r  r1   rI     s   
zGemma3nAudioEncoder.__init__r(   kwargsrY   c                 K   s^  |  |}|jd }d}tt| jjD ]}|| jj| d 9 }qtj||jd| }tj	||jd d d}|j
dkrN|j
dkrN|d|jd d}n |j
|j
krn|jd dkrn|jd dkrn||jd krn|d}t|d|}	| jD ]}
|
||	}qx| jjdkr|dddd| jjf }|	dddd| jjf }	||	dd}t||	dS )	a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        r"   r   r   )rq   rR   Nr   )last_hidden_stater(   )r  r_   r  r  rg   r@  r-   r{   r   r  r   r|   expandgatherr  conf_reduction_factormasked_fillr'   )rO   r  r(   r  rM  t_subtime_stride_productstride_pair_idxindicescurrent_maskblockr0   r0   r1   r^     s2   



zGemma3nAudioEncoder.forward)r)   r*   r+   r,   r#   r/   main_input_nameinput_modalitiesrI   r   r   r-   rd   r.   r   r   r>   r'   r^   re   r0   r0   rP   r1   r    s"   
 
r  c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    rE   num_embeddingsembedding_dimpadding_idxembed_scalec                    s0   t  ||| || _| jdt|dd d S )Nr  FrF   )rH   rI   scalar_embed_scalerM   r-   rN   )rO   r  r  r  r  rP   r0   r1   rI     s   z'Gemma3nTextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S rZ   )rH   r^   r  r   rD   r   rO   r  rP   r0   r1   r^     s   z&Gemma3nTextScaledWordEmbedding.forward)rE   )r)   r*   r+   r,   rb   r[   rI   r-   rd   r^   re   r0   r0   rP   r1   r    s     r  c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	Gemma3nTextLaurelBlockz Learned Augmented Residual Layerrg   c                    s^   t    || _tj| jj| jjdd| _tj| jj| jjdd| _t	| jj| jj
d| _d S )NFrh   r  )rH   rI   rg   rJ   rv   rn   laurel_ranklinear_leftlinear_rightr?   r  post_laurel_normrz  rP   r0   r1   rI     s
   
zGemma3nTextLaurelBlock.__init__r<   rY   c                 C   s&   |  |}| |}| |}|| S rZ   )r  r  r  )rO   r<   laurel_hidden_statesnormed_laurel_hidden_statesr0   r0   r1   r^     s   


zGemma3nTextLaurelBlock.forward)
r)   r*   r+   r,   r%   rI   r-   rd   r^   re   r0   r0   rP   r1   r    s    r  c                       sT   e Zd Zddedef fddZdejdejfdd	Zd
ejdejfddZ	  Z
S )Gemma3nTextMLPr   rg   	layer_idxc                    s   t    || _|j| _|j| | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _|j| | _d S NFrh   )rH   rI   rg   rn   intermediate_sizerJ   rv   	gate_projup_proj	down_projr   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrO   rg   r  rP   r0   r1   rI     s   
zGemma3nTextMLP.__init__r<   rY   c                 C   sD   |  |}| jdkr| |}| |}| |}| || }|S )Nr   )r  r  _gaussian_topkr  r  r  )rO   r<   r  activationsr  r  r0   r0   r1   r^     s   




zGemma3nTextMLP.forwardinputsc                 C   sz   t j| jt j|jd}t jjdd}||}|	|j
}t j|ddd}t j|dddd}|||  }tj|| S )	Nr   r   r   r"   rR   Tr  F)rA   rS   unbiased)r-   rN   r  r   r   distributionsnormalNormalicdfr   r   rV   stdrJ   r   relu)rO   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xr0   r0   r1   r  #  s   
zGemma3nTextMLP._gaussian_topk)r   )r)   r*   r+   r%   rb   rI   r-   rd   r^   r  re   r0   r0   rP   r1   r    s    	r  c                       s   e Zd ZdZdef fddZdejdejfddZd	ejdejfd
dZ	dejdejdejfddZ
dejdejfddZdejdejfddZ  ZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    rg   c                    s   t    || _tt| jj| _tj	| jj
| jj
dd| _tj	| jj
| jj
d dd| _tj	| jj| jj
dd| _t| jj| jjd| _| jdt| jjd dd d S )NFrh   r    r  router_input_scale      rF   )rH   rI   rg   rJ   rK   r-   r   rn   correct_output_scalerv   altup_num_inputscorrection_coefsprediction_coefsmodality_routerr?   r  router_normrM   rN   rz  rP   r0   r1   rI   @  s   
"zGemma3nTextAltUp.__init__rW   rY   c                 C   s.   |  || j }| |}t| |S rZ   )r  r  r  r-   r   r[   r\   )rO   rW   router_inputsroutedr0   r0   r1   compute_router_modalitiesJ  s   
z*Gemma3nTextAltUp.compute_router_modalitiesr<   c                 C   s   |  || jj }| jr | jjdur | jjj| jj | jj | |j	g |j
dd | jj| jjR  dddd}t|dddd|}|dddd}||7 }| |S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrR   r   r"   r   r    )r  rg   altup_active_idxtrainingaltup_coef_clipr  rD   dataclamp_r   r_   r  r   r-   r   r   r\   )rO   r<   
modalities	all_coefspredictionsr0   r0   r1   predictO  s$   
zGemma3nTextAltUp.predictr  	activatedc                 C   s   |  |}||| jj  }|| jjddd}| jr:| jjdur:| jj	| jj | jj}t
jjj||ddd }n| |d }|dddd}t
||}||7 }| |S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        r"   Nrh   rE   r    r   rR   )r  rg   r  repeatr  r  r  r  rD   r  r-   rJ   r   linearr   r|   mulr   r\   )rO   r  r  r  
innovationrD   r  	correctedr0   r0   r1   correctk  s   
zGemma3nTextAltUp.correctr  c                 C   s   | | j| j  |S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )r\   r  rO   r  r0   r0   r1   r^     s   zGemma3nTextAltUp.forwardc                 C   s
   |  |S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)r^   r   r0   r0   r1   scale_corrected_output  s   
z'Gemma3nTextAltUp.scale_corrected_output)r)   r*   r+   r,   r%   rI   r-   rd   r  r  r  r^   r  re   r0   r0   rP   r1   r  4  s    
r  c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrR   r    r   )r_   r-   r   )rW   x1x2r0   r0   r1   rotate_half  s   r  r<   n_reprY   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)r_   r  r   )r<   r  r   num_key_value_headsslenrp   r0   r0   r1   	repeat_kv  s
   0r  r   modulequerykeyrP  attention_maskdropoutscalingr   c                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d ur:|| }tjj	|dtj
d|j}tjj||| jd}t||
}|dd }||fS )Nr   r    r   rR   r   )pr  r"   )rp   r  num_key_value_groupsr-   r   	transposer   rJ   r   r   r   r   r   r  r  r   )r	  r
  r  rP  r  r  r  r   r  r   r   attn_weightsattn_outputr0   r0   r1   eager_attention_forward  s    

r  rW   r   r   unsqueeze_dimc                 C   s(   | |}| |}| | t| |  S )a\  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r|   r  )rW   r   r   r  r0   r0   r1   apply_rotary_pos_emb  s   

r  c                       s   e Zd ZdZdedef fddZ				ddejdejd	ejdB d
e	dB dej
dB dee deejejdB eej dB f fddZ  ZS )Gemma3nTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrg   r  c                    s  t    t|dr|j| nd | _|| _|| _t|d|j|j	 | _
|j	|j | _d| _| jj| _d| _tj|j|j	| j
 |jd| _tj|j|j| j
 |jd| _tj|j|j| j
 |jd| _tj|j	| j
 |j|jd| _| jdkry|jnd | _| jdk| _t|j
|jd| _t|j
|jd| _t|j
|jdd	| _| jj| jj }||  kod
kn  | _ |jd | }| j rt!|d |d d d "|j|  | _#d| _$d S d | _#|t!|d |d d d "|j|  k| _$d S )Nlayer_typesrp   rE   Trh   sliding_attention)rA   rB   F)rA   rB   rC   r   r"   rR   )%rH   rI   hasattrr  
layer_typerg   r  getattrrn   num_attention_headsrp   r  r  r  attention_dropout	is_causalrJ   rv   attention_biasr   r   r   o_projsliding_window
is_slidingr?   r  q_normk_normv_normnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerr  indexkv_shared_layer_indexstore_full_length_kv)rO   rg   r  first_kv_shared_layer_idxprev_layersrP   r0   r1   rI     sH   

(
zGemma3nTextAttention.__init__Nr<   position_embeddingsr  r;   cache_positionr  rY   c                 K   s  |j d d }g |d| jjR }|\}	}
| ||}| |}t||	|
dd}|dd}| jrM|d urM|j	| j
 \}}||j}||j}n.| ||}| |}t||	|
dd}|dd}| ||}| |}|dd}|d ur|
|	|| jd}| js|||| j|\}}| jrt|dsi |_	||f|j	| j< t| jjt}|| ||||f| jr| jnd| j| jd|\}}|jg |dR   }| |}||fS )	NrR   r    )r  r"   )r   r   r0  r"  shared_layersr   )r  r  r"  ) r_   rg   rp   r   r   r$  r  r  r)  r1  r+  r   r   r   r%  r   r&  r"  updater  r,  r  r   get_interface_attn_implementationr  r  r  r  r   r   r!  )rO   r<   r/  r  r;   r0  r  input_shapehidden_shaper   r   r   r   r   cache_kwargsattention_interfacer  r  r0   r0   r1   r^     sf   	




	

zGemma3nTextAttention.forwardNNNN)r)   r*   r+   r,   r%   rb   rI   r-   rd   r	   
LongTensorr   r   r>   r^   re   r0   r0   rP   r1   r    s,    /r  c                       s   e Zd Zdedef fddZ						ddejdejdejd	ejdB d
ejdB de	dB dejdB de
e deejeejejf dB f fddZ  ZS )Gemma3nTextDecoderLayerrg   r  c                    s   t    || _|j| _|| _|j| | _t||| _t	||d| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _|j| _t|j | _t|| _t|| _tj| j| jdd| _tj| j| jdd| _t| j|jd| _d S )N)r  r  Frh   )rH   rI   rg   rn   r  r  attention_typer  	self_attnr  mlpr?   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr   r  r  r  altupr  laurelrJ   rv   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  rP   r0   r1   rI   [  s$   


z Gemma3nTextDecoderLayer.__init__Nr<   r/  per_layer_inputr  position_idsr;   r0  r  rY   c              	   K   s  | j |}	|	| jj }
| |
}| |}| jd||||||d|\}}| |}|
| }|| t	d }| 
|}| |}| |}|| }| j |	|}|| jj  }| jjre| j |}| |}| |}t||}| |}| |}|dd   |7  < |S )N)r<   r  rJ  r/  r;   r0  r    r"   r0   )rD  r  rg   r  r?  rE  r=  r@  rx   rT   rA  r>  rB  r  r   altup_correct_scaler  rF  r  r-   multiplyrG  rH  )rO   r<   r/  rI  r  rJ  r;   r0  r  r  active_predictionactive_prediction_normedlaurel_outputrw  r   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionr0   r0   r1   r^   q  s@   



	






zGemma3nTextDecoderLayer.forward)NNNNNN)r)   r*   r+   r%   rb   rI   r-   rd   r:  r	   r   r   r>   r7   r^   re   r0   r0   rP   r1   r;  Z  s6    	
r;  c                       sd   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdZe  fdd	Z  ZS )
Gemma3nPreTrainedModelrg   modelTr;  r;   )r<   r=   )imagetextr  c                    s0  t  | t|trt|j nt|trIt|j	 |j
d }dtjjtd }t|j||  t|j|j t|j|  nt|trWt|j|j nt|trnt|j t|j| jjd  nt|trd\}}|j d }t!"t#|t#| t$|d d }|t%t&||   }t|j'|# (d(d nZt|t)rt|j*| jd  t|j+dt!,d	  n>t|t-r|j.D ]4}	|j/}
|j0|	 d
krt1|j0|	  }
|
|j|	d\}}tt2||	 d| tt2||	 d| qt3|drt|j4| jj4 d S d S )Nr   rE   r   r  )rE   rj   r    r"   r          @defaultr  	_inv_freq_original_inv_freqrt  )5rH   _init_weights
isinstancer  initones_rD   r   zeros_r   rp   r-   rJ   r   r   rN   copy_r   	constant_r   r   r   r   r  r  r  r  r  r  rg   rn   rf   ro   rx   ry   r[   rq   rz   r{   rk   r|   Gemma3nTextModelper_layer_projection_scaleper_layer_input_scalerT   Gemma3nRotaryEmbeddingr  compute_default_rope_parameters	rope_typer   r  r  rt  )rO   r	  r   r   r}   r~   r   r   rk   r  rope_init_fncurr_inv_freqr   rP   r0   r1   ra    sJ   






 

z$Gemma3nPreTrainedModel._init_weights)r)   r*   r+   r$   r/   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr;  r  _can_record_outputsr  r-   no_gradra  re   r0   r0   rP   r1   rX    s"   
 rX  c                       s   e Zd ZU ejed< ddef fddZe				ddedB de	d de
dB d	edB d
edef f
ddZe edddZ  ZS )rk  inv_freqNrg   c                    s   t    |j| _|j| _|| _tt|j| _i | _	| jD ]P}| jj
| }|d u r+q|d | j	|< | j}| j	| dkrCt| j	|  }|| j||d\}}| j| d|dd | j| d| dd t| | d| qd S )	Nrm  r]  r^  r_  FrF   r`  _attention_scaling)rH   rI   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrg   listsetr  rm  rope_parametersrl  r   rM   r   setattr)rO   rg   r   r  rope_paramsrn  ro  curr_attention_scalingrP   r0   r1   rI     s&   

zGemma3nRotaryEmbedding.__init__r   ztorch.deviceseq_lenr  rY   ztorch.Tensorc                 C   s^   | j | d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarp   NrE   r   r    r   r   )	r  r  rn   r  r-   r{   int64r   r[   )rg   r   r  r  baserA   attention_factorr{  r0   r0   r1   rl    s   &z6Gemma3nRotaryEmbedding.compute_default_rope_parametersc                 C   s  t | | d}t | | d}|d d d d f  |jd dd|j}|d d d d d f  }t|jjtrE|jjdkrE|jjnd}t	|dd	) | |  
dd
}	tj|	|	fdd}
|
 | }|
 | }W d    n1 syw   Y  |j|jd|j|jdfS )Nr_  r|  r   rR   r"   mpscpuF)device_typeenabledr    r   r   )r  r[   r  r_   r   r   rb  r   strr   r  r-   r   r   r   r   )rO   rW   rJ  r  r{  attention_scalinginv_freq_expandedposition_ids_expandedr  freqsembr   r   r0   r0   r1   r^   !  s   .&zGemma3nRotaryEmbedding.forwardNNr9  rZ   )r)   r*   r+   r-   rd   r/   r%   rI   staticmethodr   rb   r  r>   r[   rl  rz  r   r^   re   r0   r0   rP   r1   rk    s,   
 

#rk  zBThe base Gemma 3n language model without a language modeling head.c                       s   e Zd ZU eed< dZdef fddZeedde									dde
jdB d	e
jdB d
e
jdB de
jdB dedB de
jdB dedB de
jdB dee defddZde
jde
jfddZ	dde
jd	e
jdB de
jfddZ  ZS )rh  rg   )r[  c                    sl  t     j_ j_t j jjjjd d_t	
 fddt jD _t j jd_t _d_ j_ j_t j j j j jd d_t	jj j j dd_t j jd_t	
fddtd	jjD _t	
fd
dtd	jjD _jdtjd dd jdt tddd !  d S )N      ?)r  c                    s   g | ]}t  |qS r0   )r;  )r  r  r  r0   r1   r  C  s    z-Gemma3nTextModel.__init__.<locals>.<listcomp>r  Frh   c                        g | ]}t j j jd dqS Frh   rJ   rv   rn   r  r`   r0   r1   r  ]       r"   c                    r  r  r  r  r`   r0   r1   r  a  r  ri  r   rF   rj  r\  )"rH   rI   pad_token_idr  
vocab_sizer  rn   rg   embed_tokensrJ   r  r  r'  layersr?   r  rD  rk  
rotary_embgradient_checkpointingrC  vocab_size_per_layer_inputembed_tokens_per_layerrv   per_layer_model_projectionper_layer_projection_normr  altup_projectionsaltup_unembed_projectionsrM   r-   rN   r   r  rz  rP   )rg   rO   r1   rI   9  sF   


zGemma3nTextModel.__init__F)tie_last_hidden_statesNr  per_layer_inputsr  rJ  r;   inputs_embeds	use_cacher0  r  rY   c	              	   K   s  |du |duA rt d|dur| |}| |}| ||}|r,|du r,t| jd}|du rG|dur8| nd}
tj|j	d |j
d|
 }|du rP|d}t| }tsp| j|||||d}tdi |tdi |d}|}tj|d	 d
ddd }td}|g}td| jjD ]6}| j|d  |}|j|j|j
d}tj|d	 d
dd}tt|||j
}|| | }|| qtj|dd}i }| jjD ]}| |||||< q| jd| jj D ](}||j }|dddd|j ddf }||||j |f||||d|	}qtj|d d	 d
ddd }|d g}td| jjD ]9}| j!|d  || }|j|j|j
d}tj|d	 d
dd}tt|||j
}|| | }|| q)t|}tj|dd}| "|}t#||dS )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedsr  r   r"   r   )rg   r  r  r0  r;   rJ  )full_attentionr  r    rR   Tr  r  gh㈵>r  r   )r  rJ  r;   r0  )r  r;   r0   )$r   r  get_per_layer_inputsproject_per_layer_inputsr
   rg   get_seq_lengthr-   r{   r_   r   r|   rb  dictr   r   rV   rN   r  r  r  r   r   rT   maximumrZ  stackr  r  r  r'  r<  r  r  rD  r   )rO   r  r  r  rJ  r;   r  r  r0  r  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0target_magnitudeepsilon_tensortemp_hidden_statesrb  
altup_projcurrent_hidden_statenew_magnituder<   r/  r  decoder_layercausal_maskrI  altup_unemb_projr0   r0   r1   r^   j  s   








zGemma3nTextModel.forwardc                 C   s&   |  |jg |j| jj| jR  S rZ   )r  r   r_   rg   r'  rC  r  r0   r0   r1   r    s   z%Gemma3nTextModel.get_per_layer_inputsc                 C   s   |  |}|| jj|j|jd9 }|jg |jd d | jj| j	R  }| 
|}|d u r0|S |j|jkrC|dd | jjd d f }|| | jj|j|jd S )Nr  rR   .)r  ri  r   r   r   r   r_   rg   r'  rC  r  rj  )rO   r  r  rG  r0   r0   r1   r    s&   

z)Gemma3nTextModel.project_per_layer_inputs)NNNNNNNNrZ   )r)   r*   r+   r%   r/   r  rI   r   r   r   r-   r:  rd   r	   r7   rc   r   r   r   r^   r  r  re   r0   r0   rP   r1   rh  4  sZ   
 1	
n
rh  z?The base Gemma 3n language model with a language modeling head.c                       s   e Zd ZU ddiZddiZddgdgfiZeed< dd	iZdef fd
dZ	e
e									ddejdB dejdB dejdB dedB dejdB dejdB dedB dejdB deejB dee defddZ  ZS )Gemma3nForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr<   r:   rg   zmodel.language_modelrY  c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r  )
rH   rI   rh  rY  r  rJ   rv   rn   r  r  rz  rP   r0   r1   rI     s
   
zGemma3nForCausalLM.__init__Nr   r  r  rJ  r;   r  labelsr  r0  logits_to_keepr  rY   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }| jjdurE|| jj }t	|}|| jj }d}|durW| j
||| jfi |
}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)r  r  rJ  r;   r  r  r0  N)r9   r:   r;   r<   r=   r0   )rY  r  rb  rb   slicer  rg   final_logit_softcappingr-   r   loss_functionr  r   r;   r<   r=   )rO   r  r  rJ  r;   r  r  r  r0  r  r  outputsr<   slice_indicesr:   r9   r0   r0   r1   r^     s8   !
zGemma3nForCausalLM.forward)	NNNNNNNNr   )r)   r*   r+   _tied_weights_keys_tp_plan_pp_planr%   r/   _checkpoint_conversion_mappingrI   r   r   r-   r:  rd   r	   r7   rc   rb   r   r   r   r^   re   r0   r0   rP   r1   r    sR   
 		
r  c                       sX   e Zd ZdZdeeB def fddZ		ddej	dB dej
dB d	ej
fd
dZ  ZS )Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                    s   t    |j| _|j| _|j| _|j| _|j| _t	
| j| j| _t| j| jd| _t| j| jd| _t	j| j| jdd| _t| j| jdd| _d S )Nr  Frh   )rB   rC   )rH   rI   rn   multimodal_hidden_sizer  rB   vocab_offsetr  text_hidden_sizerJ   	Embedding	embeddingr?   hard_embedding_normsoft_embedding_normrv   embedding_projectionembedding_post_projection_norm)rO   r  r  rP   r0   r1   rI   T  s   
z"Gemma3nMultimodalEmbedder.__init__Nr  r  rY   c                 C   sZ   |du |duA rt d|dur| |}n| || j }| |}| |}| |S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r   r  r  r  r  r  r  )rO   r  r  emb_normhard_embemb_norm_projr0   r0   r1   r^   g  s   


z!Gemma3nMultimodalEmbedder.forwardr  )r)   r*   r+   r,   r#   r&   r%   rI   r-   r:  rd   r^   re   r0   r0   rP   r1   r  Q  s     r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                "       s  e Zd Zi ZdZdef fddZdd Zdd Ze	e
d	d
dejdee deeB fddZ				d(dejdB dejdB dejdB dejdB fddZe															d)dejdB dejdB dejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dejdB dedB d edB d!edB d"ee def d#d$Ze	e
d%d
dejdejdee deeB fd&d'Z  ZS )*Gemma3nModelFrg   c                    s~   t  | tj|jd| _|jj| _tj|jd}|| _|jj	| _	t|j
| _t|j|j| _t|j
|j| _|   d S )Nr  )rH   rI   r!   from_configvision_configvision_towerr  r  language_modelr  audio_configaudio_towerr  embed_visionembed_audior  )rO   rg   r  rP   r0   r1   rI     s   

zGemma3nModel.__init__c                 C   
   | j  S rZ   )r  get_input_embeddingsr`   r0   r0   r1   r       
z!Gemma3nModel.get_input_embeddingsc                 C      | j | d S rZ   )r  set_input_embeddingsrO   rP  r0   r0   r1   r       z!Gemma3nModel.set_input_embeddingszOProjects the last hidden state from the vision model into language model space.r2   pixel_valuesr  rY   c                 K   sj   | j d	|ddd|}|j}||jd | jjj| jjddd}|| jjjd 9 }| j	|d|_
|S )
NFT)r  
do_poolingreturn_dictr   r    r"   r  r  r0   )r  r  r   r_   rg   r  rn   vision_soft_tokens_per_imager   r  pooler_output)rO   r  r  vision_outputsr  r0   r0   r1   get_image_features  s   
zGemma3nModel.get_image_featuresNr  r  image_featuresaudio_featuresc           	      C   s>  |du r1||   tj| jjtj|jdk}|d}||   tj| jjtj|jdkd}n|| jjk}|| jjk}|	 }|
d||j}|durlt||  | kd| d|jd |jd    |	 }|
d||j}|durt||  | kd| d|jd |jd    ||fS )	z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  rR   z6Image features and image tokens do not match, tokens: z, features: r   r"   z6Audio features and audio tokens do not match, tokens: )r  r-   rN   rg   image_token_idlongr   allaudio_token_idr  r|   	expand_asr   r   numelr_   )	rO   r  r  r  r  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokensr0   r0   r1   get_placeholder_mask  s:   
z!Gemma3nModel.get_placeholder_maskinput_featuresr  input_features_maskrJ  r;   token_type_idsr0  r  r  output_attentionsoutput_hidden_states	lm_kwargsc           *      K   s  |du |
duA rt d|dur|n| jj}|dur|n| jj}|dur|  |}
t|dk|| jk }t||t	|}| j
|}t|| jjk|| jjk }| jj| jj d }t||||
j}| j|d}||
j|
j}|d|
}t|||
}
|| jjk}| jj| jj d }t||||
j}| j|d}||
j|
j}|d|
}t|||
}
nd}|dur| j|ddj}||
j|
j}| j||
|d	\}}|
||}
|durP|durP| j|| dd} | j} | j}tj| jd ggtj| jd
}!| j|!d}"t|d|"| } | j\}#}$}%| jj|$ }&|"|#|&|%}'tj | |'fdd} | |
j|
j} | j||
| d\}}(|
|(| }
| j
dd|||||
|||d|	d|})t!|)j"|rm|)j#nd|)j$|)j%|dury|nd|dur| dS ddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r"   )r  rR   T)r  )r  r  r  r   )r  r  )r  r  r  rJ  r;   r  r  r  r  r  r0  )r  r;   r<   r=   r5   r6   r0   )&r   rg   r  r  r  r-   r   r  r   
zeros_liker  r  r  r  r  r  r   r   r   r|   r  r  r  r  masked_scatterget_audio_featuresr(   rN   r  r_   audio_soft_tokens_per_imager  r   r4   r  r;   r<   r=   )*rO   r  r  r  r  r  rJ  r;   r  r0  r  r  r  r  r  r	  per_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr  r  r   r  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr   r  r0   r0   r1   r^     s   1

zGemma3nModel.forwardzPProjects the last hidden state from the audio encoder into language model space.c                 K   s0   | j ||fddi|}| j|jd}||_|S )a0  
        input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
            The tensors corresponding to the input audio.
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        r  Tr  )r  r  r  r  )rO   r  r  r  audio_outputsr  r0   r0   r1   r  u  s   zGemma3nModel.get_audio_featuresr9  )NNNNNNNNNNNNNN)r)   r*   r+   r  accepts_loss_kwargsr$   rI   r  r  r   r   r-   r7   r   r   r>   r   r  r:  r  rd   r	   rc   r8   r^   r'   r  re   r0   r0   rP   r1   r    s    
,	
 r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c                %       sN  e Zd Zi ZddiZdef fddZdd Zdd	 Ze	d
e
jdee fddZee																d&de
jdB d
e
jdB de
jdB de
jdB de
jdB de
jdB dedB de
jdB de
jdB de
jdB de
jdB dedB dedB dedB dee
jB dee def"d d!Z										"			#d' fd$d%	Z  ZS )(Gemma3nForConditionalGenerationr  z(model.language_model.embed_tokens.weightrg   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S r  )rH   rI   r  rY  rJ   rv   r  rn   r  r  r  rz  rP   r0   r1   rI     s   
z(Gemma3nForConditionalGeneration.__init__c                 C   r  rZ   )rY  r  r`   r0   r0   r1   r    r  z4Gemma3nForConditionalGeneration.get_input_embeddingsc                 C   r  rZ   )rY  r  r  r0   r0   r1   r    r  z4Gemma3nForConditionalGeneration.set_input_embeddingsr  r  c                 K   s   | j j|fi |S rZ   )rY  r  )rO   r  r  r0   r0   r1   r    s   z2Gemma3nForConditionalGeneration.get_image_featuresNr   r  r  r  r  rJ  r;   r  r0  r  r  r  r  r  r  r	  rY   c                 K   s  |dur|n| j j}|dur|n| j j}| jd	|||||||||	|
||||dd|}|j}t|tr:t| dn|}| |dd|ddf }| j 	 j
 }dura|| }t|}|| }d}|dur| }|dddddf }|dddf }|dur|dd|jd  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}t|||j|j|j|j|jdS )
a  
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r  r  r  r  r  rJ  r;   r  r0  r  r  r  r  r  r  .rR   r"   r   )r9   r:   r;   r<   r=   r5   r6   r0   )rg   r  r  rY  r  rb  rb   r  r  get_text_configr  r-   r   r[   r_   r   r   r   rJ   CrossEntropyLossr   r  r  r8   r;   r<   r=   r5   r6   )rO   r  r  r  r  r  rJ  r;   r  r0  r  r  r  r  r  r  r	  r  r<   r  r:   r  r9   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsr0   r0   r1   r^     sj   E
$
z'Gemma3nForConditionalGeneration.forwardTFc                    sL   t  j|f||||||||
|d	|}|s|s$||d< ||d< |	|d< |S )N)	r;   r  r  rJ  r0  r  r  r  is_first_iterationr  r  r  )rH   prepare_inputs_for_generation)rO   r  r;   r  r0  rJ  r  r  r  r  r  r  r  r  r,  r  model_inputsrP   r0   r1   r-  ,	  s(   z=Gemma3nForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNNNNNr   )NNNNNNNNNTNNF)r)   r*   r+   r  r  r$   rI   r  r  r   r-   r7   r   r   r  r   r:  rd   r	   rc   rb   r8   r^   r-  re   r0   r0   rP   r1   r#    s    	
 r#  )r  r  r#  r  rX  rh  )r   NN)r"   )crx   collections.abcr   r   dataclassesr   typingr   r-   torch.nnrJ   torch.nn.functionalr   rQ   r   rc  r  r   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   autor!   configuration_gemma3nr#   r$   r%   r&   r'   r4   r8   Moduler?   rf   r   r  r4  rX  rs  r  r  r  r  r  r  r  r  r  r  rd   rb   r  r[   r>   r  r  r  r;  rX  rk  rh  r  r  r  r#  __all__r0   r0   r0   r1   <module>   s   	 + emEI+Q&c

""wM;Q JR2   G