o
    	۷iӷ                    @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlm  mZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7 e.8e9Z:ee,ddG dd deZ;ee,ddG dd de*Z<G dd dej=Z>G dd  d ej=Z?G d!d" d"ej=Z@G d#d$ d$ej=ZAG d%d& d&ej=ZBG d'd( d(ej=ZCG d)d* d*ej=ZDG d+d, d,ej=ZEG d-d. d.ej=ZFG d/d0 d0ej=ZGG d1d2 d2e&ZHG d3d4 d4ejIZJG d5d6 d6ej=ZKG d7d8 d8ej=ZLG d9d: d:ej=ZMG d;d< d<ej=ZNd=d> ZOd?e
jPd@eQdAe
jPfdBdCZR	D		dkdEej=dFe
jPdGe
jPdHe
jPdIee
jP dJeSdKeeS dLeeS dAeTe
jPe
jPf fdMdNZU		dldOe
jPdPe
jPdQe
jPdRee
jP dSeQf
dTdUZVG dVdW dWej=ZWG dXdY dYeZXe,G dZd[ d[e&ZYe,d\dG d]d^ d^eYZZe,d_dG d`da daeYeZ[G dbdc dcej=Z\e,dddG dedf dfeYZ]e,dgdG dhdi dieYeZ^g djZ_dS )m    N)CallableSequence)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigzL
    Base class for Gemma3n outputs, with hidden states and attentions.
    )custom_introc                   @   s6   e Zd ZU dZdZeej ed< dZ	eej ed< dS )Gemma3nModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)
__name__
__module____qualname____doc__r&   r   torchFloatTensor__annotations__r'    r/   r/   b/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr%   3   s   
 r%   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeej ed< dZeej ed	< dS )
Gemma3nCausalLMOutputWithPastaF  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr&   r'   )r(   r)   r*   r+   r2   r   r,   r-   r.   r3   r4   r	   r5   tupler6   r&   r'   r/   r/   r/   r0   r1   M   s   
 r1   c                       sR   e Zd Zddededef fddZdd	 Zd
ej	dej	fddZ
dd Z  ZS )Gemma3nRMSNormư>Tdimeps
with_scalec                    sL   t    || _|| _| jrtt|| _d S | j	dt
ddd d S )Nweight      ?F
persistent)super__init__r;   r<   nn	Parameterr,   onesr=   register_buffertensor)selfr:   r;   r<   	__class__r/   r0   rB   q   s   
zGemma3nRMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr   T)keepdim)r,   sqrtpowmeanr;   )rH   xr/   r/   r0   _norm{   s   $zGemma3nRMSNorm._normrP   returnc                 C   s"   |  | | j  }||S N)rQ   floatr=   type_as)rH   rP   outputr/   r/   r0   forward~   s   
zGemma3nRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r7   r=   shaper;   rH   r/   r/   r0   
extra_repr   s   zGemma3nRMSNorm.extra_repr)r9   T)r(   r)   r*   intrT   boolrB   rQ   r,   TensorrW   rZ   __classcell__r/   r/   rI   r0   r8   p   s
    
r8   c                       s   e Zd Zdef fddZdejdejdejfddZd	ejd
e	de	de	de	de	de	dejfddZ
dejdejdejfddZ  ZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                    s   t    || _| jj| _| jj| _| j| j | _td| jj	d | _
| jj| _tj| j| j| j dd| _d}d}| jd }tt|t| t|d d }|tt||   }| jd| dddd	 d S )
Nr   r   Fbiasr>   g     @r   inv_timescalesr?   )rA   rB   r`   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrC   Linearpos_projmathlogrT   r,   exparangerF   	unsqueeze)rH   r`   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrc   rI   r/   r0   rB      s$   




$
z.Gemma3nAudioRelativePositionEmbedding.__init__positiondtyperR   c                 C   sN   |  d}|| jj|jtjd }tjt|t	|gdd}|
|S )NrK   )devicerz   r:   )rT   rt   rc   tor{   r,   float32catsincostype)rH   ry   rz   scaled_timetiming_signalr/   r/   r0   _get_timing_signal_1d_pos   s   
z?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_posterm_bd_before_shift
batch_sizere   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                 C   sx   |d | }d|f}	t j||	}
|
|||||d  f}|ddddddd|| f }||||||f}|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        r   r   N)rC   
functionalpadreshape)rH   r   r   re   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shiftedr/   r/   r0   _relative_shift   s(   
$
	z5Gemma3nAudioRelativePositionEmbedding._relative_shiftquerieskeysc              	   C   s"  |j \}}}}}|j \}}}	}}tj| j| j d d|jdd}
|
j d }| j|
|jd}| 	|}|
d|| j| jd}|ddddd}|ddddd}t||}|ddddd}|ddd}|
|||| |}t||}|
|||||}| ||||||	|}|| S )	Nr   rK   r{   r   rz   r   r      )rX   r,   rs   rk   rm   r{   rt   r   rz   ro   r   re   rh   squeezepermutematmulr   )rH   r   r   r   r   r   re   rh   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   r/   r/   r0   rW      sJ   

		
z-Gemma3nAudioRelativePositionEmbedding.forward)r(   r)   r*   r    rB   r,   r]   rz   r   r[   r   rW   r^   r/   r/   rI   r0   r_      s*    	
$=r_   c                       s   e Zd Zdef fddZdejdededejfdd	Zd
ejdejfddZ	d
ejdejfddZ
d
ejdejdejfddZ  ZS )Gemma3nAudioAttentionr`   c                    s  t    || _| jj| _| jj| _| j| j | _| jj| _| jj	| _
td| jjd | _| jj| _| j| j | j
 | _t|| _tt| jf| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _| jd }dtjjtd }| jd||   ! dd	 tj"tj#| j| jftj$d
ddj%}tj"tj#| j| jftj$d
| j| j
 d}tj#| j| jftj$d
}|| | }| jd|dd	 | jdt| j& dd	 d S )Nr   r   Fra         r>           q_scaler?   r   )diagonallocal_causal_valid_masksoftcap)'rA   rB   r`   rd   re   rf   rh   conf_attention_chunk_size
chunk_sizerl   max_future_horizonri   rj   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizer_   relative_position_embeddingrC   rD   r,   zerosper_dim_scalern   q_projk_projv_projr   softplusrG   rF   clonedetachtrilrE   r\   TrT   )rH   r`   r   r_softplus_0lower_causal_maskupper_causal_maskr   rI   r/   r0   rB   6  sH   









zGemma3nAudioAttention.__init__rP   pad_left	pad_rightrR   c           	      C   sL   |j ^}}}|||g|R }|||g|R }tj|||gdd}|S )Nr   r|   )rX   	new_zerosr,   r   )	rH   rP   r   r   batchr   
tail_shapeleftrightr/   r/   r0   	_pad_dim1a  s
   zGemma3nAudioAttention._pad_dim1r5   c                 C   sx   |j }|dd \}}|| j d | j }|| j |  }dkr'| |d|}||| jf|dd  }|| }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr   r   r   )rX   r   r   r   
contiguous)rH   r5   rX   bt
num_blockspadding_lenpermute_dimsr/   r/   r0   _convert_to_blockh  s   z'Gemma3nAudioAttention._convert_to_blockc                 C   sl   | j }| j| j d }| |||}| j}| j}|jd||d}|jdkr2|jdkr2tj|ddd}|	 S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r   )	dimensionsizestepr   r   rK   )sourcedestination)
r   r   r   r   r   unfoldndimr,   movedimr   )rH   r5   r   r   	frame_len
frame_step
x_unfoldedr/   r/   r0   _extract_block_context~  s   z,Gemma3nAudioAttention._extract_block_contextmaskc           "   
   C   s  g |j d d | j| jR }| || }| || }| || }tj	j
| j}ddd| jf}||}	|| j |	 }|j d d \}
}| |}| |}| |}|j d }| }| |}|jdkr|j d |j d  | jkr||
|| j}|j |
|| jfkrtd|j  d|
 d| d| j d		|dd
}| jddd}t|||j}| ||}| j|j}|| }t|}|| }t||t|jj}tj	j
j |dtj!dj|jd}|j \}}}}}|j d }|"dddddd||}|"dddddd||}t#||} | |||||"ddddd}!|!|
|| j$ | j| jf}!|!d d d |f }!|!S )NrK   r   r   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   r:   rz   r   )%rX   re   rh   r   r   r   r   r   r,   rC   r   r   r   viewr   r   r   r   r   
ValueErrorrt   r   logical_andr}   r{   r   r   tanhwherefinforz   minsoftmaxr~   r   bmmr   )"rH   r5   r   	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer3   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorsr/   r/   r0   rW     s    






	

 
 zGemma3nAudioAttention.forward)r(   r)   r*   r    rB   r,   r]   r[   r   r   r   
BoolTensorrW   r^   r/   r/   rI   r0   r   5  s    +$0r   c                       sL   e Zd ZdZ	ddedee def fddZdej	d	ej	fd
dZ
  ZS )Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    MbP?num_channelsfeature_dimsr;   c                    sT   t    || _t|| _|| _tt	|| _
ttddt| j d | _d S )Nr   r   )rA   rB   r  r7   r  r;   rC   rD   r,   rE   r=   rangelenreduction_axes)rH   r  r  r;   rI   r/   r0   rB   &  s   

"z(Gemma3nAudioCumulativeGroupNorm.__init__r5   rR   c                 C   sL  | j | jf }|jdd |krtd|jdd  d| |j}tj}||}tj||d}tj	|| j
dd}tj|dd	}tj	|| j
dd}	tj|	dd	}
tj|
d
d}|| }|| d}tj	|| j
dd}tj|dd	}|| }|| t|| j  }| j|}dg| d  | jg }||| }|| }||S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   Tr:   rL   r   r|   r>   )r   )r  r  rX   r   rz   r,   r~   r}   	ones_likesumr  cumsumclamprN   rsqrtr;   r=   r:   r   )rH   r5   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputr/   r/   r0   rW   8  s6   	

z'Gemma3nAudioCumulativeGroupNorm.forward)r  )r(   r)   r*   r+   r[   r   rT   rB   r,   r]   rW   r^   r/   r/   rI   r0   r    s    r  c                       sX   e Zd ZdZ	ddedededeeeeef f fddZd	ej	d
ej	fddZ
  ZS )Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    r   r   r   r   r`   idxinput_freq_dimmanual_paddingc                    s   t    || _|| _|dkrdn| jj|d  }| jj| }| jj| \}}| jj| \}	}
tj||||f|	|
fddd| _	|| jd  | jd  }|| |
 d }t
||f| jjd| _t | _d S )Nr   r   )r   r   F)in_channelsout_channelskernel_sizestridepaddingrb   )r  r  r;   )rA   rB   r`   r/  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerC   Conv2dconvr  sscp_conv_group_norm_epsnormReLU
activation)rH   r`   r-  r.  r/  r0  r1  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrI   r/   r0   rB     s2   
z"Gemma3nAudioSSCPConvBlock.__init__audio_encodingsrR   c                 C   sf   t j|| jddd| jjj}| |}|dddd }| 	|}|dddd }| 
|S )Nconstantr   )modevaluer   r   r   r   )Fr   r/  r}   r9  r=   rz   r   r   r;  r=  )rH   rD  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normedr/   r/   r0   rW     s   


z!Gemma3nAudioSSCPConvBlock.forward)r,  )r(   r)   r*   r+   r    r[   r7   rB   r,   r]   rW   r^   r/   r/   rI   r0   r+    s    +r+  c                       8   e Zd Zdef fddZdejdejfddZ  ZS )#Gemma3nAudioSubSampleConvProjectionr`   c                    s  t    || _|j}g }g }tdD ]:}|j| \}}|j| \}}	d}
|d }d}d}|||
|f}|| || | }|| |	 d }|| |}qtd|j||d d| _	td|d ||d d| _
|jd }|d }|| | _tj| j| jjdd| _d S )Nr   r   r   )r-  r.  r`   r/  rK   Fra   )rA   rB   r`   input_feat_sizer  r6  r7  appendr+  conv_0conv_1r5  input_proj_in_featuresrC   rn   rf   input_proj_linear)rH   r`   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsir>  r?  r@  rA  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplerB  f_out_after_convfinal_c_outfinal_f_outrI   r/   r0   rB     sL   




z,Gemma3nAudioSubSampleConvProjection.__init__rD  rR   c                 C   s`   | d}| |}| |}|j\}}}}|dddd }||||| }	| |	}
|
S )Nr   r   r   r   )rt   rR  rS  rX   r   r   r   rU  )rH   rD  audio_encodings_reshapedrP   r   c_outt_outf_out
x_permutedoutput_flattenedrV   r/   r/   r0   rW     s   



z+Gemma3nAudioSubSampleConvProjection.forward	r(   r)   r*   r    rB   r,   r]   rW   r^   r/   r/   rI   r0   rO    s    9rO  c                       >   e Zd Zdef fddZdejdejdejfddZ  Z	S )	Gemma3nAudioConformerAttentionr`   c                    sv   t    || _| jj| _| jdt| jjdd t	| jj| _
t|| _tj| j| jjdd| _t	| jj| _d S )Ngradient_clippingFr?   ra   )rA   rB   r`   rf   post_in_featuresrF   r,   rG   rk  r8   pre_attn_normr   attnrC   rn   post	post_normrH   r`   rI   r/   r0   rB     s   


z'Gemma3nAudioConformerAttention.__init__rD  audio_mel_maskrR   c                 C   sz   |}t || j | j}| |}| ||}|j\}}}}	|||||	 }
| |
}t || j | j}|| | S rS   )	r,   r  rk  rm  rn  rX   r   ro  rp  )rH   rD  rr  audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   re   rh   rb  r/   r/   r0   rW     s   

z&Gemma3nAudioConformerAttention.forward
r(   r)   r*   r    rB   r,   r]   r
  rW   r^   r/   r/   rI   r0   rj    s    $
rj  c                       rN  ) Gemma3nAudioConformerFeedForwardr`   c                    s   t    || _| jdt| jjdd t| jj| _	t
j| jj| jjd dd| _t
j| jjd | jjdd| _t| jj| _t| jj| _d S )Nrk  Fr?   r   ra   )rA   rB   r`   rF   r,   rG   rk  r8   rf   pre_layer_normrC   rn   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scalerq  rI   r/   r0   rB   -  s   
z)Gemma3nAudioConformerFeedForward.__init__rD  rR   c                 C   sn   |}t || j | j}| |}| |}tj|}| |}t || j | j}| 	|}||| j
  S rS   )r,   r  rk  rx  ry  rC   r   silurz  r{  r}  )rH   rD  residualr/   r/   r0   rW   9  s   



z(Gemma3nAudioConformerFeedForward.forwardrh  r/   r/   rI   r0   rw  ,  s    rw  c                       rN  ) Gemma3nAudioConformerLightConv1dr`   c              	      s   t    || _t| jj| jjd| _tj| jj| jjd dd| _	tj
| jj| jj| jjdd| jjdd| _| jdt| jjdd	 t| jj| jjd| _tj| jj| jjdd| _| jjd | _d S )
Nr;   r   Fra   r   r   )r0  r1  r2  r3  r4  groupsrb   rk  r?   )rA   rB   r`   r8   rf   rms_norm_epsrx  rC   rn   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drF   r,   rG   rk  	conv_norm
linear_endcausal_paddingrq  rI   r/   r0   rB   F  s"   
	z)Gemma3nAudioConformerLightConv1d.__init__rD  rR   c                 C   s   |}|  |}| |}tjjj|dd}|ddd}t|| j	df}| 
|}|ddd}t|| j | j}| |}tj|}| |}|| }|S )NrK   r|   r   r   r   )rx  r  r,   rC   r   glur   rH  r   r  r  r  rk  r  r~  r  )rH   rD  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedrV   r/   r/   r0   rW   [  s   




z(Gemma3nAudioConformerLightConv1d.forwardrh  r/   r/   rI   r0   r  E  s    r  c                       ri  )	Gemma3nAudioConformerBlockr`   c                    sl   t    || _t| j| _t| j| _t| j| _t| j| _	| j
dt| jjdd t| jj| _d S )Nrk  Fr?   )rA   rB   r`   rw  ffw_layer_startrj  	attentionr  lconv1dffw_layer_endrF   r,   rG   rk  r8   rf   r;  rq  rI   r/   r0   rB   q  s   
z#Gemma3nAudioConformerBlock.__init__rD  rr  rR   c                 C   sh   |  |}| ||}| }||d|j }| |}| |}t|| j	 | j	}| 
|}|S )NrK   )r  r  rt   r}   rz   r  r  r,   r  rk  r;  )rH   rD  rr  validity_mask_for_lconvaudio_encodings_for_lconv_inputrV   r/   r/   r0   rW   |  s   



z"Gemma3nAudioConformerBlock.forwardrv  r/   r/   rI   r0   r  p  s    $r  c                       sZ   e Zd ZU dZeed< dZdef fddZdej	dej
deej	ej
f fdd	Z  ZS )
Gemma3nAudioEncoderzx
    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
    r`   	audio_melc                    s@   t     | _t | _t fddt jD | _	d S )Nc                    s   g | ]}t  qS r/   )r  .0r   r`   r/   r0   
<listcomp>  s    z0Gemma3nAudioEncoder.__init__.<locals>.<listcomp>)
rA   rB   r`   rO  subsample_conv_projectionrC   
ModuleListr  conf_num_hidden_layers	conformerrq  rI   r  r0   rB     s   

zGemma3nAudioEncoder.__init__rr  rR   c           
      C   sZ  |  |}|jd }d}tt| jjD ]}|| jj| d 9 }qtj||jd| }tj	||jd d d}|j
dkrN|j
dkrN|d|jd d}n |j
|j
krn|jd dkrn|jd dkrn||jd krn|d}t|d|}| jD ]}	|	||}qx| jjdkr|dddd| jjf }|dddd| jjf }||dd}||fS )a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        r   r   r   )ri   rK   Nr   )r  rX   r  r  r`   r7  r,   rs   r{   r  r   rt   expandgatherr  conf_reduction_factormasked_fill)
rH   r  rr  rD  t_subtime_stride_productstride_pair_idxindicescurrent_maskblockr/   r/   r0   rW     s,   



zGemma3nAudioEncoder.forward)r(   r)   r*   r+   r    r.   main_input_namerB   r,   r]   r
  r7   rW   r^   r/   r/   rI   r0   r    s   
 	r  c                	       sH   e Zd ZdZddedededef fddZd	ejf fd
dZ	  Z
S )Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r>   num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr  Fr?   )rA   rB   rF   r,   rG   )rH   r  r  r  r  rI   r/   r0   rB     s   z'Gemma3nTextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S rS   )rA   rW   r  r}   r=   rz   rH   r  rI   r/   r0   rW     s   z&Gemma3nTextScaledWordEmbedding.forward)r>   )r(   r)   r*   r+   r[   rT   rB   r,   r]   rW   r^   r/   r/   rI   r0   r    s     r  c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	Gemma3nTextLaurelBlockz Learned Augmented Residual Layerr`   c                    s^   t    || _tj| jj| jjdd| _tj| jj| jjdd| _t	| jj| jj
d| _d S )NFra   r  )rA   rB   r`   rC   rn   rf   laurel_ranklinear_leftlinear_rightr8   r  post_laurel_normrq  rI   r/   r0   rB     s
   
zGemma3nTextLaurelBlock.__init__r5   rR   c                 C   s&   |  |}| |}| |}|| S rS   )r  r  r  )rH   r5   laurel_hidden_statesnormed_laurel_hidden_statesr/   r/   r0   rW     s   


zGemma3nTextLaurelBlock.forward)
r(   r)   r*   r+   r"   rB   r,   r]   rW   r^   r/   r/   rI   r0   r    s    r  c                       sT   e Zd Zddedef fddZdejdejfdd	Zd
ejdejfddZ	  Z
S )Gemma3nTextMLPr   r`   	layer_idxc                    s   t    || _|j| _|j| | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _|j| | _d S NFra   )rA   rB   r`   rf   intermediate_sizerC   rn   	gate_projup_proj	down_projr   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrH   r`   r  rI   r/   r0   rB     s   
zGemma3nTextMLP.__init__r5   rR   c                 C   sD   |  |}| jdkr| |}| |}| |}| || }|S )Nr   )r  r  _gaussian_topkr  r  r  )rH   r5   r  activationsr  r  r/   r/   r0   rW      s   




zGemma3nTextMLP.forwardinputsc                 C   sz   t j| jt j|jd}t jjdd}||}|	|j
}t j|ddd}t j|dddd}|||  }tj|| S )	Nrz   r{   r   r   rK   Tr  F)r:   rL   unbiased)r,   rG   r  r~   r{   distributionsnormalNormalicdfr   rz   rO   stdrC   r   relu)rH   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xr/   r/   r0   r  	  s   
zGemma3nTextMLP._gaussian_topk)r   )r(   r)   r*   r"   r[   rB   r,   r]   rW   r  r^   r/   r/   rI   r0   r    s    	r  c                       s   e Zd ZdZdef fddZdejdejfddZd	ejdejfd
dZ	dejdejdejfddZ
dejdejfddZdejdejfddZ  ZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    r`   c                    s   t    || _tt| jj| _tj	| jj
| jj
dd| _tj	| jj
| jj
d dd| _tj	| jj| jj
dd| _t| jj| jjd| _| jdt| jjd dd d S )NFra   r   r  router_input_scaleg      r?   )rA   rB   r`   rC   rD   r,   r   rf   correct_output_scalern   altup_num_inputscorrection_coefsprediction_coefsmodality_routerr8   r  router_normrF   rG   rq  rI   r/   r0   rB   &  s   
"zGemma3nTextAltUp.__init__rP   rR   c                 C   s.   |  || j }| |}t| |S rS   )r  r  r  r,   r   rT   rU   )rH   rP   router_inputsroutedr/   r/   r0   compute_router_modalities0  s   
z*Gemma3nTextAltUp.compute_router_modalitiesr5   c                 C   s   |  || jj }| jr | jjdur | jjj| jj | jj | |j	g |j
dd | jj| jjR  dddd}t|dddd|}|dddd}||7 }| |S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrK   r   r   r   r   )r  r`   altup_active_idxtrainingaltup_coef_clipr  r=   dataclamp_r   rX   r  r   r,   r   r   rU   )rH   r5   
modalities	all_coefspredictionsr/   r/   r0   predict5  s$   
zGemma3nTextAltUp.predictr  	activatedc                 C   s   |  |}||| jj  }|| jjddd}| jjdur+| jjj	| jj | jj | |d }|
dddd}t||}||7 }| |S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        r   Nr>   r   r   rK   )r  r`   r  repeatr  r  r  r=   r  r  r   rt   r,   mulr   rU   )rH   r  r  r  
innovationr  	correctedr/   r/   r0   correctQ  s   
zGemma3nTextAltUp.correctr  c                 C   s   | | j| j  |S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )rU   r  rH   r  r/   r/   r0   rW   n  s   zGemma3nTextAltUp.forwardc                 C   s
   |  |S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)rW   r  r/   r/   r0   scale_corrected_outputv  s   
z'Gemma3nTextAltUp.scale_corrected_output)r(   r)   r*   r+   r"   rB   r,   r]   r  r  r  rW   r  r^   r/   r/   rI   r0   r    s    
r  c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	Gemma3nTextRotaryEmbeddinginv_freqNr`   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typer   defaultr  Fr?   )rA   rB   hasattr
isinstancer  dictgetr  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr`   r   rope_init_fnattention_scalingrF   r  original_inv_freq)rH   r`   r{   r  rI   r/   r0   rB   ~  s   
z#Gemma3nTextRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   rK   r   mpscpuF)device_typeenabledr   r|   r   )r  rT   r  rX   r}   r{   r  r   strr,   autocast	transposer   r   r  r   rz   )
rH   rP   position_idsinv_freq_expandedposition_ids_expandedr  freqsembr   r   r/   r/   r0   rW     s   0&z"Gemma3nTextRotaryEmbedding.forwardrS   )r(   r)   r*   r,   r]   r.   r"   rB   no_gradr   rW   r^   r/   r/   rI   r0   r  {  s   
 
r  c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrK   r   r|   )rX   r,   r   )rP   x1x2r/   r/   r0   rotate_half  s   r  r5   n_reprR   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rX   r  r   )r5   r  r   num_key_value_headsslenrh   r/   r/   r0   	repeat_kv  s
   0r  r   modulequerykeyrG  attention_maskdropoutscalingr   c                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d urM|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	Nr   r   r   r   rK   r   )pr  r   )rh   r  num_key_value_groupsr,   r   r  r   rX   rC   r   r   r~   r}   rz   r  r  r   )r  r  r  rG  r  r  r  r   kwargsr   r   attn_weightscausal_maskattn_outputr/   r/   r0   eager_attention_forward  s"   

&r!  rP   r   r   r  unsqueeze_dimc                 C   s(   | |}| |}| | t| |  S )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )rt   r  )rP   r   r   r  r"  r/   r/   r0   apply_rotary_pos_emb  s   

r#  c                       s   e Zd ZdZdedef fddZedddd		
	
ddej	dej	de
ej	 de
e de
ej dee deej	e
ej	 e
eej	  f fddZ  ZS )Gemma3nTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr`   r  c                    s  t    |j| dk| _|| _|| _t|d|j|j | _	|j|j
 | _| jj| _d| _tj|j|j| j	 |jd| _tj|j|j
| j	 |jd| _tj|j|j
| j	 |jd| _tj|j| j	 |j|jd| _| jro|jnd | _t|j	|jd| _t|j	|jd| _t|j	|jdd| _| jj| jj }||  kodkn  | _|jd | }| jrt|d	 |d d d
 |j|  | _ d| _!d S d | _ |t|d	 |d d d
 |j|  k| _!d S )Nsliding_attentionrh   Tra   )r:   r;   F)r:   r;   r<   r   r   rK   )"rA   rB   layer_types
is_slidingr`   r  getattrrf   num_attention_headsrh   r  r  attention_dropout	is_causalrC   rn   attention_biasr   r   r   o_projsliding_windowr8   r  q_normk_normv_normnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerr  indexkv_shared_layer_indexstore_full_length_kv)rH   r`   r  first_kv_shared_layer_idxprev_layersrI   r/   r0   rB     sD   

(
zGemma3nTextAttention.__init__past_key_valuer4   4.58new_nameversionNr5   position_embeddingsr  cache_positionr  rR   c                 K   s  |j d d }g |d| jjR }|\}	}
| ||}| |}t||	|
dd}|dd}| jrM|d urM|j	| j
 \}}||j}||j}n.| ||}| |}t||	|
dd}|dd}| ||}| |}|dd}|d ur|
|	|| jd}| js|||| j|\}}| jrt|dsi |_	||f|j	| j< t}| jjdkrt| jj }|| ||||f| jr| jndd	| jd
|\}}|jg |dR   }| |}||fS )NrK   r   )r"  r   )r   r   r@  r.  shared_layerseagerr   r>   )r  r  r.  )rX   r`   rh   r   r   r/  r#  r  r4  rA  r6  r}   r{   r   r0  r   r1  r.  updater  r7  r  r!  _attn_implementationr   r  r*  r   r   r-  )rH   r5   r?  r  r4   r@  r  input_shapehidden_shaper   r   r   r   r   cache_kwargsattention_interfacer   r  r/   r/   r0   rW      sf   





	

zGemma3nTextAttention.forwardNN)r(   r)   r*   r+   r"   r[   rB   r   r,   r]   r   r	   
LongTensorr   r   r7   rW   r^   r/   r/   rI   r0   r$    s*    *r$  c                       s   e Zd Zdedef fddZedddd							
	
		ddejdejdejdejde	ej de	ej
 de	e de	e de	e de	ej
 deeje	eejejf  f fddZ  ZS )Gemma3nTextDecoderLayerr`   r  c                    s   t    || _|j| _|| _|j| | _t||| _t	||d| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _|j| _t|j | _t|| _t|| _tj| j| jdd| _tj| j| jdd| _t| j|jd| _d S )N)r  r  Fra   )rA   rB   r`   rf   r  r&  attention_typer$  	self_attnr  mlpr8   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr   r  r  r  altupr  laurelrC   rn   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  rI   r/   r0   rB   k  s$   


z Gemma3nTextDecoderLayer.__init__r:  r4   r;  r<  NFr5   position_embeddings_globalposition_embeddings_localper_layer_inputr  r  output_attentions	use_cacher@  rR   c                 K   s@  | j |}|| jj }| |}| |}| jjr|}n|}| jd|||||||	|
d|\}}| |}|| }|| t	
d }| |}| |}| |}|| }| j ||}|| jj  }| jjrp| j |}| |}| |}t||}| |}| |}|dd   |7  < |f}|r||f7 }|S )N)r5   r?  r  r  r4   r\  r]  r@  r   r   r/   )rT  r  r`   r  rO  rU  rM  r'  rP  rp   rM   rQ  rN  rR  r  r   altup_correct_scaler  rV  r  r,   multiplyrW  rX  )rH   r5   rY  rZ  r[  r  r  r4   r\  r]  r@  r  r  active_predictionactive_prediction_normedlaurel_outputr?  rn  self_attn_weights
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionoutputsr/   r/   r0   rW     sP   

	









zGemma3nTextDecoderLayer.forward)NNNFFN)r(   r)   r*   r"   r[   rB   r   r,   r]   r   rJ  r	   r\   r7   r-   rW   r^   r/   r/   rI   r0   rK  j  s@    	
rK  c                       sX   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZ fddZ  ZS )	Gemma3nPreTrainedModelr`    TrK  r4   )r5   r6   c                    s`   t  | t|tr|jjd d S t|tr!|jj	  d S t|t
r.|jj	  d S d S )Nr>   )rA   _init_weightsr  r  r=   r  fill_r   r   zero_r  r  )rH   r  rI   r/   r0   ro    s   


z$Gemma3nPreTrainedModel._init_weights)r(   r)   r*   r!   r.   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrK  r$  _can_record_outputsro  r^   r/   r/   rI   r0   rm    s   
 rm  zBThe base Gemma 3n language model without a language modeling head.c                       s   e Zd ZU eed< def fddZee										ddee	j
 dee	j dee	j dee	j
 d	ee d
ee	j dee dee dee dee	j
 dee defddZde	j
de	jfddZ	dd
e	jdee	j de	jfddZ  ZS )Gemma3nTextModelr`   c                    s  t     j_ j_t j jjjjd d_t	
 fddt jD _t j jd_t d_d_t   j _dd	i _t d_ j_ j_t j j j j jd d_t	jj j j dd
_t j jd_t	
fddtdjj D _!t	
fddtdjj D _"j#dt$%jd dd j#dt$&t$%ddd '  d S )N      ?)r  c                    s   g | ]}t  |qS r/   )rK  )r  r  r  r/   r0   r    s    z-Gemma3nTextModel.__init__.<locals>.<listcomp>r  r  Fr  r  ra   c                        g | ]}t j j jd dqS Fra   rC   rn   rf   r  rY   r/   r0   r         r   c                    r~  r  r  r  rY   r/   r0   r    r  per_layer_projection_scaler   r?   per_layer_input_scaleg       @)(rA   rB   pad_token_idr  
vocab_sizer  rf   r`   embed_tokensrC   r  r  r2  layersr8   r  r;  r  
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetar  rotary_emb_localrS  vocab_size_per_layer_inputembed_tokens_per_layerrn   per_layer_model_projectionper_layer_projection_normr  altup_projectionsaltup_unembed_projectionsrF   r,   rG   r  	post_initrq  rI   )r`   rH   r0   rB     sN   



zGemma3nTextModel.__init__Nr  per_layer_inputsr  r  r4   inputs_embedsr]  r\  output_hidden_statesr@  r  rR   c           !   
   K   s  |dur|n| j j}|	dur|	n| j j}	|dur|n| j j}|du |duA r*td| jr9| jr9|r9td d}|durG| 	|}| 
|}| ||}|r\|du r\| js\t| j d}|
du rx|durh| nd}tj|||jd  |jd}
|du r|
d}t| }ts| j |||
||d	}tdi |tdi |d
}|}| ||}| ||}tj|d dddd }td}|g}td| j jD ]6}| j|d  |}|j|j|jd}tj|d ddd}t t!|||j}|| | }|"| qtj#|dd}|	rdnd}|rdnd}| j$d| j j% D ]?}|	r)||f7 }||j& }|dddd|j'ddf }|||||f||||||
d|}|d }|r]||d f7 }q|	rg||f7 }tj|d d dddd }|d g}td| j jD ]9}| j(|d  || } | j|j|jd}tj|d ddd}t t!|||j}|| | }|"| qt#|}tj|dd}| )|}t*||||dS )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr  r   r   r   )r`   input_embedsr  r@  r4   r  )full_attentionr%  r   rK   Tr  r}  gh㈵>r  r|   r/   )r  r  r4   r\  r]  r@  )last_hidden_stater4   r5   r6   )+r`   r\  r  r]  r   r  r  loggerwarning_oncer  get_per_layer_inputsproject_per_layer_inputsr
   get_seq_lengthr,   rs   rX   r{   rt   r  r  r   r   r  r  rO   rG   r  r  r  r}   rz   rM   maximumrQ  stackr  r2  rL  r  r  r;  r   )!rH   r  r  r  r  r4   r  r]  r\  r  r@  r  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0rY  rZ  target_magnitudeepsilon_tensortemp_hidden_statesrY  
altup_projcurrent_hidden_statenew_magnituder5   all_hidden_statesall_self_attnsdecoder_layerr  r[  layer_outputsaltup_unemb_projr/   r/   r0   rW   !  s   










zGemma3nTextModel.forwardc                 C   s&   |  |jg |j| jj| jR  S rS   )r  r   rX   r`   r2  rS  r  r/   r/   r0   r    s   z%Gemma3nTextModel.get_per_layer_inputsc                 C   s   |  |}|| jj|j|jd9 }|jg |jd d | jj| j	R  }| 
|}|d u r0|S |j|jkrC|dd | jjd d f }|| | jj|j|jd S )Nr  rK   .)r  r  r}   rz   r{   r   rX   r`   r2  rS  r  r  )rH   r  r  rW  r/   r/   r0   r    s&   

z)Gemma3nTextModel.project_per_layer_inputs)
NNNNNNNNNNrS   )r(   r)   r*   r"   r.   rB   r   r   r   r,   rJ  r]   r	   r-   r\   r   r   r   rW   r  r  r^   r/   r/   rI   r0   r|    sd   
 9	
 
r|  z?The base Gemma 3n language model with a language modeling head.c                       s   e Zd ZU dgZddiZddgdgfiZeed< dZddiZ	def fd	d
Z
ee											ddeej deej deej dee deej deej dee dee dee deej deeejf defddZ  ZS )Gemma3nForCausalLMlm_head.weightlm_headcolwise_repr5   r3   r`   modelzmodel.language_modelc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r  )
rA   rB   r|  r  r  rC   rn   rf   r  r  rq  rI   r/   r0   rB     s
   
zGemma3nForCausalLM.__init__Nr   r  r  r  r4   r  labelsr]  r\  r  r@  logits_to_keeprR   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }| j j	dur[|| j j	 }t
|}|| j j	 }d}|durm| j||| jfi |}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```N)	r  r  r  r4   r  r]  r\  r  r@  )r2   r3   r4   r5   r6   r/   )r`   r\  r  r  r  r  r[   slicer  final_logit_softcappingr,   r   loss_functionr  r   r4   r5   r6   )rH   r  r  r  r4   r  r  r]  r\  r  r@  r  r  rl  r5   slice_indicesr3   r2   r/   r/   r0   rW     sB   #

zGemma3nForCausalLM.forward)NNNNNNNNNNr   )r(   r)   r*   _tied_weights_keys_tp_plan_pp_planr"   r.   rr  _checkpoint_conversion_mappingrB   r   r   r   r,   rJ  r]   r	   r-   r\   r   r[   r   rW   r^   r/   r/   rI   r0   r    s\   
 		
r  c                       s\   e Zd ZdZdeeef def fddZ		dde	e
j de	e
j d	e
jfd
dZ  ZS )Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                    s   t    |j| _|j| _|j| _|j| _|j| _t	
| j| j| _t| j| jd| _t| j| jd| _t	j| j| jdd| _t| j| jdd| _d S )Nr  Fra   )r;   r<   )rA   rB   rf   multimodal_hidden_sizer  r;   vocab_offsetr  text_hidden_sizerC   	Embedding	embeddingr8   hard_embedding_normsoft_embedding_normrn   embedding_projectionembedding_post_projection_norm)rH   r  r  rI   r/   r0   rB   <  s   
z"Gemma3nMultimodalEmbedder.__init__Nr  r  rR   c                 C   sZ   |du |duA rt d|dur| |}n| || j }| |}| |}| |S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r   r  r  r  r  r  r  )rH   r  r  emb_normhard_embemb_norm_projr/   r/   r0   rW   O  s   


z!Gemma3nMultimodalEmbedder.forwardrI  )r(   r)   r*   r+   r   r    r#   r"   rB   r   r,   rJ  r]   rW   r^   r/   r/   rI   r0   r  9  s     
r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                        sv  e Zd Zi ZdZdef fddZdd Zdd Zd	d
 Z	dd Z
dejdejfddZ				d'deej deej deej deej fddZe														d(deej deej deej deej deej deej dee deej deej deej deej d ee d!ee d"ee defd#d$Zdejdejdeejejf fd%d&Z  ZS ))Gemma3nModelFr`   c                    s   t  | tj|jd| _|jj| _tj|jd}|| _| j	j
d ur'| j	j
nd| _
|jj| _t|j| _t|j|j| _t|j|j| _|   d S )Nr  rK   )rA   rB   r   from_configvision_configvision_towerr  r  language_modelr`   r  r  audio_configaudio_towerr  embed_visionembed_audior  )rH   r`   r  rI   r/   r0   rB   v  s   

zGemma3nModel.__init__c                 C   
   | j  S rS   )r  get_input_embeddingsrY   r/   r/   r0   r       
z!Gemma3nModel.get_input_embeddingsc                 C      | j | d S rS   )r  set_input_embeddingsrH   rG  r/   r/   r0   r       z!Gemma3nModel.set_input_embeddingsc                 C   s
   || _ d S rS   r  rH   decoderr/   r/   r0   set_decoder  r  zGemma3nModel.set_decoderc                 C   s   | j S rS   r  rY   r/   r/   r0   get_decoder  s   zGemma3nModel.get_decoderpixel_valuesrR   c                 C   sX   | j |dddj}||jd | jjj| jjddd}|| jjjd 9 }| j	|dS )	a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        FT)r  
do_poolingreturn_dictr   r   r   r}  r  )
r  r  r   rX   r`   r  rf   vision_soft_tokens_per_imager   r  )rH   r  vision_outputsr/   r/   r0   get_image_features  s   
zGemma3nModel.get_image_featuresNr  r  image_featuresaudio_featuresc           	      C   sB  |du r1||   tj| jjtj|jdk}|d}||   tj| jjtj|jdkd}n|| jjk}|| jjk}|	 }|
d||j}|durm||  | krmtd| d|jd |jd   |	 }|
d||j}|dur||  | krtd| d|jd |jd   ||fS )	z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  rK   z6Image features and image tokens do not match: tokens: z, features r   r   z6Audio features and image tokens do not match: tokens: )r  r,   rG   r`   image_token_idlongr{   allaudio_token_idr  rt   	expand_asr}   numelr   rX   )	rH   r  r  r  r  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokensr/   r/   r0   get_placeholder_mask  s6   
z!Gemma3nModel.get_placeholder_maskinput_featuresr  input_features_maskr  r4   token_type_idsr@  r  r]  r\  r  c           *      K   s  |du |
duA rt d|dur|n| jj}|dur|n| jj}|dur|  |}
t|dk|| jk }t||t	|}| j
|}t|| jjk|| jjk }| jj| jj d }t||||
j}| j|d}|d|
}t|||
}
|| jjk}| jj| jj d }t||||
j}| j|d}|d|
}t|||
}
nd}|dur| |}||
j|
j}| j||
|d\}}|
||}
|dur7|dur7| || \} }tj| jd ggtj| jd}!| j|!d}"t|d|"| } | j\}#}$}%| jj|$ }&|"|#|&|%}'tj| |'fdd	} | |
j|
j} | j||
| d
\}}(|
|(| }
| j
dd|||||
|||d|	d|})t|)j |rT|)j!nd|)j"|)j#|dur`|nd|durj| dS ddS )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r   )r  rK   )r  r  r  r|   )r  r  T)r  r  r  r  r4   r  r]  r\  r  r  r@  )r  r4   r5   r6   r&   r'   r/   )$r   r`   r\  r  r  r,   r   r  r   
zeros_liker  r  r  r  r  r  r}   r{   rt   r  r  rz   r  masked_scatterget_audio_featuresrG   r  rX   audio_soft_tokens_per_imager  r   r%   r  r4   r5   r6   )*rH   r  r  r  r  r  r  r4   r  r@  r  r  r]  r\  r  	lm_kwargsper_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr  r  r   r  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr  rl  r/   r/   r0   rW     s   /


zGemma3nModel.forwardc                 C   s    |  ||\}}| j|d|fS )a-  
        Projects the last hidden state from the audio encoder into language model space.

        Args:
            input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
               The tensors corresponding to the input audio.
            input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
               The attention mask for the input audio.

        Returns:
            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`).
        r  )r  r  )rH   r  r  audio_outputsr	  r/   r/   r0   r  `  s   zGemma3nModel.get_audio_features)NNNN)NNNNNNNNNNNNNN)r(   r)   r*   r  accepts_loss_kwargsr!   rB   r  r  r  r  r,   r]   r  r   rJ  r-   r  r   r	   r\   r1   rW   r7   r  r^   r/   r/   rI   r0   r  k  s    
*	
 r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c                #       sv  e Zd Zi ZdgZdZdef fddZdd Zdd	 Z	d
d Z
dd Zdd Zedd Zedd Zedd Zee															d/deej deej deej deej deej deej dee deej d eej d!eej d"eej d#ee d$ee d%ee d&eeejf d'ef d(d)Z										*		d0 fd+d,	Zed-d. Z  Z S )1Gemma3nForConditionalGenerationr  r  r`   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S r  )rA   rB   r  r  rC   rn   r  rf   r  r  r  rq  rI   r/   r0   rB   ~  s   
z(Gemma3nForConditionalGeneration.__init__c                 C   r  rS   )r  r  rY   r/   r/   r0   r    r  z4Gemma3nForConditionalGeneration.get_input_embeddingsc                 C   r  rS   )r  r  r  r/   r/   r0   r    r  z4Gemma3nForConditionalGeneration.set_input_embeddingsc                 C   r  rS   )r  r  r  r/   r/   r0   r    r  z+Gemma3nForConditionalGeneration.set_decoderc                 C   r  rS   )r  r  rY   r/   r/   r0   r    r  z+Gemma3nForConditionalGeneration.get_decoderc                 C   s   | j |S rS   )r  r  )rH   r  r/   r/   r0   r    s   z2Gemma3nForConditionalGeneration.get_image_featuresc                 C      | j jS rS   )r  r  rY   r/   r/   r0   r       z.Gemma3nForConditionalGeneration.language_modelc                 C   r  rS   )r  r  rY   r/   r/   r0   r    r  z,Gemma3nForConditionalGeneration.vision_towerc                 C   s   t d)Nz2Use embed_vision instead of multi_modal_projector.)AttributeErrorrY   r/   r/   r0   multi_modal_projector  r  z5Gemma3nForConditionalGeneration.multi_modal_projectorNr   r  r  r  r  r  r  r4   r  r@  r  r  r]  r\  r  r  rR   c                 K   s  |dur|n| j j}|dur|n| j j}| jd	|||||||||	|
||||dd|}|j}t|tr:t| dn|}| |dd|ddf }| j 	 j
 }dura|| }t|}|| }d}|dur| }|dddddf }|dddf }|dur|dd|jd  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}t|||j|j|j|j|jdS )
al  
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r  r  r  r  r  r  r4   r  r@  r  r  r]  r\  r  r  .rK   r   r   )r2   r3   r4   r5   r6   r&   r'   r/   )r`   r\  r  r  r  r  r[   r  r  get_text_configr  r,   r   rT   rX   r}   r{   r   rC   CrossEntropyLossr   r  r  r1   r4   r5   r6   r&   r'   )rH   r  r  r  r  r  r  r4   r  r@  r  r  r]  r\  r  r  r  rl  r5   r  r3   r  r2   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsr/   r/   r0   rW     sj   D
$
z'Gemma3nForConditionalGeneration.forwardTc                    sN   t  j|f||||||||
d|}|d dkr%||d< ||d< |	|d< |S )N)r4   r  r  r  r@  r]  r  r  r   r  r  r  )rA   prepare_inputs_for_generation)rH   r  r4   r  r@  r  r  r  r  r  r  r]  r  r  r  model_inputsrI   r/   r0   r$  %	  s&   
z=Gemma3nForConditionalGeneration.prepare_inputs_for_generationc                 C   r  rS   )r  r  rY   r/   r/   r0   r  N	  r  z+Gemma3nForConditionalGeneration.audio_tower)NNNNNNNNNNNNNNr   )NNNNNNNNNTNN)!r(   r)   r*   r  r  rr  r!   rB   r  r  r  r  r  propertyr  r  r  r   r   r   r,   rJ  r-   r]   r	   r\   r   r[   r1   rW   r$  r  r^   r/   r/   rI   r0   r  s  s    


	
 )r  )r  r  r  r  rm  r|  )r   NN)Nr   )`r  rp   collections.abcr   r   dataclassesr   typingr   r   r,   torch.nnrC   torch.nn.functionalr   rH  r  r   cache_utilsr	   r
   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   autor   configuration_gemma3nr    r!   r"   r#   
get_loggerr(   r  r%   r1   Moduler8   r_   r   r  r+  rO  rj  rw  r  r  r  r  r  r  r  r  r  r  r]   r[   r  rT   r7   r!  r#  r$  rK  rm  r|  r  r  r  r  __all__r/   r/   r/   r0   <module>   s   
 + amEI+J&a$

'
w^ x\2   [