o
    پi5                     @   sD  d dl Z d dlmZmZmZ d dlZd dlmZ d dlm  m	Z
 d dlmZmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZ G dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd deZ#dS )    N)OptionalSequenceTuple)Gemma3nAudioConfigPreTrainedModel)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)Gemma3nRMSNorm)
add_prefixmake_layersc                       sZ   e Zd ZdZ	ddedee def fddZ	dd	ej	d
e
ej	 dej	fddZ  ZS )Gemma3nCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    MbP?num_channelsfeature_dimsepsc                    sT   t    || _t|| _|| _tt	|| _
ttddt| j d | _d S )N      )super__init__r   tupler   r   nn	Parametertorchonesweightrangelenreduction_axes)selfr   r   r   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/gemma3n_audio.pyr   $   s   

"z#Gemma3nCumulativeGroupNorm.__init__Nxmaskreturnc                 C   sL  | j | jf }|jdd |krtd|jdd  d| |j}tj}||}tj||d}tj	|| j
dd}tj|dd	}	tj	|| j
dd}
tj|
dd	}tj|d
d}|	| }|| d}tj	|| j
dd}tj|dd	}|| }|| t|| j  }| j|}dg| d  | jg }||| }|| }||S )ap  Applies cumulative group norm, optionally using a mask.

        Args:
          x: Input tensor, shape [B, T, *feature_dims, C].
          mask: Optional boolean mask, shape [B, T]. True indicates a valid
            (non-padded) time step. If None, all time steps are considered valid.

        Returns:
          Normalized tensor with the same shape as x.
        r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) dtypeT)dimkeepdimr   r*         ?)min)r   r   shape
ValueErrorr)   r   float32to	ones_likesumr   cumsumclamppowrsqrtr   r   r*   view)r    r%   r&   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputr#   r#   r$   forward8   s>   

z"Gemma3nCumulativeGroupNorm.forward)r   N)__name__
__module____qualname____doc__intr   floatr   r   Tensorr   rM   __classcell__r#   r#   r!   r$   r      s(    r   c                       s   e Zd Z		ddedee def fddZdej	d	ej
d
ej	fddZdej	dedededededed
ej	fddZdej	dej	d
ej	fddZ  ZS )%Gemma3nAudioRelativePositionEmbeddingN configquant_configprefixc           	         s   t    || _| jj| _| jj| _| j| j | _td| jj	d | _
| jj| _t| j| j| j d|td|d| _d}d}| jd }tt|t| t|d d }|tt||   }| jd	| dddd
 d S )Nr   r   Fpos_projbiasrZ   r[   r-   g     @r   inv_timescales
persistent)r   r   rY   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardr   r   r\   mathlogrT   r   exparangeregister_buffer	unsqueeze)	r    rY   rZ   r[   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr_   r!   r#   r$   r      s<   






z.Gemma3nAudioRelativePositionEmbedding.__init__positionr)   r'   c                 C   s\   |j dksJ | d}|| jj|jtjd }tjt	|t
|gdd}||S )Nr   )devicer)   r,   )ndimrT   rq   r_   r2   rx   r   r1   catsincostype)r    rv   r)   scaled_timetiming_signalr#   r#   r$   _get_timing_signal_1d_pos   s   
z?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_posterm_bd_before_shift
batch_sizerc   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                 C   sv   |d | }d|f}	t ||	}
|
|||||d  f}|ddddddd|| f }||||||f}|S )zPerforms the relative shift.r   r   N)Fpadreshape)r    r   r   rc   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shiftedr#   r#   r$   _relative_shift   s,   
	z5Gemma3nAudioRelativePositionEmbedding._relative_shiftquerieskeysc              	   C   s&  |j \}}}}}|j \}}}	}}tj| j| j d d|jdd}
|
j d }| j|
|jd}| 	|\}}|
d|| j| jd}|ddddd}|ddddd}t||}|ddddd}|ddd}|
|||| |}t||}|
|||||}| ||||||	|}|| S )	Nr   rw   rx   r   r(      r      )r/   r   ro   ri   rk   rx   rq   r   r)   r\   r   rc   rf   squeezepermutematmulr   )r    r   r   r   r   r   rc   rf   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   r#   r#   r$   rM      sX   

z-Gemma3nAudioRelativePositionEmbedding.forwardNrX   )rO   rP   rQ   r   r   r
   strr   r   rU   r)   r   rS   r   rM   rV   r#   r#   r!   r$   rW      sF    &
	
$%rW   c                       s   e Zd ZdZ		ddedee def fddZd	e	j
d
edede	j
fddZd	e	j
de	j
fddZd	e	j
de	j
fddZd	e	j
de	jde	j
fddZ  ZS )Gemma3nAudioAttentionz+Local dot product self-attention for audio.NrX   rY   rZ   r[   c           	   
      s  t    || _| jj| _| jj| _| j| j | _| jj| _| jj	| _
td| jjd | _| jj| _| j| j | j
 | _t||td|d| _tt| jf| _t| j| j| j| jd|td|d| _| jd }d	ttd
 }| jd||    dd tj!tj"| j| jftj#dddj$}tj!tj"| j| jftj#d| j| j
 d}tj"| j| jftj#d}|| | }| jd|dd | jdt| j% dd d S )Nr   r   relative_position_embeddingr[   Fqkv_projr]   g      r-           q_scaler`   r(   )diagonallocal_causal_valid_masksoftcap)&r   r   rY   rb   rc   rd   rf   conf_attention_chunk_size
chunk_sizerj   max_future_horizonrg   rh   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizerW   r   r   r   r   r   zerosper_dim_scaler   r   r   softplustensorrp   clonedetachtrilr   boolTrT   )	r    rY   rZ   r[   r   r_softplus_0lower_causal_maskupper_causal_maskr   r!   r#   r$   r     sl   










zGemma3nAudioAttention.__init__r%   	dim10_val	dim11_valr'   c                 C   sF   dg|j  d }|j d }d| }|||< |||d < t|t|S )Nr   r   r   )ry   r   r   r   )r    r%   r   r   r   dim_idx_from_endstart_idx_for_dimr#   r#   r$   	_pad_dim1]  s   
zGemma3nAudioAttention._pad_dim1c                 C   sx   |j }|dd \}}|| j d | j }|| j |  }dkr'| |d|}||| jf|dd  }|| }|S )z+Turns a sequence to non overlapping blocks.Nr   r   r   )r/   r   r   r   
contiguous)r    r%   r/   bt
num_blockspadding_lenpermute_dimsr#   r#   r$   _convert_to_blockg  s   z'Gemma3nAudioAttention._convert_to_blockc                 C   sl   | j }| j| j d }| |||}| j}| j}|jd||d}|jdkr2|jdkr2tj|ddd}|	 S )z*Extracts temporal context for every block.r   )	dimensionsizestepr   r   rw   )sourcedestination)
r   r   r   r   r   unfoldry   r   movedimr   )r    r%   pad_left	pad_right	frame_len
frame_step
x_unfoldedr#   r#   r$   _extract_block_contextt  s   z,Gemma3nAudioAttention._extract_block_contextr&   c           #      C   s  |  |\}}|jddd\}}}|jg |jd d | j| jR   }|jg |jd d | j| jR   }|jg |jd d | j| jR   }t| j	}ddd| jf}	|
|	}
|| j |
 }|jd d \}}| |}| |}| |}|jd }| }| |}|jdkr|jd |kr|jd |kr|jd |jd  | jkr|||| j}|dd}| jddd}t|||j}| ||}| j|j}|| }t|}|| }t||t|jj}tj|dtjd	j|jd
}|j\}}}}}|jd }|dddddd||}|dddddd||} t || }!|!|||||ddddd}"|"||| j! | j| jf}"|"d d d |f }"|"S )Nr   rw   )chunksr*   r   r   r   r   )r*   r)   r(   )"r   chunkr   r/   rc   rf   r   r   r   r   r9   r   r   r   ry   r   rq   r   r   logical_andr2   rx   r   r   tanhwherefinfor)   r.   softmaxr1   r   bmmr   )#r    r%   r&   qkvr   query_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherelogitssoftcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorsr#   r#   r$   rM     s   










zGemma3nAudioAttention.forwardr   )rO   rP   rQ   rR   r   r   r
   r   r   r   rU   rS   r   r   r   
BoolTensorrM   rV   r#   r#   r!   r$   r     s.    E

$r   c                       sh   e Zd ZdZ			ddedededeeeeef d	ee d
e	f fddZ
dejdejfddZ  ZS )Gemma3nAudioSSCPConvBlockz;A single convolution block for the SubSampleConvProjection.r   r   r   r   NrX   rY   idxinput_freq_dimmanual_paddingrZ   r[   c                    s   t    || _|| _|dkrdn| jj|d  }| jj| }| jj| \}	}
| jj| \}}tj|||	|
f||fddd| _	|| jd  | jd  }||
 | d }t
||f| jjd| _t | _d S )Nr   r   )r   r   F)in_channelsout_channelskernel_sizestridepaddingr^   )r   r   r   )r   r   rY   r  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizer   Conv2dconvr   sscp_conv_group_norm_epsnormReLU
activation)r    rY   r  r  r  rZ   r[   r	  r
  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convr!   r#   r$   r     s.   
		z"Gemma3nAudioSSCPConvBlock.__init__audio_encodingsr'   c                 C   sZ   t j|| jddd}| |}|dddd }| |}|dddd }| |S )Nconstantr   )modevaluer   r   r   r   )r   r   r  r  r   r   r  r  )r    r  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normedr#   r#   r$   rM     s   



z!Gemma3nAudioSSCPConvBlock.forward)r  NrX   )rO   rP   rQ   rR   r   rS   r   r   r
   r   r   r   rU   rM   rV   r#   r#   r!   r$   r    s&    &r  c                       J   e Zd Z		ddedee def fddZdej	d	ej	fd
dZ
  ZS )#Gemma3nAudioSubSampleConvProjectionNrX   rY   rZ   r[   c              	      s"  t    || _|j}g }g }tdD ]:}|j| \}}	|j| \}
}d}|d }d}d}||||f}|| || | }||	 | d }|| |}qtd|j||d |t	d|d| _
td|d ||d |t	d|d| _|jd }|d }|| | _t| j| jjd|t	d	|d
| _d S )Nr   r   r   conv_0)r  r  rY   r  rZ   r[   conv_1rw   Finput_proj_linearr]   )r   r   rY   input_feat_sizer   r  r  appendr  r   r(  r)  r  input_proj_in_featuresr	   rd   r*  )r    rY   rZ   r[   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsir  r  r  r  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tupler  f_out_after_convfinal_c_outfinal_f_outr!   r#   r$   r     sX   



	
z,Gemma3nAudioSubSampleConvProjection.__init__r  r'   c                 C   sd   | d}| |}| |}|j\}}}}|dddd }||||| }	| |	\}
}|
S )Nr   r   r   r   )rq   r(  r)  r/   r   r   r9   r*  )r    r  audio_encodings_reshapedr%   r   c_outt_outf_out
x_permutedoutput_flattenedoutputr   r#   r#   r$   rM   Z  s   


z+Gemma3nAudioSubSampleConvProjection.forwardr   rO   rP   rQ   r   r   r
   r   r   r   rU   rM   rV   r#   r#   r!   r$   r'    s    >r'  c                       P   e Zd Z		ddedee def fddZdej	d	ej
d
ej	fddZ  ZS )Gemma3nAudioConformerAttentionNrX   rY   rZ   r[   c                    s   t    || _| jj| jj }| jj|f| _| jj| _| jdt	| jj
dd t| jj| _t||td|d| _t| j| jjd|td|d| _t| jj| _d S )Ngradient_clippingFr`   attnr   postr]   )r   r   rY   rd   rb   post_in_shapepost_in_featuresrp   r   r   rD  r   pre_attn_normr   r   rE  r	   rF  	post_norm)r    rY   rZ   r[   rf   r!   r#   r$   r   f  s,   

z'Gemma3nAudioConformerAttention.__init__r  audio_mel_maskr'   c                 C   s~   |}t || j | j}| |}| ||}|j\}}}}	|||||	 }
| |
\}}t || j | j}|| | S rN   )	r   r6   rD  rI  rE  r/   r   rF  rJ  )r    r  rK  audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rc   rf   r:  r   r#   r#   r$   rM     s   

z&Gemma3nAudioConformerAttention.forwardr   rO   rP   rQ   r   r   r
   r   r   r   rU   r  rM   rV   r#   r#   r!   r$   rC  e  s"     rC  c                       r&  ) Gemma3nAudioConformerFeedForwardNrX   rY   rZ   r[   c                    s   t    || _| jdt| jjdd t| jj| _	t
| jj| jjd d|td|d| _t| jjd | jjd|td|d| _t| jj| _t| jj| _d S )NrD  Fr`   r   ffw_layer_1r]   ffw_layer_2)r   r   rY   rp   r   r   rD  r   rd   pre_layer_normr   r   rQ  r	   rR  post_layer_normconf_residual_weightpost_layer_scaler    rY   rZ   r[   r!   r#   r$   r     s0   


z)Gemma3nAudioConformerFeedForward.__init__r  r'   c                 C   st   |}t || j | j}| |}| |\}}t|}| |\}}t || j | j}| |}||| j	  S rN   )
r   r6   rD  rS  rQ  r   silurR  rT  rV  )r    r  residualr   r#   r#   r$   rM     s   


z(Gemma3nAudioConformerFeedForward.forwardr   rA  r#   r#   r!   r$   rP    s    !rP  c                       r&  ) Gemma3nAudioConformerLightConv1dNrX   rY   rZ   r[   c              	      s   t    || _t| jj| jjd| _t| jj| jjd d|td|d| _	t
j| jj| jj| jjdd| jjdd| _| jd	t| jjdd
 t| jj| jjd| _t| jj| jjd|td|d| _| jjd | _d S )N)r   r   Flinear_startr]   r   r   )r	  r
  r  r  r  groupsr^   rD  r`   
linear_end)r   r   rY   r   rd   rms_norm_epsrS  r   r   r[  r   Conv1dconf_conv_kernel_sizedepthwise_conv1drp   r   r   rD  	conv_normr	   r]  causal_paddingrW  r!   r#   r$   r     sJ   

	z)Gemma3nAudioConformerLightConv1d.__init__r  r'   c                 C   s   |}|  |}| |\}}tj|dd}|ddd}t|| jdf}| |}|ddd}t	|| j
 | j
}| |}t|}| |\}}|| }|S )Nrw   r,   r   r   r   )rS  r[  r   glur   r   rc  ra  r   r6   rD  rb  rX  r]  )r    r  audio_encodings_residualr   audio_encodings_permutedaudio_encodings_permuted_paddedr@  r#   r#   r$   rM     s$   




z(Gemma3nAudioConformerLightConv1d.forwardr   rA  r#   r#   r!   r$   rZ    s    /rZ  c                       rB  )Gemma3nAudioConformerBlockNrX   rY   rZ   r[   c                    s   t    || _t||td|d| _t||td|d| _t||td|d| _	t||td|d| _
| jdt| jjdd t| jj| _d S )	Nffw_layer_startr   	attentionlconv1dffw_layer_endrD  Fr`   )r   r   rY   rP  r   ri  rC  rj  rZ  rk  rl  rp   r   r   rD  r   rd   r  rW  r!   r#   r$   r     s(   
z#Gemma3nAudioConformerBlock.__init__r  rK  r'   c                 C   sh   |  |}| ||}| }||d|j }| |}| |}t|| j	 | j	}| 
|}|S )Nrw   )ri  rj  rq   r2   r)   rk  rl  r   r6   rD  r  )r    r  rK  validity_mask_for_lconvaudio_encodings_for_lconv_inputr@  r#   r#   r$   rM   5  s   



z"Gemma3nAudioConformerBlock.forwardr   rO  r#   r#   r!   r$   rh    s"    rh  c                       sb   e Zd ZdZeZ		ddedee def fddZ	d	e
jd
e
jdee
je
jf fddZ  ZS )Gemma3nAudioEncoderz>A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037NrX   rY   rZ   r[   c                    sN   t     | _t td|d| _t j fddtd|d| _d S )Nsubsample_conv_projectionr   c                    s   t  |dS )N)rY   rZ   r[   )rh  )r  r[   rY   rZ   r#   r$   <lambda>\  s
    z.Gemma3nAudioEncoder.__init__.<locals>.<lambda>	conformer)	r   r   rY   r'  r   rp  r   conf_num_hidden_layersrs  rW  r!   rq  r$   r   N  s   zGemma3nAudioEncoder.__init__	audio_melrK  r'   c                 C   s$  |  |}|jd }d}tt| jjD ]}|| jj| d 9 }qtj||jd| }tj	||jd d d}|j
dkrN|j
dkrN|d|jd d}n |j
|j
krn|jd dkrn|jd dkrn||jd krn|d}t|d|}|jd |kr|jd |kr|ddd|f }n||jd  }	tj|d|	fdd}t| jD ]	\}
}|||}q| jjdkr|dddd| jjf }|dddd| jjf }|jd |jd kr|jd }|jd }||kr|| }	tj|d|	fdd}n||kr|ddd|f }||dd	}||fS )
a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, mel_bins].
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, reduced_time_frames, hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, reduced_time_frames].
        r   r   r   )rg   rw   NT)r   r   )rp  r/   r   r   rY   r  r   ro   rx   r6   ry   rq   expandgatherr   r   	enumeraters  conf_reduction_factormasked_fill)r    ru  rK  r  t_subtime_stride_productstride_pair_idxindicescurrent_maskpadding_neededr1  block
target_lenmask_current_lenr#   r#   r$   rM   d  sX   







zGemma3nAudioEncoder.forwardr   )rO   rP   rQ   rR   r   config_classr   r
   r   r   r   rU   r  r   rM   rV   r#   r#   r!   r$   ro  I  s&    ro  )$rl   typingr   r   r   r   torch.nnr   torch.nn.functional
functionalr   transformersr   r   sglang.srt.layers.linearr   r   r	   *sglang.srt.layers.quantization.base_configr
    sglang.srt.models.gemma3n_causalr   sglang.srt.utilsr   r   Moduler   rW   r   r  r'  rC  rP  rZ  rh  ro  r#   r#   r#   r$   <module>   s,    w  S4J72J1