o
    پi                     @   s  d dl Z d dlZd dlmZmZ d dlZd dlZd dlm	  m
Z d dlmZm	Z	 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ dZG d	d
 d
e	j Z!G dd de j"e	j Z#G dd de#Z$G dd de	j Z%G dd de	j Z&dS )    N)LiteralOptional)Tensornn)CheckpointWrapper)FullyShardedDataParallel)PretrainedConfig)AbsolutePositionalEncoding
ConvModuleFeedForwardMeanVarianceNormLayerMultiHeadedAttentionMultiSequentialNemoConvSubsamplingT5RelativeAttentionLogitBiasadaptive_enc_mask
get_offsetunfold_tensoriK c                       sn   e Zd ZdZ											
	
							
				
	
	ddef fddZ	ddee fddZ  Z	S )ConformerEncoderLayera  ConformerEncoder Layer module.
    for more details see conformer paper:
        https://arxiv.org/abs/2005.08100
    This module implement the Conformer block layer.

    Args:
        d_model: int
            attention dim.
        ext_pw_out_channel: int
            if > 0, ext_pw_out_channel is a dim channel size
             for the last pointwise conv after swish activation.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a
             channel_out of the second conv1d layer.
             otherwise, it equal to 0, the second conv1d layer is skipped.
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
             will be used to compute the hidden channels of the Conv1D.
        n_head: int
            the number of heads for multihead attention module.
        d_ffn: int
            output size of the feed_forward blocks.
        ext_pw_kernel_size: int
            kernel size of the conv pointwise of the conformer.
        kernel_size: int
            kernel size.
        dropout_rate: float
            dropout rate.
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation
            in ConvModule layer of the conformer.
            default False
        activation: str, optional
            activation function name,
            one of ["relu", "swish", "sigmoid"],
            sigmoid activation is only used with "glu_in_fnn=True",
            default "relu".
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
            default 0.
        chunk_size: int, optional
            chunk_size for cnn. default 18
        conv_activation: str, optional
            activation function used in ConvModule part
            of the conformer, default "relu".
        conv_glu_type: str, optional
            activation function used for the glu inside
            the ConvModule part of the conformer.
            default: "sigmoid".
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU.
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        attention_inner_dim: int, optional
            if equal to -1, attention dim for linears k/q/v is
            equal to d_model. otherwise attention_inner_dim is used.
            default -1.
        attention_glu_type: str, optional
            activation function for glu used in the multihead attention,
             default "swish".
        activation_checkpointing: str, optional
            a dictionarry of {"module","interval","offload"}, where
                "module": str
                    accept ["transformer", "attention"] to select
                    which module should do activation checkpointing.
                "interval": int, default 1,
                    interval of applying activation checkpointing,
                    interval = 1 means that we apply checkpointing
                    on every layer (if activation), otherwise,
                    we apply it every x interval.
                "offload": bool, default False,
                    if set to True, we offload activation to cpu and
                    reload it during backward, otherwise,
                    we recalculate activation in backward.
            default "".
        export: bool, optional
            if set to True, it remove the padding from convolutional layers
             and allow the onnx conversion for inference.
              default False.
        use_pt_scaled_dot_product_attention: bool, optional
            if set to True, use pytorch's scaled dot product attention
            implementation in training.
        attn_group_sizes: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attn_group_sizes < attention_heads = Grouped-Query Attention
            attn_group_sizes = attention_heads = Multi-Query Attention
       r                  皙?Frelu   sigmoidTswish attn_group_sizesc                    s   t    t|||	||d| _t|||	|||||d| _t|||||||	|
||||||||d| _t|||	||d| _t	
|| _t	
|| _d S )N)d_modeld_innerdropout_rate
activationbias_in_glu)#use_pt_scaled_dot_product_attention
group_size)export)super__init__r   feed_forward_inr   	self_attnr
   convfeed_forward_outr   	LayerNormlayer_norm_att
layer_norm)selfr#   ext_pw_out_channeldepthwise_seperable_out_channeldepthwise_multipliern_headd_ffnext_pw_kernel_sizekernel_sizer%   causal
batch_normr&   chunk_se
chunk_sizeconv_activationconv_glu_typer'   linear_glu_in_convmattention_inner_dimattention_glu_typeactivation_checkpointingr*   r(   r"   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/phi4mm_audio.pyr,      sZ   

zConformerEncoderLayer.__init__Nrelative_attention_biasc              
   C   sn   |d|  |  }| |}|| j|||||||d }|| | }|d| |  }| |}||||fS )a  ConformerEncoder forward.

        Args:
            x: torch.Tensor
                input feature of shape (batch, max_time_in, size)
            pos_k: torch.Tensor
                positional key embedding.
            mask: torch.Tensor
                mask for x (batch, max_time_in)
            relative_attention_bias: Optional[torch.Tensor]
                bias added to attention logits w.r.t. relative positions
                (1, n_head, time1, time2)
        g      ?rJ   )r-   r2   r.   r/   r0   r3   )r4   xpos_kpos_vmaskrJ   norm_xoutrH   rH   rI   forward   s   
	
zConformerEncoderLayer.forward)r   r   r   r   r   r   r   r   r   FFr   r   r   r   r   TFr   r    r!   FFr   N)
__name__
__module____qualname____doc__intr,   r   r   rR   __classcell__rH   rH   rF   rI   r   .   sB    gSr   c                       s   e Zd ZdZ														
	d!ded f fddZdd Zejdd Z	d"ddZ
dd Zdd Zdd Zdd Zd"ddZdd  Z  ZS )#TransformerEncoderBaseaj  The Base class for Transformer based encoders

    Please set causal = True in streaming model
    Args:
        input_size: int
            input feature dimension.
        chunk_size: int, list(int)
            Number of frames for each chunk
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training
            Some examples for the 2 cases:
            chunk_size = 12
            chunk_size = [6, 8, 12, 24]
        left_chunk: int, list(int)
            Number of chunks used for masking in streaming mode.
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training. When
            chunk_size is a list, left_chunk must be a list with same length.
            Some examples for the 2 cases:
            left_chunk = 6
            left_chunk = [12, 9, 6, 3]
        attention_dim: int, optional
            attention dimension. default 256.
        attention_heads: int, optional
            the number of heads. default 4
        input_layer: str, optional
            input layer type before Conformer,
            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
            default "conv2d"
        cnn_out: int, optional
            the number of CNN channels before Conformer.
            default -1.
        cnn_layer_norm: bool, optional
            layer norm between Conformer and the first CNN.
            default False.
        time_reduction: int, optional
            time reduction factor
            default 4
        dropout_rate: float, optional
            dropout rate. default 0.1
        padding_idx: int, optional
            padding index for input_layer=embed
            default -1
        relative_attention_bias_args: dict, optional
            use more efficient scalar bias-based relative multihead attention
            (Q*K^T + B) implemented in cmb.basics.embedding.
            [T5/ALiBi]RelativeAttentionLogitBias
            usage: relative_attention_bias_args={"type": t5/alibi}
            additional method-specific arguments can be provided (see
            transformer_base.py)
        positional_dropout_rate: float, optional
            dropout rate after positional encoding. default 0.0
        nemo_conv_settings: dict, optional
            A dictionary of settings for NeMo Subsampling.
            default None
        conv2d_extra_padding: str, optional
            Add extra padding in conv2d subsampling layers. Choices are
            (feat, feat_time, none, True).
            if True or feat_time, the extra padding is added into non full
            supraframe utts in batch.
            Default: none
        attention_group_size: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attention_group_size < attention_heads = Grouped-Query
            Attention
            attention_group_size = attention_heads = Multi-Query Attention
    r   r   	nemo_convr   F        Nnoner   conv2d_extra_paddingfeat	feat_timer]   Tc              	      s6  t    || _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _| jdkrSd| j	||ddt dd}|rJ|| dD ]
}||vsIJ dq?tdi || _ntd	| t||| _|rf|d
nd | _| jdkr| j| j dkszJ dt| j| j |dd|ddd| _ntt| jd | _d S )Nr[   dw_stridingr   r   Fsubsamplingsubsampling_factorfeat_infeat_outconv_channels subsampling_conv_chunking_factorr&   	is_causalre   rf   rg   6{i} should be specified outside of the NeMo dictionaryzunknown input_layer: typet5r   'attention_group_size must divide n_headt5_bias_max_distancei  t5_bias_symmetric)max_distance	symmetric
input_sizerH   )r+   r,   rt   input_layerr?   
left_chunkattention_dim	num_headsattention_group_sizetime_reductionnemo_conv_settingsencoder_embedding_configr   ReLUupdater   embed
ValueErrorr	   pos_embgetrelative_attention_bias_typer   relative_attention_bias_layerNotImplementedErrorr   encoder_embedding)r4   rt   r?   rv   rw   attention_headsru   cnn_outcnn_layer_normrz   r%   padding_idxrelative_attention_bias_argspositional_dropout_rater{   r^   ry   r|   default_nemo_conv_settingsirF   rH   rI   r,   S  sj   









zTransformerEncoderBase.__init__c                 C   s   | j dkrb| jdddv }| jdd}|rP|rPt|tr(t|| j  nt	|| j }|| j }t|trF||dk  d7  < |S |dkrN|d7 }|S t|t
rXt	jntj}||| j S dS )	aj  feature_lens: int
        return updated feature lens.

        This used to return a different lambda function for each case that
        computed the right thing.  That does not work within Torchscript.
        If you really need this to be faster, create nn.Module()-s for all
        the cases and return one of them.  Torchscript does support that.
        r[   rd   rb   )rb   stridingstriding_conv1drj   Fr   N)ru   r{   r   
isinstancer   torchceilrz   longmathrX   )r4   feature_lenssubsampling_causal_condrj   lens_changefeature_lens_remainder	ceil_funcrH   rH   rI   compute_lens_change  s,   
	

z*TransformerEncoderBase.compute_lens_changec                 C   s   dS )z'Abstract forward method implementation.NrH   r4   rH   rH   rI   rR     s    zTransformerEncoderBase.forwardc                 C   s   |du r| j }|du r| j}t|tr@ttjdt|dd}|| }t|ts,tdt|t|kr8td|| }||fS |}|}||fS )z>If chunk size is a list, we will randomly select a chunk size.Nr   )r   )lowhighsizez5Since chunk_size is a list, left_chunk must be a listzBThe length of left_chunk must be the same as length of chunk_size.)	r?   rv   r   listrX   r   randintlenr   )r4   r?   rv   chunk_size_indexchunk_size_train_effleft_chunk_train_effrH   rH   rI   _chunk_size_selection  s,   

z,TransformerEncoderBase._chunk_size_selectionc                 C   s0   t |t}t |t}|}|r|j}|r|j}|S rS   )r   r   r   _checkpoint_wrapped_modulemodule)r4   r   is_embed_using_act_chkptis_embed_fsdp_wrappedembed_classrH   rH   rI   _get_embed_class  s   

z'TransformerEncoderBase._get_embed_classc                 C   s2   |  | j}t|tsJ | ||\}}||fS rS   )r   r   r   r   )r4   input_tensormasksr   rH   rH   rI   _forward_embeddings_core  s   z/TransformerEncoderBase._forward_embeddings_corec                 C   s$   d }d }| j d u r| |}||fS rS   )r   r   )r4   r   rM   rN   rH   rH   rI   _position_embedding  s   
z*TransformerEncoderBase._position_embeddingc           	      C   sB   |  ||\}}td||}t|||dd|ddg}|S )Nr   )left_windowr   )r   nparanger   	unsqueezeexpand)	r4   seq_len
batch_sizer?   rv   r   r   chunk_start_idxenc_streaming_maskrH   rH   rI   _streaming_mask  s   z&TransformerEncoderBase._streaming_maskc                 C   s  t | |jd }|dkrtd| d|jd }| ||| j| j}|jr0| }| }|}| 	||\}}|}	|	durI|durI||	@ }
n	|durP|}
n|	}
|durq| ||||}|jre| }|durn||@ }n|}nd}| 
|\}}|du r||||
|fS ||||
||fS )a  Forwarding the inputs through the top embedding layers

        Args:
            xs_pad: torch.Tensor
                input tensor
            masks: torch.Tensor
                input mask
            chunk_size_nc: (optional, default is None) chunk size for
                            non-causal layers
            left_chunk_nc: (optional, default is None) # of left chunks for
                            non-causal layers
        r   r   zEThe sequence length after time reduction is invalid:
                z. Your input feature is too short. Consider
                filtering out the very short sentence from data
                loaderN)rX   r   shaper   r   r?   rv   is_cudacudar   r   )r4   xs_padr   chunk_size_ncleft_chunk_ncr   r   r   r   streaming_maskhs_maskenc_streaming_mask_nc
hs_mask_ncrM   rN   rH   rH   rI   forward_embeddings  sH   


z)TransformerEncoderBase.forward_embeddingsc                 C   s   t | j| jS )a!  Returns offset used when retaining inputs for decoding.

        This is essentially, how many additional frames have to be added to
        the front-end CNN input to ensure it can produce a single output.
        So if the "padding" parameter is 0, typically offset will be > 0.
        )r   ru   rz   r   rH   rH   rI   r   V  s   z!TransformerEncoderBase.get_offset)r   r   r[   r   Fr   r\   r   Nr\   Nr]   r   N)NN)rT   rU   rV   rW   r   r,   r   abcabstractmethodrR   r   r   r   r   r   r   r   rY   rH   rH   rF   rI   rZ   
  s:    MT"

	
@rZ   c                #       s   e Zd ZU dZee ed< ddddddd	d
ddddddddddddd
ddddg dddddddddf#ded f fddZdd Z	dd Z
ejjdd Z  ZS )ConformerEncodera  ConformerEncoder module.
    see original paper for more details:
        https://arxiv.org/abs/2005.08100

    Please set causal = True in streaming model
    Args:
        input_size: int
            input feature dimension.
        chunk_size: int, list(int)
            Number of frames for each chunk
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training
            Some examples for the 2 cases:
            chunk_size = 12
            chunk_size = [6, 8, 12, 24]
        left_chunk: int, list(int)
            Number of chunks used for masking in streaming mode.
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training. When
            chunk_size is a list, left_chunk must be a list with same length.
            Some examples for the 2 cases:
            left_chunk = 6
            left_chunk = [12, 9, 6, 3]
        left_chunk: int
            number of chunks used for masking in streaming mode.
        num_lang: int
            This parameter is used to store the number of languages in the
            lang_dict, only used for multiseed/multilingual models.
            default None.
        attention_dim: int, optional
            attention dimension. default 256.
        attention_heads: int, optional
            the number of heads. default 4
        linear_units:
            the number of units of position-wise feed forward.
            default 2048
        num_block:
            number of Transformer layer. default 6
        dropout_rate: float, optional
            dropout rate. default 0.1
        input_layer: str, optional
            input layer type before Conformer,
            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
            default "conv2d"
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation
            in ConvModule layer of the conformer.
            default False
        cnn_out: int, optional
            the number of CNN channels before Conformer.
            default -1.
        cnn_layer_norm: bool, optional
            layer norm between Conformer and the first CNN.
            default False.
        ext_pw_out_channel: int, optional
            the number of channel for CNN
            before depthwise_seperable_CNN.
            If 0 then use linear. default 0.
        ext_pw_kernel_size: int, optional
            kernel size of N before depthwise_seperable_CNN.
            only work for ext_pw_out_channel > 0.
            default 1
        depthwise_seperable_out_channel: int, optional
            the number of channel for
            depthwise_seperable_CNN.
            default 256.
        depthwise_multiplier: int, optional
            the number of multiplier for
            depthwise_seperable_CNN.
            default 1.
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
            default 0.
        kernel_size: int, optional
            the number of kernels for depthwise_seperable_CNN.
            default 3.
        activation: str, optional
            FeedForward block activation.
            one of ["relu", "swish", "sigmoid"]
            default "relu".
        conv_activation: str, optional
            activation function used in ConvModule part
            of the conformer, default "relu".
        conv_glu_type: str, optional
            activation used use glu in depthwise_seperable_CNN,
            default "sigmoid"
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU. default True
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        attention_glu_type: str
            only work for glu_in_attention !=0
            default "swish".
        export: bool, optional
            if set to True, it remove the padding from convolutional layers
             and allow the onnx conversion for inference.
              default False.
        activation_checkpointing: str, optional
            a dictionarry of {"module","interval","offload"}, where
                "module": str
                    accept ["transformer", "attention"] to select
                    which module should do activation checkpointing.
                "interval": int, default 1,
                    interval of applying activation checkpointing,
                    interval = 1 means that we apply checkpointing
                    on every layer (if activation), otherwise,
                    we apply it every x interval.
                "offload": bool, default False,
                    if set to True, we offload activation to cpu and
                    reload it during backward, otherwise,
                    we recalculate activation in backward.
            default "".
        extra_layer_output_idx: int
            the layer index to be exposed.
        relative_attention_bias_args: dict, optional
            use more efficient scalar bias-based relative multihead attention
            (Q*K^T + B) implemented in cmb.basics.embedding.
            [T5/ALiBi]RelativeAttentionLogitBias
            usage: relative_attention_bias_args={"type": t5/alibi}
            additional method-specific arguments can be provided (see
            transformer_base.py)
        time_reduction: int optional
            time reduction factor
            default 4
        use_pt_scaled_dot_product_attention: whether to use pytorch scaled
            dot product attention in training.
            Default: False
        nemo_conv_settings: dict, optional
            A dictionary of settings for NeMo Subsampling.
            default: None
            usage: nemo_conv_settings=
                {
                    "subsampling":
                    dw_striding/striding/dw_striding_conv1d/striding_conv1d,
                    "conv_channels": int,
                    "subsampling_conv_chunking_factor": int,
                    "is_causal": True/False
                }
        conv2d_extra_padding: str, optional
            Add extra padding in conv2d subsampling layers. Choices are
            (feat, feat_time, none, True)
            Default: none
        replication_pad_for_subsample_embedding:  For batched-streaming
            decoding, use "replication" padding for the cache at start of
            utterance.
            Default: False
        attention_group_size: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attention_group_size < attention_heads = Grouped-Query
            Attention
            attention_group_size = attention_heads = Multi-Query Attention
    extra_multi_layer_output_idxsNr   r   r      r   r[   TFr   r   r   r   r   r   r    r!   r]   r^   r_   c'           '         s   t  j|
||
||| |d|"|#|&d || _|| _| _|$| _| j dks-J d| j | _t 	
fddt	|D  | _
|| _|| _| jdtdd	d
 d S )Nr\   )r%   r   r   r{   r^   ry   r|   r   ro   c                    s   g | ]L}t di d ddddddddd	d
d d	d
dddddddddqS )r#   r5   r6   r7   r8   r9   r:   r;   r%   r<   r=   r&   r>   r?   r@   rA   r'   rB   rD   rE   r*   r(   r"   rH   )r   .0_r&   rE   rw   rD   ry   r   r=   r'   r<   r>   r?   r@   rA   r7   r6   r%   r*   r:   r5   r;   rB   linear_unitsr(   rH   rI   
<listcomp>Q  sd    	
z-ConformerEncoder.__init__.<locals>.<listcomp>dev_typerH   F)
persistent)r+   r,   
num_blocksnum_langr;   'replication_pad_for_subsample_embeddingrx   num_heads_kr   rangeencodersextra_layer_output_idxr   register_bufferr   zeros)'r4   rt   r?   rv   r   rw   r   r   r   r%   ru   r<   r=   r   r   r5   r:   r6   r7   r>   r;   r&   r@   rA   r'   rB   rD   r*   r   r   rE   r   rz   r(   r{   r^   r   ry   r|   rF   r   rI   r,   
  sF   )6zConformerEncoder.__init__c                 C   s   | j r|  |S d S rS   )r   )r4   r   rH   rH   rI   init_relative_attention_biast  s   
z-ConformerEncoder.init_relative_attention_biasc           
      C   s   |j d }|j d }| ||| j| j}||}|d u r|S |d}|}tjd||d|	dd|
dk }	|	
d}	|	|@ }	|	S )Nr   r   )devicer   )r   r   r?   rv   tosumr   r   r   r   r   )
r4   r   r   rO   max_audio_lengthr   r   r   padding_lengthpad_maskrH   rH   rI   calculate_hs_maskx  s$   





z"ConformerEncoder.calculate_hs_maskc                 C   s  |  |}| ||\}}}}}d}|j\}}	}
d}|	|kr{d}|	| dkr-||	|  }nd}|dkrEt|ddd|fdd}||j}t||}|durq|d}t|d|fdd}|	d
 }t||}|d }nd}| ||j|}| |}| jdko|du }|r| ||||^}}nt| jD ]\}}||||||d	\}}}}q|r|jd }||d|}|dkr|ddd| ddf }||fS )
zConformer Forward function

        Args:
            xs_pad: torch.Tensor
                input tensor
            masks: torch.Tensor
                post-embedding input lengths
        Fi  Tr   constantNr   r   rK   )r   r   r   Fpadr   r   r   squeezer   floatboolr   r   r   r   	enumeratereshape)r4   r   r   r   rM   rN   r   unfoldedori_bzr   Dmax_seq_lenchunk_pad_sizeinput_tensor_padsubsampled_pad_mask extra_padded_subsamlped_pad_maskmasks_unfoldrJ   _simplified_pathr   r   layer	embed_dimrH   rH   rI   rR     st   




zConformerEncoder.forward)rT   rU   rV   rW   r   rX   __annotations__r   r,   r   r   r   jitignorerR   rY   rH   rH   rF   rI   r   `  s\   
  ($jr   c                       s^   e Zd ZdZ								dd	ed
ededededededef fddZdddZ  Z	S )WindowQformerzWindow-level Qformer   r      r   r   r\   Twindow_sizenum_queriesr   rw   r   r   r%   normalize_beforec	           	         sh   t    t fddt|D | _ttd| | _	r,tj
 ddnd | _|| _d S )Nc                    s$   g | ]}t j d ddqS )r   T)r#   nheaddim_feedforwarddropoutr&   batch_first
norm_first)r   TransformerDecoderLayerr   rw   r   r%   r   r   rH   rI   r     s    
z*WindowQformer.__init__.<locals>.<listcomp>r   g-q=)eps)r+   r,   r   
ModuleListr   decoders	Parameterr   r   queriesr1   
after_normr   )	r4   r   r   r   rw   r   r   r%   r   rF   r  rI   r,     s   


zWindowQformer.__init__Nc                 C   s  | dd}|jd | j }|dkrt|d| j| fdd}tj|ddddf d| jfd| jfd}|j\}}}||d| j|}| dd	 }||| | jd}| j	|| dd}	| j
D ]
}
|
|	|d|d
}	qb| jdurw| |	}	|dur|| j }|	||d}||fS )zforward decoderr   r   r   r   r   .N)r;   strider   )tgtmemorytgt_maskmemory_mask)	transposer   r   r   r   unfoldview
contiguousr  r   r
  r  )r4   audio_embedrO   	embed_lenpaddingembed_chunkbszr   slenqr   rQ   rH   rH   rI   rR     s0   



zWindowQformer.forward)r   r   r   r   r   r   r\   TrS   )
rT   rU   rV   rW   rX   r   r   r,   rR   rY   rH   rH   rF   rI   r     s8    	"r   c                	       s   e Zd ZdZdeddf fddZdejddfdd	Zd
ej	ddfddZ
		ddejdejdedejfddZ		ddejdejdedejfddZ  ZS )AudioEmbeddingzImage embedding.configreturnNc              	      s  t    || _t|dr|jn|j}d }d| _t|jt	rE|j
dd dkrE|j
dd }|d us4J tdi || _|d }|d }ntd|d usQJ d	|| _|| _|
d
d| _|
dd| _|
ddr~|
di }||d< tdi || _nd | _|
ddr| jd u sJ d|
di }d| j||ddt dd}	|r|	| dD ]
}
|
|vsJ dqtdi |	| _nd | _|
dd}|dkrt||| _ni|dkr7|}d}| js| jrdn| j| _t|| j |g}td|D ]}|t t||g qtj| | _t|| j |g}td|D ]}|t t||g qtj| | _ ntd| d|j!| _!d | _"d | _#d S )Nn_embdnamecascadesr  rw   rt   r!   z(Remember to set values for audio_dim_outfreeze_audio_processorFdownsample_rater   use_qformerqformer_configuse_conv_downsamplez6don't support use qformer and conv downsample togetherr{   rb   r   rc   rk   rl   projection_clslinearmlpr   zprojection_cls = z, not implementedrH   )$r+   r,   r  hasattrr!  hidden_size	layer_idxr   audio_processordictr   r   encoderr   audio_dim_outaudio_dim_inr%  r&  r   qformerr   r}   r~   r   conv_dsLinearaudio_projectionlinear_downsample_rater   extendGELU
Sequentialaudio_projection_for_vision
vocab_sizeinput_embedsaudio_embed_sizes)r4   r  kwargsr.  r3  encoder_confign_melsr(  r{   r   r   r*  dim_projectiondepthlayersr   rF   rH   rI   r,   6  s   








zAudioEmbedding.__init__r?  c                 C   
   || _ d S rS   )r?  )r4   r?  rH   rH   rI   set_audio_embeds     
zAudioEmbedding.set_audio_embedsr@  c                 C   rG  rS   )r@  )r4   r@  rH   rH   rI   set_audio_embed_sizes  rI  z$AudioEmbedding.set_audio_embed_sizesspeechaudio_attention_maskaudio_projection_modec                 C   sB  | j r t  | ||\}}W d   n1 sw   Y  n| ||\}}| jdur6| j|dd\}}| jdurM|durD|d}| j||d\}}| jdkr| \}}}	|| j }
|
dkrqt	
|ddd| j|
 fdd}|d}|||| j |	| j }|dkr| |}|S |dkr| |}|S td| d	)
zl
        arguments:
            input_embeds: audio features (B, T, D)  B: num audios in a sequence
        N)rO   r   r   r   rK  visionzaudio_projection_mode = z not implemented)r%  r   no_gradr2  r5  r6  r   r9  r   r   r   r  r8  r=  r   )r4   r?  rL  rM  audio_featuresr   r   bsr   feat_dimr  audio_set_tensorrH   rH   rI   get_audio_features  sJ   










z!AudioEmbedding.get_audio_featuresrP  c                 C   s   | j |||d}|S )z
        arguments:
            audio_features: audio features (num_audio_tokens, T, D)

        returns:
            audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
        )rL  rM  )rT  )r4   rP  rL  rM  audio_embedsrH   rH   rI   rR     s   zAudioEmbedding.forward)NrK  )rT   rU   rV   rW   r   r,   r   FloatTensorrH  
LongTensorrJ  r   strrT  rR   rY   rH   rH   rF   rI   r  3  s6    h
9r  )'r   r   typingr   r   numpyr   r   torch.nn.functionalr   
functionalr   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   2torch.distributed.fsdp.fully_sharded_data_parallelr   transformersr   sglang.srt.models.phi4mm_utilsr	   r
   r   r   r   r   r   r   r   r   r   _AUDIO_PLACEHOLDER_TOKEN_IDModuler   ABCrZ   r   r   r  rH   rH   rH   rI   <module>   s.   4 ]  X   	M