o
    
۾i                     @   s
  d dl Z d dlZd dlmZmZ d dlZd dlZd dlm	  m
Z d dlmZm	Z	 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ G dd	 d	e	jZ G d
d de j!e	jZ"G dd de"Z#G dd de	jZ$G dd de	jZ%dS )    N)AnyLiteral)Tensornn)CheckpointWrapper)FullyShardedDataParallel)PretrainedConfig)AbsolutePositionalEncoding
ConvModuleFeedForwardMeanVarianceNormLayerMultiHeadedAttentionMultiSequentialNemoConvSubsamplingT5RelativeAttentionLogitBiasadaptive_enc_mask
get_offsetunfold_tensorc                3       s   e Zd ZdZ											
	
							
				
	
	d5dedededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*d+f2 fd,d-Z	+d6d.e	j
d/e	j
d0e	j
d1e	j
d2e
d+B d*ee	j
e	j
e	j
e	j
f fd3d4Z  ZS )7ConformerEncoderLayera  ConformerEncoder Layer module.
    for more details see conformer paper:
        https://arxiv.org/abs/2005.08100
    This module implement the Conformer block layer.

    Args:
        d_model: int
            attention dim.
        ext_pw_out_channel: int
            if > 0, ext_pw_out_channel is a dim channel size
             for the last pointwise conv after swish activation.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a
             channel_out of the second conv1d layer.
             otherwise, it equals to 0, the second conv1d layer is skipped.
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
             will be used to compute the hidden channels of the Conv1D.
        n_head: int
            the number of heads for multihead attention module.
        d_ffn: int
            output size of the feed_forward blocks.
        ext_pw_kernel_size: int
            kernel size of the conv pointwise of the conformer.
        kernel_size: int
            kernel size.
        dropout_rate: float
            dropout rate.
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation
            in ConvModule layer of the conformer.
            default False
        activation: str, optional
            activation function name,
            one of ["relu", "swish", "sigmoid"],
            sigmoid activation is only used with "glu_in_fnn=True",
            default "relu".
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
            default 0.
        chunk_size: int, optional
            chunk_size for cnn. default 18
        conv_activation: str, optional
            activation function used in ConvModule part
            of the conformer, default "relu".
        conv_glu_type: str, optional
            activation function used for the glu inside
            the ConvModule part of the conformer.
            default: "sigmoid".
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU.
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        attention_inner_dim: int, optional
            if equal to -1, attention dim for linears k/q/v is
            equal to d_model. otherwise attention_inner_dim is used.
            default -1.
        attention_glu_type: str, optional
            activation function for glu used in the multihead attention,
             default "swish".
        activation_checkpointing: str, optional
            a dictionary of {"module","interval","offload"}, where
                "module": str
                    accept ["transformer", "attention"] to select
                    which module should do activation checkpointing.
                "interval": int, default 1,
                    interval of applying activation checkpointing,
                    interval = 1 means that we apply checkpointing
                    on every layer (if activation), otherwise,
                    we apply it every x interval.
                "offload": bool, default False,
                    if set to True, we offload activation to cpu and
                    reload it during backward, otherwise,
                    we recalculate activation in backward.
            default "".
        export: bool, optional
            if set to True, it removes the padding from convolutional layers
             and allow the onnx conversion for inference.
              default False.
        use_pt_scaled_dot_product_attention: bool, optional
            if set to True, use pytorch's scaled dot product attention
            implementation in training.
        attn_group_sizes: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attn_group_sizes < attention_heads = Grouped-Query Attention
            attn_group_sizes = attention_heads = Multi-Query Attention
       r                  皙?Frelu   sigmoidTswish d_modelext_pw_out_channeldepthwise_seperable_out_channeldepthwise_multipliern_headd_ffnext_pw_kernel_sizekernel_sizedropout_ratecausal
batch_norm
activationchunk_se
chunk_sizeconv_activationconv_glu_typebias_in_glulinear_glu_in_convmattention_inner_dimattention_glu_typeactivation_checkpointingexport#use_pt_scaled_dot_product_attentionattn_group_sizesreturnNc                    s   t    t|||	||d| _t|||	|||||d| _t|||||||	|
||||||||d| _t|||	||d| _t	
|| _t	
|| _d S )N)r"   d_innerr*   r-   r2   )r8   
group_size)r7   )super__init__r   feed_forward_inr   	self_attnr
   convfeed_forward_outr   	LayerNormlayer_norm_att
layer_norm)selfr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/phi4mm_audio.pyr>      sZ   

zConformerEncoderLayer.__init__xpos_kpos_vmaskrelative_attention_biasc              
   C   sn   |d|  |  }| |}|| j|||||||d }|| | }|d| |  }| |}||||fS )a  ConformerEncoder forward.

        Args:
            x: input feature of shape (batch, max_time_in, size)
            pos_k: positional key embedding.
            pos_v: positional value embedding.
            mask: mask for x (batch, max_time_in)
            relative_attention_bias: bias added to attention logits w.r.t.
                relative positions (1, n_head, time1, time2)
        g      ?rO   )r?   rD   r@   rA   rB   rE   )rF   rK   rL   rM   rN   rO   norm_xoutrI   rI   rJ   forward   s   
	
zConformerEncoderLayer.forward)r   r   r   r   r   r   r   r   r   FFr   r   r   r   r   TFr   r    r!   FFr   N)__name__
__module____qualname____doc__intfloatboolstrr>   torchr   tuplerS   __classcell__rI   rI   rG   rJ   r   %   s    g	
Sr   c                %       sX  e Zd ZdZ														
	d;dedeee B deee B dedededededededede	ee
f dB dede	ee
f dB ded dede	ee
f dB ddf$ fddZd eejB deejB fd!d"Zejde
fd#d$Z		d<deee B dB deee B dB deeef fd%d&Zd'ejdejfd(d)Zd*ejd+ejdeejejf fd,d-Zd*ejdeejdB ejdB f fd.d/Zd0ed1edeee B deee B dejf
d2d3Z		d<d4ejd+ejd5eee B dB d6eee B dB deejejdB ejdB ejejf eejejdB ejdB ejejejf B f
d7d8Zdefd9d:Z  ZS )=TransformerEncoderBaseaj  The Base class for Transformer based encoders

    Please set causal = True in streaming model
    Args:
        input_size: int
            input feature dimension.
        chunk_size: int, list(int)
            Number of frames for each chunk
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training
            Some examples for the 2 cases:
            chunk_size = 12
            chunk_size = [6, 8, 12, 24]
        left_chunk: int, list(int)
            Number of chunks used for masking in streaming mode.
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training. When
            chunk_size is a list, left_chunk must be a list with same length.
            Some examples for the 2 cases:
            left_chunk = 6
            left_chunk = [12, 9, 6, 3]
        attention_dim: int, optional
            attention dimension. default 256.
        attention_heads: int, optional
            the number of heads. default 4
        input_layer: str, optional
            input layer type before Conformer,
            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
            default "conv2d"
        cnn_out: int, optional
            the number of CNN channels before Conformer.
            default -1.
        cnn_layer_norm: bool, optional
            layer norm between Conformer and the first CNN.
            default False.
        time_reduction: int, optional
            time reduction factor
            default 4
        dropout_rate: float, optional
            dropout rate. default 0.1
        padding_idx: int, optional
            padding index for input_layer=embed
            default -1
        relative_attention_bias_args: dict, optional
            use more efficient scalar bias-based relative multihead attention
            (Q*K^T + B) implemented in cmb.basics.embedding.
            [T5/ALiBi]RelativeAttentionLogitBias
            usage: relative_attention_bias_args={"type": t5/alibi}
            additional method-specific arguments can be provided (see
            transformer_base.py)
        positional_dropout_rate: float, optional
            dropout rate after positional encoding. default 0.0
        nemo_conv_settings: dict, optional
            A dictionary of settings for NeMo Subsampling.
            default None
        conv2d_extra_padding: str, optional
            Add extra padding in conv2d subsampling layers. Choices are
            (feat, feat_time, none, True).
            if True or feat_time, the extra padding is added into non full
            supraframe utts in batch.
            Default: none
        attention_group_size: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attention_group_size < attention_heads = Grouped-Query
            Attention
            attention_group_size = attention_heads = Multi-Query Attention
    r   r   	nemo_convr   F        Nnoner   
input_sizer/   
left_chunkattention_dimattention_headsinput_layercnn_outcnn_layer_normtime_reductionr*   padding_idxrelative_attention_bias_argspositional_dropout_ratenemo_conv_settingsconv2d_extra_paddingfeat	feat_timerc   Tattention_group_sizeencoder_embedding_configr:   c              	      s6  t    || _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _| jdkrSd| j	||ddt dd}|rJ|| dD ]
}||vsIJ dq?tdi || _ntd	| t||| _|rf|d
nd | _| jdkr| j| j dkszJ dt| j| j |dd|ddd| _ntt| jd | _d S )Nra   dw_stridingr   r   Fsubsamplingsubsampling_factorfeat_infeat_outconv_channels subsampling_conv_chunking_factorr-   	is_causalry   rz   r{   6{i} should be specified outside of the NeMo dictionaryzunknown input_layer: typet5r   'attention_group_size must divide n_headt5_bias_max_distancei  t5_bias_symmetric)max_distance	symmetricrd   rI   )r=   r>   rd   rh   r/   re   rf   	num_headsrt   rk   ro   ru   r   ReLUupdater   embed
ValueErrorr	   pos_embgetrelative_attention_bias_typer   relative_attention_bias_layerNotImplementedErrorr   encoder_embedding)rF   rd   r/   re   rf   rg   rh   ri   rj   rk   r*   rl   rm   rn   ro   rp   rt   ru   default_nemo_conv_settingsirG   rI   rJ   r>   G  sj   









zTransformerEncoderBase.__init__feature_lensc                 C   s   | j dkrb| jdddv }| jdd}|rP|rPt|tr(t|| j  nt	|| j }|| j }t|trF||dk  d7  < |S |dkrN|d7 }|S t|t
rXt	jntj}||| j S dS )	aj  feature_lens: int
        return updated feature lens.

        This used to return a different lambda function for each case that
        computed the right thing.  That does not work within Torchscript.
        If you really need this to be faster, create nn.Module()-s for all
        the cases and return one of them.  Torchscript does support that.
        ra   rx   rv   )rv   stridingstriding_conv1dr~   Fr   N)rh   ro   r   
isinstancer   r]   ceilrk   longmathrY   )rF   r   subsampling_causal_condr~   lens_changefeature_lens_remainder	ceil_funcrI   rI   rJ   compute_lens_change  s,   


z*TransformerEncoderBase.compute_lens_changec                 C   s   dS )z'Abstract forward method implementation.NrI   rF   rI   rI   rJ   rS     s    zTransformerEncoderBase.forwardc                 C   s   |du r| j }|du r| j}t|tr@ttjdt|dd}|| }t|ts,tdt|t|kr8td|| }||fS |}|}||fS )z>If chunk size is a list, we will randomly select a chunk size.Nr   )r   )lowhighsizez5Since chunk_size is a list, left_chunk must be a listzBThe length of left_chunk must be the same as length of chunk_size.)	r/   re   r   listrY   r]   randintlenr   )rF   r/   re   chunk_size_indexchunk_size_train_effleft_chunk_train_effrI   rI   rJ   _chunk_size_selection  s,   

z,TransformerEncoderBase._chunk_size_selectionr   c                 C   s0   t |t}t |t}|}|r|j}|r|j}|S rT   )r   r   r   _checkpoint_wrapped_modulemodule)rF   r   is_embed_using_act_chkptis_embed_fsdp_wrappedembed_classrI   rI   rJ   _get_embed_class  s   

z'TransformerEncoderBase._get_embed_classinput_tensormasksc                 C   s2   |  | j}t|tsJ | ||\}}||fS rT   )r   r   r   r   )rF   r   r   r   rI   rI   rJ   _forward_embeddings_core  s   z/TransformerEncoderBase._forward_embeddings_corec                 C   s$   d }d }| j d u r| |}||fS rT   )r   r   )rF   r   rL   rM   rI   rI   rJ   _position_embedding  s   
z*TransformerEncoderBase._position_embeddingseq_len
batch_sizec           	      C   sB   |  ||\}}td||}t|||dd|ddg}|S )Nr   )left_windowr   )r   nparanger   	unsqueezeexpand)	rF   r   r   r/   re   r   r   chunk_start_idxenc_streaming_maskrI   rI   rJ   _streaming_mask  s   z&TransformerEncoderBase._streaming_maskxs_padchunk_size_ncleft_chunk_ncc                 C   s  t | |jd }|dkrtd| d|jd }| ||| j| j}|jr0| }| }|}| 	||\}}|}	|	durI|durI||	@ }
n	|durP|}
n|	}
|durq| ||||}|jre| }|durn||@ }n|}nd}| 
|\}}|du r||||
|fS ||||
||fS )a  Forwarding the inputs through the top embedding layers

        Args:
            xs_pad: torch.Tensor
                input tensor
            masks: torch.Tensor
                input mask
            chunk_size_nc: (optional, default is None) chunk size for
                            non-causal layers
            left_chunk_nc: (optional, default is None) # of left chunks for
                            non-causal layers
        r   r   zFThe sequence length after time reduction is invalid: 
                z. Your input feature is too short. Consider 
                filtering out the very short sentence from data 
                loaderN)rY   r   shaper   r   r/   re   is_cudacudar   r   )rF   r   r   r   r   r   r   r   r   streaming_maskhs_maskenc_streaming_mask_nc
hs_mask_ncrL   rM   rI   rI   rJ   forward_embeddings  sH   %


z)TransformerEncoderBase.forward_embeddingsc                 C   s   t | j| jS )a!  Returns offset used when retaining inputs for decoding.

        This is essentially, how many additional frames have to be added to
        the front-end CNN input to ensure it can produce a single output.
        So if the "padding" parameter is 0, typically offset will be > 0.
        )r   rh   rk   r   rI   rI   rJ   r   o  s   z!TransformerEncoderBase.get_offset)r   r   ra   r   Fr   rb   r   Nrb   Nrc   r   N)NN)rU   rV   rW   rX   rY   r   r\   r[   rZ   dictr   r   r>   r]   r   r   abcabstractmethodrS   r^   r   r   Moduler   r   r   r   r   r   r_   rI   rI   rG   rJ   r`      s    M

	
T
$

 





Vr`   c                O       s  e Zd ZU dZee ed< ddddddd	d
ddddddddddddd
ddddg dddddddddf#dedeee B deee B dedB dedededededede	d e	d!ed"e	d#ed$ed%ed&ed'ed(ed)ed*ed+ed,e	d-e	d.ed/e	d0edee d1ed2e
eef dB d3ed4e	d5e
eef dB d6ed7 d8e	d9ed:e
eef dB d;dfN fd<d=Zd>ejd;ejdB fd?d@ZdAejdBejdCejdB d;ejfdDdEZejjdAejdFejd;eejejf fdGdHZ  ZS )IConformerEncoderar  ConformerEncoder module.
    see original paper for more details:
        https://arxiv.org/abs/2005.08100

    Please set causal = True in streaming model
    Args:
        input_size: int
            input feature dimension.
        chunk_size: int, list(int)
            Number of frames for each chunk
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training
            Some examples for the 2 cases:
            chunk_size = 12
            chunk_size = [6, 8, 12, 24]
        left_chunk: int, list(int)
            Number of chunks used for masking in streaming mode.
            This variable can take 2 forms:
            int:  Used for inference, or single chunk size training
            list(int) : Used only for variable chunk size training. When
            chunk_size is a list, left_chunk must be a list with same length.
            Some examples for the 2 cases:
            left_chunk = 6
            left_chunk = [12, 9, 6, 3]
        num_lang: int
            This parameter is used to store the number of languages in the
            lang_dict, only used for multiseed/multilingual models.
            default None.
        attention_dim: int, optional
            attention dimension. default 256.
        attention_heads: int, optional
            the number of heads. default 4
        linear_units:
            the number of units of position-wise feed forward.
            default 2048
        num_block:
            number of Transformer layer. default 6
        dropout_rate: float, optional
            dropout rate. default 0.1
        input_layer: str, optional
            input layer type before Conformer,
            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
            default "conv2d"
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation
            in ConvModule layer of the conformer.
            default False
        cnn_out: int, optional
            the number of CNN channels before Conformer.
            default -1.
        cnn_layer_norm: bool, optional
            layer norm between Conformer and the first CNN.
            default False.
        ext_pw_out_channel: int, optional
            the number of channel for CNN
            before depthwise_seperable_CNN.
            If 0 then use linear. default 0.
        ext_pw_kernel_size: int, optional
            kernel size of N before depthwise_seperable_CNN.
            only work for ext_pw_out_channel > 0.
            default 1
        depthwise_seperable_out_channel: int, optional
            the number of channel for
            depthwise_seperable_CNN.
            default 256.
        depthwise_multiplier: int, optional
            the number of multiplier for
            depthwise_seperable_CNN.
            default 1.
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
            default 0.
        kernel_size: int, optional
            the number of kernels for depthwise_seperable_CNN.
            default 3.
        activation: str, optional
            FeedForward block activation.
            one of ["relu", "swish", "sigmoid"]
            default "relu".
        conv_activation: str, optional
            activation function used in ConvModule part
            of the conformer, default "relu".
        conv_glu_type: str, optional
            activation used use glu in depthwise_seperable_CNN,
            default "sigmoid"
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU. default True
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        attention_glu_type: str
            only work for glu_in_attention !=0
            default "swish".
        export: bool, optional
            if set to True, it removes the padding from convolutional layers
             and allow the onnx conversion for inference.
              default False.
        activation_checkpointing: str, optional
            a dictionarry of {"module","interval","offload"}, where
                "module": str
                    accept ["transformer", "attention"] to select
                    which module should do activation checkpointing.
                "interval": int, default 1,
                    interval of applying activation checkpointing,
                    interval = 1 means that we apply checkpointing
                    on every layer (if activation), otherwise,
                    we apply it every x interval.
                "offload": bool, default False,
                    if set to True, we offload activation to cpu and
                    reload it during backward, otherwise,
                    we recalculate activation in backward.
            default "".
        extra_layer_output_idx: int
            the layer index to be exposed.
        relative_attention_bias_args: dict, optional
            use more efficient scalar bias-based relative multihead attention
            (Q*K^T + B) implemented in cmb.basics.embedding.
            [T5/ALiBi]RelativeAttentionLogitBias
            usage: relative_attention_bias_args={"type": t5/alibi}
            additional method-specific arguments can be provided (see
            transformer_base.py)
        time_reduction: int optional
            time reduction factor
            default 4
        use_pt_scaled_dot_product_attention: whether to use pytorch scaled
            dot product attention in training.
            Default: False
        nemo_conv_settings: dict, optional
            A dictionary of settings for NeMo Subsampling.
            default: None
            usage: nemo_conv_settings=
                {
                    "subsampling":
                    dw_striding/striding/dw_striding_conv1d/striding_conv1d,
                    "conv_channels": int,
                    "subsampling_conv_chunking_factor": int,
                    "is_causal": True/False
                }
        conv2d_extra_padding: str, optional
            Add extra padding in conv2d subsampling layers. Choices are
            (feat, feat_time, none, True)
            Default: none
        replication_pad_for_subsample_embedding:  For batched-streaming
            decoding, use "replication" padding for the cache at start of
            utterance.
            Default: False
        attention_group_size: int, optional
            the number of groups to use for attention, default 1
            (Multi-Head Attention),
            1 = typical Multi-Head Attention,
            1 < attention_group_size < attention_heads = Grouped-Query
            Attention
            attention_group_size = attention_heads = Multi-Query Attention
    extra_multi_layer_output_idxsNr   r   r      r   ra   TFr   r   r   r   r   r   r    r!   rc   rd   r/   re   num_langrf   rg   linear_units
num_blocksr*   rh   r+   r,   ri   rj   r#   r(   r$   r%   r.   r)   r-   r0   r1   r2   r3   r5   r7   extra_layer_output_idxr6   rm   rk   r8   ro   rp   rq   'replication_pad_for_subsample_embeddingrt   ru   r:   c'           '         s   t  j|
||
||| |d|"|#|&d || _|| _| _|$| _| j dks-J d| j | _t 	
fddt	|D  | _
|| _|| _| jdtdd	d
 d S )Nrb   )r*   rm   rn   ro   rp   rt   ru   r   r   c                    s   g | ]L}t di d ddddddddd	d
d d	d
dddddddddqS )r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r5   r6   r7   r8   r9   rI   )r   .0_r-   r6   rf   r5   rt   rg   r,   r2   r+   r.   r/   r0   r1   r%   r$   r*   r7   r(   r#   r)   r3   r   r8   rI   rJ   
<listcomp>h  sd    	
z-ConformerEncoder.__init__.<locals>.<listcomp>dev_typerI   F)
persistent)r=   r>   r   r   r)   r   r   num_heads_kr   rangeencodersr   r   register_bufferr]   zeros)'rF   rd   r/   re   r   rf   rg   r   r   r*   rh   r+   r,   ri   rj   r#   r(   r$   r%   r.   r)   r-   r0   r1   r2   r3   r5   r7   r   r   r6   rm   rk   r8   ro   rp   r   rt   ru   rG   r   rJ   r>   !  sF   )6zConformerEncoder.__init__r   c                 C   s   | j r|  |S d S rT   )r   )rF   r   rI   rI   rJ   init_relative_attention_bias  s   
z-ConformerEncoder.init_relative_attention_biasr   devicerN   c           
      C   s   |j d }|j d }| ||| j| j}||}|d u r|S |d}|}tjd||d|	dd|
dk }	|	
d}	|	|@ }	|	S )Nr   r   )r   r   )r   r   r/   re   tosumr]   r   r   r   r   )
rF   r   r   rN   max_audio_lengthr   r   r   padding_lengthpad_maskrI   rI   rJ   calculate_hs_mask  s$   





z"ConformerEncoder.calculate_hs_maskr   c                 C   s  |  |}| ||\}}}}}d}|j\}}	}
d}|	|kr{d}|	| dkr-||	|  }nd}|dkrEt|ddd|fdd}||j}t||}|durq|d}t|d|fdd}|	d
 }t||}|d }nd}| ||j|}| |}| jdko|du }|r| ||||^}}nt| jD ]\}}||||||d	\}}}}q|r|jd }||d|}|dkr|ddd| ddf }||fS )
zConformer Forward function

        Args:
            xs_pad: torch.Tensor
                input tensor
            masks: torch.Tensor
                post-embedding input lengths
        Fi  Tr   constantNr   r   rP   )r   r   r   Fpadr   r   r   squeezer   rZ   r[   r   r   r   r   	enumeratereshape)rF   r   r   r   rL   rM   r   unfoldedori_bzr   Dmax_seq_lenchunk_pad_sizeinput_tensor_padsubsampled_pad_mask extra_padded_subsamlped_pad_maskmasks_unfoldrO   _simplified_pathr   r   layer	embed_dimrI   rI   rJ   rS     st   



zConformerEncoder.forward)rU   rV   rW   rX   r   rY   __annotations__rZ   r\   r[   r   r   r   r>   r]   r   r   r   r   jitignorer^   rS   r_   rI   rI   rG   rJ   r   y  s  
  &

	
 !"#$%&'(j

r   c                       s   e Zd ZdZ								dd	ed
ededededededef fddZ	ddej	dej	dB dedB de
ej	edB f fddZ  ZS )WindowQformerzWindow-level Qformer   r      r   r   rb   Twindow_sizenum_queriesr   rf   rg   r   r*   normalize_beforec	           	         sh   t    t fddt|D | _ttd| | _	r,tj
 ddnd | _|| _d S )Nc                    s$   g | ]}t j d ddqS )r   T)r"   nheaddim_feedforwarddropoutr-   batch_first
norm_first)r   TransformerDecoderLayerr   rf   rg   r*   r   r  rI   rJ   r     s    
z*WindowQformer.__init__.<locals>.<listcomp>r   g-q=)eps)r=   r>   r   
ModuleListr   decoders	Parameterr]   r   queriesrC   
after_normr  )	rF   r  r  r   rf   rg   r   r*   r  rG   r  rJ   r>     s   


zWindowQformer.__init__Naudio_embedrN   	embed_lenr:   c                 C   s  | dd}|jd | j }|dkrt|d| j| fdd}tj|ddddf d| jfd| jfd}|j\}}}||d| j|}| dd	 }||| | jd}| j	|| dd}	| j
D ]
}
|
|	|d|d
}	qb| jdurw| |	}	|dur|| j }|	||d}||fS )zforward decoderr   r  r   r   r   .N)r)   strider   )tgtmemorytgt_maskmemory_mask)	transposer   r  r   r   unfoldview
contiguousr  r   r  r  )rF   r  rN   r  paddingembed_chunkbszr   slenqr   rR   rI   rI   rJ   rS   (  s0   	



zWindowQformer.forward)r  r   r  r   r  r   rb   TrT   )rU   rV   rW   rX   rY   rZ   r[   r>   r]   r   r^   rS   r_   rI   rI   rG   rJ   r     sJ    	&r   c                	       s   e Zd ZdZdededdf fddZdejddfd	d
Z	dejddfddZ
		ddejdejdB dedejfddZ		ddejdejdB dedejfddZ  ZS )AudioEmbeddingzImage embedding.configkwargsr:   Nc              	      s  t    || _t|dr|jn|j}d }d| _t|jt	rE|j
dd dkrE|j
dd }|d us4J tdi || _|d }|d }ntd|d usQJ d	|| _|| _|
d
d| _|
dd| _|
ddr~|
di }||d< tdi || _nd | _|
ddr| jd u sJ d|
di }d| j||ddt dd}	|r|	| dD ]
}
|
|vsJ dqtdi |	| _nd | _|
dd}|dkrt||| _ni|dkr7|}d}| js| jrdn| j| _t|| j |g}td|D ]}|t t||g qtj| | _t|| j |g}td|D ]}|t t||g qtj| | _ ntd| d|j!| _!d | _"d | _#d S )Nn_embdnamecascadesr$  rf   rd   r!   z(Remember to set values for audio_dim_outfreeze_audio_processorFdownsample_rater   use_qformerqformer_configuse_conv_downsamplez6don't support use qformer and conv downsample togetherro   rv   r   rw   r   r   projection_clslinearmlpr  zprojection_cls = z, not implementedrI   )$r=   r>   r$  hasattrr&  hidden_size	layer_idxr   audio_processorr   r   r   encoderr   audio_dim_outaudio_dim_inr*  r+  r   qformerr   r   r   r   conv_dsLinearaudio_projectionlinear_downsample_rater   extendGELU
Sequentialaudio_projection_for_vision
vocab_sizeinput_embedsaudio_embed_sizes)rF   r$  r%  r3  r7  encoder_confign_melsr-  ro   r   r   r/  dim_projectiondepthlayersr   rG   rI   rJ   r>   X  s   








zAudioEmbedding.__init__rC  c                 C   
   || _ d S rT   )rC  )rF   rC  rI   rI   rJ   set_audio_embeds     
zAudioEmbedding.set_audio_embedsrD  c                 C   rJ  rT   )rD  )rF   rD  rI   rI   rJ   set_audio_embed_sizes  rL  z$AudioEmbedding.set_audio_embed_sizesspeechaudio_attention_maskaudio_projection_modec                 C   sB  | j r t  | ||\}}W d   n1 sw   Y  n| ||\}}| jdur6| j|dd\}}| jdurM|durD|d}| j||d\}}| jdkr| \}}}	|| j }
|
dkrqt	
|ddd| j|
 fdd}|d}|||| j |	| j }|dkr| |}|S |dkr| |}|S td| d	)
zl
        arguments:
            input_embeds: audio features (B, T, D)  B: num audios in a sequence
        N)rN   r   r   r   rN  visionzaudio_projection_mode = z not implemented)r*  r]   no_gradr6  r9  r:  r   r=  r   r   r   r  r<  rA  r   )rF   rC  rO  rP  audio_featuresr   r   bsr   feat_dimr  audio_set_tensorrI   rI   rJ   get_audio_features  sJ   










z!AudioEmbedding.get_audio_featuresrS  c                 C   s    | j |d||d}|dS )z
        arguments:
            audio_features: audio features (T, D)

        returns:
            audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
        r   )rO  rP  )rW  r   r   )rF   rS  rO  rP  audio_embedsrI   rI   rJ   rS     s   
zAudioEmbedding.forward)NrN  )rU   rV   rW   rX   r   r   r>   r]   r   rK  rM  r\   rW  rS   r_   rI   rI   rG   rJ   r#  U  s6    h
9r#  )&r   r   typingr   r   numpyr   r]   torch.nn.functionalr   
functionalr   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   2torch.distributed.fsdp.fully_sharded_data_parallelr   transformersr   'vllm.model_executor.models.phi4mm_utilsr	   r
   r   r   r   r   r   r   r   r   r   r   r   ABCr`   r   r   r#  rI   rI   rI   rJ   <module>   s,   4 Z  }   R