o
    i_l                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlm	Z	 dd	l
mZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# G dd dej$Z%G dd dej$e!Z&e#'ddG dd de&Z(e#'ddG dd de&Z)e#'ddG d d de&Z*e#'dd!G d"d! d!e&Z+e#'dd#G d$d# d#e&Z,e#'dd%G d&d% d%e&Z-dS )'zDecoder definition.    )Any)List)Sequence)TupleN)nn)MultiHeadedAttention)CosineDistanceAttention)DynamicConvolution)DynamicConvolution2D)PositionalEncoding)	LayerNorm)LightweightConvolution)LightweightConvolution2D)subsequent_mask)make_pad_mask)PositionwiseFeedForward)repeat)BatchScorerInterface)tablesc                       s0   e Zd ZdZ		d	 fdd	Zd
ddZ  ZS )DecoderLayera  Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)


    TFc                    s   t t|   || _|| _|| _|| _t|| _t|| _	t|| _
t|| _|| _|| _| jrEt|| || _t|| || _dS dS )z!Construct an DecoderLayer object.N)superr   __init__size	self_attnsrc_attnfeed_forwardr   norm1norm2norm3r   Dropoutdropoutnormalize_beforeconcat_afterLinearconcat_linear1concat_linear2)selfr   r   r   r   dropout_rater!   r"   	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sa_asr/transformer_decoder.pyr   7   s   


zDecoderLayer.__init__Nc              	   C   s  |}| j r
| |}|du r|}|}nT|j|jd |jd d | jfks:J |j d|jd |jd d | jf |ddddddf }|ddddddf }d}|durg|ddddddf }| jrtj|| ||||fdd}	|| |	 }
n|| 	| |||| }
| j s| |
}
|
}| j r| 
|
}
| jrtj|
| |
|||fdd}|| | }
n|| 	| |
||| }
| j s| 
|
}
|
}| j r| |
}
|| 	| |
 }
| j s| |
}
|durtj||
gdd}
|
|||fS )a"  Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor(#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        Nr      z == dim)r!   r   shaper   r"   torchcatr   r$   r    r   r   r%   r   r   )r&   tgttgt_maskmemorymemory_maskcacheresidualtgt_q
tgt_q_mask
tgt_concatxx_concatr*   r*   r+   forwardQ   sT   
&




zDecoderLayer.forward)TFN)__name__
__module____qualname____doc__r   r>   __classcell__r*   r*   r(   r+   r   !   s    r   c                       s   e Zd ZdZddddedfdedededed	ed
edef fddZ	de
jde
jde
jde
jdee
je
jf f
ddZ	d#de
jde
jde
jdee
j dee
jee
j f f
ddZdd Zde
jdee d e
jdee
jee f fd!d"Z  ZS )$BaseTransformerDecodera  Base class of Transfomer decoder module.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
    皙?embedT
vocab_sizeencoder_output_sizer'   positional_dropout_rateinput_layeruse_output_layerr!   c	           
   	      s   t    |}	|dkrtjtj||	||	|| _n*|dkr?tjtj||	tj|	tj	|tj
 ||	|| _ntd| || _| jrQt|	| _|r\tj|	|| _nd | _d | _d S )NrG   linearz'only 'embed' or 'linear' is supported: )r   r   r1   r   
Sequential	EmbeddingrG   r#   r   r   ReLU
ValueErrorr!   
after_normoutput_layerdecoders)
r&   rH   rI   r'   rJ   rK   rL   pos_enc_classr!   attention_dimr(   r*   r+   r      s.   




zBaseTransformerDecoder.__init__hs_padhlens	ys_in_pad
ys_in_lensreturnc                 C   s  |}t |dddddf  |j}t|d|jdd}||@ }|}t ||dd dddddf |j}	|	jd |jd kr^|jd |	jd  }
tjj	
|	d|
fdd}	| |}| ||||	\}}}}	| jrw| |}| jdur| |}|d}||fS )	@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        Nr-   devicer   r,   maxlenconstantF)r   tor^   r   r   	unsqueezer0   r1   r   
functionalpadrG   rT   r!   rR   rS   sum)r&   rW   rX   rY   rZ   r3   r4   mr5   r6   padlenr<   olensr*   r*   r+   r>      s"   $.




zBaseTransformerDecoder.forwardNr3   r4   r5   r7   c                 C   s   |  |}|du rdgt| j }g }t|| jD ]\}}||||d|d\}}}}	|| q| jr?| |dddf }
n|dddf }
| jdurVtj	| |
dd}
|
|fS )a5  Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, maxlen_out, token)
        Nr7   r-   r.   )
rG   lenrT   zipappendr!   rR   rS   r1   log_softmax)r&   r3   r4   r5   r7   r<   	new_cachecdecoderr6   yr*   r*   r+   forward_one_step  s   

z'BaseTransformerDecoder.forward_one_stepc                 C   sH   t t||jdd}| j|d||d|d\}}|d|fS )zScore.r]   r   rj   )r   rk   r^   rc   rs   squeeze)r&   ysstater<   ys_masklogpr*   r*   r+   score,  s   "zBaseTransformerDecoder.scoreru   statesxsc                    s   t | t | jd du rd}n fddtD }t|d|jdd}| j||||d\}fddt D }||fS )	a  Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        r   Nc                    s*   g | ] t  fd dtD qS )c                       g | ]}|   qS r*   r*   ).0b)irz   r*   r+   
<listcomp>K      ABaseTransformerDecoder.batch_score.<locals>.<listcomp>.<listcomp>)r1   stackranger}   )n_batchrz   )r   r+   r   J  s    z6BaseTransformerDecoder.batch_score.<locals>.<listcomp>r-   r]   rj   c                    s$   g | ]  fd dt D qS )c                    r|   r*   r*   )r}   r   )r~   rz   r*   r+   r   S  r   r   )r   r   )n_layersrz   )r~   r+   r   S  s   $ )rk   rT   r   r   r   r^   rc   rs   )r&   ru   rz   r{   batch_staterw   rx   
state_listr*   )r   r   rz   r+   batch_score2  s   
z"BaseTransformerDecoder.batch_scorer?   )r@   rA   rB   rC   r   intfloatstrboolr   r1   Tensorr   r>   r   rs   ry   r   r   rD   r*   r*   r(   r+   rE      sp    	)
6
%rE   decoder_classesTransformerDecoderc                       sl   e Zd Zdddddddddeddfd	ed
edededededededededededef fddZ  Z	S )r            rF           rG   TFrH   rI   attention_headslinear_units
num_blocksr'   rJ   self_attention_dropout_ratesrc_attention_dropout_raterK   rL   r!   r"   c              
      sF   t  j||||
||d | t| fdd| _d S )NrH   rI   r'   rJ   rK   rL   rU   r!   c                    ,   t  t t t S r?   r   r   r   lnumrV   r   r"   r'   r   r!   r   r   r*   r+   <lambda>x      


z-TransformerDecoder.__init__.<locals>.<lambda>)r   r   r   rT   )r&   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   rU   r!   r"   r(   r   r+   r   Y  s   
zTransformerDecoder.__init__)
r@   rA   rB   r   r   r   r   r   r   rD   r*   r*   r(   r+   r   W  sP    	
ParaformerDecoderSANc                       s   e Zd ZdZdddddddddedd	d
fdedededededededededededededef fddZ	de
jde
jde
jde
jdee
je
jf f
d d!Z  ZS )"r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2006.01713
    r   r   r   rF   r   rG   TFr-   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   r!   r"   	embeds_idc              
      sR   t  j||||
||d | t| fdd| _|| _ | _d S )Nr   c                    r   r?   r   r   r   r*   r+   r     r   z/ParaformerDecoderSAN.__init__.<locals>.<lambda>)r   r   r   rT   r   rV   )r&   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   rU   r!   r"   r   r(   r   r+   r     s"   
zParaformerDecoderSAN.__init__rW   rX   rY   rZ   r[   c                 C   s"  |}t |dddddf  |j}|}t ||dd dddddf |j}|jd |jd krM|jd |jd  }	tjj|d|	fdd}|}
d}t	| j
D ]\}}||
|||\}
}}}|| jkrl|
}qV| jru| |
}
| jdur| |
}
|d}|dur|
||fS |
|fS )r\   Nr,   r_   r-   r   ra   F)r   rb   r^   r   r0   r1   r   rd   re   	enumeraterT   r   r!   rR   rS   rf   )r&   rW   rX   rY   rZ   r3   r4   r5   r6   rh   r<   embeds_outputslayer_idrq   ri   r*   r*   r+   r>     s,   $.





zParaformerDecoderSAN.forward)r@   rA   rB   rC   r   r   r   r   r   r   r1   r   r   r>   rD   r*   r*   r(   r+   r     sn    
	
-(LightweightConvolutionTransformerDecoderc                !          e Zd Zdddddddddedddd	dfd
ededededededededededededededee def  fddZ	  Z
S )r   r   r   r   rF   r   rG   TF   r   r   r   r   r   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   r!   r"   conv_wshareconv_kernel_lengthconv_usebiasc                    p   t |krtdt  d| t j||||
||d | t| 	
fdd| _d S )NCconv_kernel_length must have equal number of values to num_blocks:  != r   c              
      8   t  t 	|  ddt 
t S NT)wsharen_featr'   kernel_sizeuse_kernel_maskuse_bias)r   r   r   r   r   rV   r   r"   r   r   r   r'   r   r!   r   r   r*   r+   r          

zCLightweightConvolutionTransformerDecoder.__init__.<locals>.<lambda>rk   rQ   r   r   r   rT   r&   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   rU   r!   r"   r   r   r   r(   r   r+   r     .   
z1LightweightConvolutionTransformerDecoder.__init__r@   rA   rB   r   r   r   r   r   r   r   rD   r*   r*   r(   r+   r     b    	
*LightweightConvolution2DTransformerDecoderc                !       r   )r   r   r   r   rF   r   rG   TFr   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   r!   r"   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r   r   r   r   r   r   r*   r+   r   S  r   zELightweightConvolution2DTransformerDecoder.__init__.<locals>.<lambda>r   r   r(   r   r+   r   ,  r   z3LightweightConvolution2DTransformerDecoder.__init__r   r*   r*   r(   r+   r   *  r   $DynamicConvolutionTransformerDecoderc                !       r   )r   r   r   r   rF   r   rG   TFr   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   r!   r"   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r   r	   r   r   r   r   r*   r+   r     r   z?DynamicConvolutionTransformerDecoder.__init__.<locals>.<lambda>r   r   r(   r   r+   r   h  .   

z-DynamicConvolutionTransformerDecoder.__init__r   r*   r*   r(   r+   r   f  r   &DynamicConvolution2DTransformerDecoderc                !       r   )r   r   r   r   rF   r   rG   TFr   rH   rI   r   r   r   r'   rJ   r   r   rK   rL   r!   r"   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r   r
   r   r   r   r   r*   r+   r     r   zADynamicConvolution2DTransformerDecoder.__init__.<locals>.<lambda>r   r   r(   r   r+   r     r   z/DynamicConvolution2DTransformerDecoder.__init__r   r*   r*   r(   r+   r     r   ).rC   typingr   r   r   r   r1   r   #funasr.models.transformer.attentionr   funasr.models.sa_asr.attentionr   ,funasr.models.transformer.utils.dynamic_convr	   .funasr.models.transformer.utils.dynamic_conv2dr
   #funasr.models.transformer.embeddingr   $funasr.models.transformer.layer_normr   )funasr.models.transformer.utils.lightconvr   +funasr.models.transformer.utils.lightconv2dr   $funasr.models.transformer.utils.maskr   *funasr.models.transformer.utils.nets_utilsr   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   2funasr.models.transformer.scorers.scorer_interfacer   funasr.registerr   Moduler   rE   registerr   r   r   r   r   r   r*   r*   r*   r+   <module>   sH   v 
A
,
i
;
;
;