o
    iV`                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlm	Z	 dd	l
mZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! G dd dej"Z#G dd dej"Z$G dd dej"eZ%e!&ddG dd de%Z'e!&ddG dd de%Z(e!&dd G d!d  d e%Z)e!&dd"G d#d" d"e%Z*e!&dd$G d%d$ d$e%Z+dS )&zDecoder definition.    )Any)List)Sequence)TupleN)nn)MultiHeadedAttention)DynamicConvolution)DynamicConvolution2D)PositionalEncoding)	LayerNorm)LightweightConvolution)LightweightConvolution2D)subsequent_mask)make_pad_mask)PositionwiseFeedForward)repeat)BatchScorerInterface)tablesc                       s0   e Zd ZdZ		d	 fdd	Zd
ddZ  ZS )DecoderLayera  Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)


    TFc                    s   t t|   || _|| _|| _|| _t|| _t|| _	t|| _
t|| _|| _|| _| jrEt|| || _t|| || _dS dS )z!Construct an DecoderLayer object.N)superr   __init__size	self_attnsrc_attnfeed_forwardr   norm1norm2norm3r   Dropoutdropoutnormalize_beforeconcat_afterLinearconcat_linear1concat_linear2)selfr   r   r   r   dropout_rater    r!   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/transformer/decoder.pyr   6   s   


zDecoderLayer.__init__Nc              	   C   s  |}| j r
| |}|du r|}|}nT|j|jd |jd d | jfks:J |j d|jd |jd d | jf |ddddddf }|ddddddf }d}|durg|ddddddf }| jrtj|| ||||fdd}	|| |	 }
n|| 	| |||| }
| j s| |
}
|
}| j r| 
|
}
| jrtj|
| |
|||fdd}|| | }
n|| 	| |
||| }
| j s| 
|
}
|
}| j r| |
}
|| 	| |
 }
| j s| |
}
|durtj||
gdd}
|
|||fS )a"  Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor(#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        Nr      z == dim)r    r   shaper   r!   torchcatr   r#   r   r   r   r$   r   r   )r%   tgttgt_maskmemorymemory_maskcacheresidualtgt_q
tgt_q_mask
tgt_concatxx_concatr)   r)   r*   forwardP   sT   
&




zDecoderLayer.forward)TFN)__name__
__module____qualname____doc__r   r=   __classcell__r)   r)   r'   r*   r       s    r   c                       s&   e Zd Z fddZdddZ  ZS )DecoderLayerExportc                    s>   t    |j| _|j| _|j| _|j| _|j| _|j| _d S r>   )r   r   r   r   r   r   r   r   )r%   modelr'   r)   r*   r      s   
zDecoderLayerExport.__init__Nc           
      C   st   |}|  |}|}|}|| |||| }	|	}| |	}	|| |	||| }	|	}| |	}	|| |	 }	|	|||fS r>   )r   r   r   r   r   r   )
r%   r2   r3   r4   r5   r6   r7   r8   r9   r;   r)   r)   r*   r=      s   


zDecoderLayerExport.forwardr>   )r?   r@   rA   r   r=   rC   r)   r)   r'   r*   rD      s    	rD   c                       s   e Zd ZdZddddedfdedededed	ed
edef fddZ	de
jde
jde
jde
jdee
je
jf f
ddZ	d#de
jde
jde
jdee
j dee
jee
j f f
ddZdd Zde
jdee d e
jdee
jee f fd!d"Z  ZS )$BaseTransformerDecodera  Base class of Transfomer decoder module.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
    皙?embedT
vocab_sizeencoder_output_sizer&   positional_dropout_rateinput_layeruse_output_layerr    c	           
   	      s   t    |}	|dkrtjtj||	||	|| _n*|dkr?tjtj||	tj|	tj	|tj
 ||	|| _ntd| || _| jrQt|	| _|r\tj|	|| _nd | _d | _d S )NrH   linearz'only 'embed' or 'linear' is supported: )r   r   r0   r   
Sequential	EmbeddingrH   r"   r   r   ReLU
ValueErrorr    
after_normoutput_layerdecoders)
r%   rI   rJ   r&   rK   rL   rM   pos_enc_classr    attention_dimr'   r)   r*   r      s.   




zBaseTransformerDecoder.__init__hs_padhlens	ys_in_pad
ys_in_lensreturnc                 C   s  |}t |dddddf  |j}t|d|jdd}||@ }|}t ||dd dddddf |j}	|	jd |jd kr^|jd |	jd  }
tjj	
|	d|
fdd}	| |}| ||||	\}}}}	| jrw| |}| jdur| |}|d}||fS )	a@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        Nr,   devicer   r+   )maxlenconstantF)r   tor^   r   r   	unsqueezer/   r0   r   
functionalpadrH   rU   r    rS   rT   sum)r%   rX   rY   rZ   r[   r2   r3   mr4   r5   padlenr;   olensr)   r)   r*   r=      s"   $.




zBaseTransformerDecoder.forwardNr2   r3   r4   r6   c                 C   s   |  |}|du rdgt| j }g }t|| jD ]\}}||||d|d\}}}}	|| q| jr?| |dddf }
n|dddf }
| jdurVtj	| |
dd}
|
|fS )a5  Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, maxlen_out, token)
        Nr6   r,   r-   )
rH   lenrU   zipappendr    rS   rT   r0   log_softmax)r%   r2   r3   r4   r6   r;   	new_cachecdecoderr5   yr)   r)   r*   forward_one_step#  s   

z'BaseTransformerDecoder.forward_one_stepc                 C   sH   t t||jdd}| j|d||d|d\}}|d|fS )zScore.r]   r   ri   )r   rj   r^   rb   rr   squeeze)r%   ysstater;   ys_masklogpr)   r)   r*   scoreH  s   "zBaseTransformerDecoder.scorert   statesxsc                    s   t | t | jd du rd}n fddtD }t|d|jdd}| j||||d\}fddt D }||fS )	a  Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        r   Nc                    s*   g | ] t  fd dtD qS )c                       g | ]}|   qS r)   r)   ).0b)iry   r)   r*   
<listcomp>g      ABaseTransformerDecoder.batch_score.<locals>.<listcomp>.<listcomp>)r0   stackranger|   )n_batchry   )r~   r*   r   f  s    z6BaseTransformerDecoder.batch_score.<locals>.<listcomp>r,   r]   ri   c                    s$   g | ]  fd dt D qS )c                    r{   r)   r)   )r|   r~   )r}   ry   r)   r*   r   o  r   r   )r   r   )n_layersry   )r}   r*   r   o  s   $ )rj   rU   r   r   r   r^   rb   rr   )r%   rt   ry   rz   batch_staterv   rw   
state_listr)   )r   r   ry   r*   batch_scoreN  s   
z"BaseTransformerDecoder.batch_scorer>   )r?   r@   rA   rB   r
   intfloatstrboolr   r0   Tensorr   r=   r   rr   rx   r   r   rC   r)   r)   r'   r*   rF      sp    	)
6
%rF   decoder_classesTransformerDecoderc                       sl   e Zd Zdddddddddeddfd	ed
edededededededededededef fddZ  Z	S )r            rG           rH   TFrI   rJ   attention_headslinear_units
num_blocksr&   rK   self_attention_dropout_ratesrc_attention_dropout_raterL   rM   r    r!   c              
      sF   t  j||||
||d | t| fdd| _d S )NrI   rJ   r&   rK   rL   rM   rV   r    c                    s,   t  t t t S r>   )r   r   r   lnumrW   r   r!   r&   r   r    r   r   r)   r*   <lambda>  s    


z-TransformerDecoder.__init__.<locals>.<lambda>)r   r   r   rU   )r%   rI   rJ   r   r   r   r&   rK   r   r   rL   rM   rV   r    r!   r'   r   r*   r   u  s   
zTransformerDecoder.__init__)
r?   r@   rA   r
   r   r   r   r   r   rC   r)   r)   r'   r*   r   s  sP    	
(LightweightConvolutionTransformerDecoderc                !          e Zd Zdddddddddedddd	dfd
ededededededededededededededee def  fddZ	  Z
S )r   r   r   r   rG   r   rH   TF   r   r   r   r   r   rI   rJ   r   r   r   r&   rK   r   r   rL   rM   r    r!   conv_wshareconv_kernel_lengthconv_usebiasc                    p   t |krtdt  d| t j||||
||d | t| 	
fdd| _d S )NCconv_kernel_length must have equal number of values to num_blocks:  != r   c              
      8   t  t 	|  ddt 
t S NT)wsharen_featr&   kernel_sizeuse_kernel_maskuse_bias)r   r   r   r   r   rW   r   r!   r   r   r   r&   r   r    r   r   r)   r*   r          

zCLightweightConvolutionTransformerDecoder.__init__.<locals>.<lambda>rj   rR   r   r   r   rU   r%   rI   rJ   r   r   r   r&   rK   r   r   rL   rM   rV   r    r!   r   r   r   r'   r   r*   r     .   
z1LightweightConvolutionTransformerDecoder.__init__r?   r@   rA   r
   r   r   r   r   r   r   rC   r)   r)   r'   r*   r     b    	
*LightweightConvolution2DTransformerDecoderc                !       r   )r   r   r   r   rG   r   rH   TFr   rI   rJ   r   r   r   r&   rK   r   r   rL   rM   r    r!   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r   r   r   r   r   r   r)   r*   r     r   zELightweightConvolution2DTransformerDecoder.__init__.<locals>.<lambda>r   r   r'   r   r*   r     r   z3LightweightConvolution2DTransformerDecoder.__init__r   r)   r)   r'   r*   r     r   $DynamicConvolutionTransformerDecoderc                !       r   )r   r   r   r   rG   r   rH   TFr   rI   rJ   r   r   r   r&   rK   r   r   rL   rM   r    r!   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r   r   r   r   r   r   r)   r*   r   A  r   z?DynamicConvolutionTransformerDecoder.__init__.<locals>.<lambda>r   r   r'   r   r*   r     .   

z-DynamicConvolutionTransformerDecoder.__init__r   r)   r)   r'   r*   r     r   &DynamicConvolution2DTransformerDecoderc                !       r   )r   r   r   r   rG   r   rH   TFr   rI   rJ   r   r   r   r&   rK   r   r   rL   rM   r    r!   r   r   r   c                    r   )Nr   r   r   c              
      r   r   )r   r	   r   r   r   r   r)   r*   r   }  r   zADynamicConvolution2DTransformerDecoder.__init__.<locals>.<lambda>r   r   r'   r   r*   r   V  r   z/DynamicConvolution2DTransformerDecoder.__init__r   r)   r)   r'   r*   r   T  r   ),rB   typingr   r   r   r   r0   r   #funasr.models.transformer.attentionr   ,funasr.models.transformer.utils.dynamic_convr   .funasr.models.transformer.utils.dynamic_conv2dr	   #funasr.models.transformer.embeddingr
   $funasr.models.transformer.layer_normr   )funasr.models.transformer.utils.lightconvr   +funasr.models.transformer.utils.lightconv2dr   $funasr.models.transformer.utils.maskr   *funasr.models.transformer.utils.nets_utilsr   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   2funasr.models.transformer.scorers.scorer_interfacer   funasr.registerr   Moduler   rD   rF   registerr   r   r   r   r   r)   r)   r)   r*   <module>   sD   v 
A
,
;
;
;