"""Decoder definition."""

from typing import Any
from typing import List
from typing import Sequence
from typing import Tuple

import torch
from torch import nn

from funasr.models.transformer.attention import MultiHeadedAttention
from funasr.models.transformer.utils.dynamic_conv import DynamicConvolution
from funasr.models.transformer.utils.dynamic_conv2d import DynamicConvolution2D
from funasr.models.transformer.embedding import PositionalEncoding
from funasr.models.transformer.layer_norm import LayerNorm
from funasr.models.transformer.utils.lightconv import LightweightConvolution
from funasr.models.transformer.utils.lightconv2d import LightweightConvolution2D
from funasr.models.transformer.utils.mask import subsequent_mask
from funasr.models.transformer.utils.nets_utils import make_pad_mask
from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward
from funasr.models.transformer.utils.repeat import repeat
from funasr.models.transformer.scorers.scorer_interface import BatchScorerInterface
from omegaconf import OmegaConf

from funasr.register import tables


# Note: this local LayerNorm shadows the imported one above. It runs the
# normalization in float32 and casts the result back to the input dtype, which
# keeps bf16 RWKV activations numerically stable.
class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


class DecoderLayer(nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Source-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)


    """

    def __init__(
        self,
        size,
        src_attn,
        feed_forward,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
        layer_id=None,
        args={},
        **kwargs,
    ):
        """Construct a DecoderLayer object."""
        super(DecoderLayer, self).__init__()
        self.size = size
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear1 = nn.Linear(size + size, size)
            self.concat_linear2 = nn.Linear(size + size, size)
        self.layer_id = layer_id

        # Pick the RWKV implementation that matches the configured version.
        if args.get("version", "v4") == "v4":
            from funasr.models.sense_voice.rwkv_v4 import RWKVLayer
            from funasr.models.sense_voice.rwkv_v4 import RWKV_TimeMix as RWKV_Tmix
        elif args.get("version", "v4") == "v5":
            from funasr.models.sense_voice.rwkv_v5 import RWKVLayer
            from funasr.models.sense_voice.rwkv_v5 import RWKV_Tmix_x052 as RWKV_Tmix
        else:
            from funasr.models.sense_voice.rwkv_v6 import RWKVLayer
            from funasr.models.sense_voice.rwkv_v6 import RWKV_Tmix_x060 as RWKV_Tmix

        # The RWKV layer takes the place of the usual masked self-attention.
        self.self_attn = RWKVLayer(args, layer_id=layer_id)
        self.args = args

        self.ln0 = None
        if self.layer_id == 0 and not args.get("ln0", True):
            self.ln0 = LayerNorm(args.n_embd)

        if args.get("init_rwkv", True):
            print("init_rwkv")
            # RWKV-style initialization: a depth-dependent scale for the inner
            # normalization, orthogonal projection weights, and a zero output
            # projection. NOTE: reconstructed from the compiled module; the exact
            # target tensors and gains are approximate.
            scale = ((1 + layer_id) / args.n_layer) ** 0.7
            for m in self.self_attn.modules():
                if isinstance(m, RWKV_Tmix):
                    nn.init.constant_(m.ln_x.weight, scale)
                    nn.init.orthogonal_(m.receptance.weight, gain=1.0)
                    nn.init.orthogonal_(m.key.weight, gain=0.1)
                    nn.init.orthogonal_(m.value.weight, gain=1.0)
                    nn.init.orthogonal_(m.gate.weight, gain=0.1)
                    nn.init.zeros_(m.output.weight)

        if args.get("datatype", "bf16") == "bf16":
            self.self_attn = self.self_attn.to(torch.bfloat16)

    def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        if self.layer_id == 0 and self.ln0 is not None:
            tgt = self.ln0(tgt)
        if self.args.get("datatype", "bf16") == "bf16":
            tgt = tgt.bfloat16()

        # RWKV block in place of masked self-attention.
        residual = tgt
        tgt = self.norm1(tgt)
        if cache is None:
            x = residual + self.dropout(self.self_attn(tgt, mask=tgt_mask))
        else:
            # Incremental decoding: run on the whole prefix, keep the newest frame.
            tgt_q_mask = None
            x = residual + self.dropout(self.self_attn(tgt, mask=tgt_q_mask))
            x = x[:, -1:, :]
        if self.args.get("datatype", "bf16") == "bf16":
            x = x.to(torch.float32)

        # Source attention over the encoder memory.
        residual = x
        x = self.norm2(x)
        x = residual + self.dropout(self.src_attn(x, memory, memory, memory_mask))

        # Position-wise feed forward.
        residual = x
        x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask


class BaseTransformerDecoder(nn.Module, BatchScorerInterface):
    """Base class of Transformer decoder module.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of units of position-wise feed forward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to use layer_norm before the first block
        concat_after: whether to concat attention layer's input and output
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied.
            i.e. x -> x + att(x)
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
    ):
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = None

        # The decoder stack is built by the concrete subclass.
        self.decoders = None

    def forward(
        self,
        hs_pad: torch.Tensor,
        hlens: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward decoder.

        Args:
            hs_pad: encoded memory, float32 (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)

        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True
            olens: (batch,)
        """
        tgt = ys_in_pad
        # tgt_mask: (B, 1, L)
        tgt_mask = (~make_pad_mask(ys_in_lens)[:, None, :]).to(tgt.device)
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1), device=tgt_mask.device).unsqueeze(0)
        # tgt_mask: (B, L, L)
        tgt_mask = tgt_mask & m

        memory = hs_pad
        memory_mask = (~make_pad_mask(hlens, maxlen=memory.size(1)))[:, None, :].to(memory.device)
        # Pad the memory mask if the encoder output was padded further downstream.
        if memory_mask.shape[-1] != memory.shape[1]:
            padlen = memory.shape[1] - memory_mask.shape[-1]
            memory_mask = torch.nn.functional.pad(memory_mask, (0, padlen), "constant", False)

        x = self.embed(tgt)
        x, tgt_mask, memory, memory_mask = self.decoders(x, tgt_mask, memory, memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.output_layer is not None:
            x = self.output_layer(x)

        olens = tgt_mask.sum(1)
        return x, olens

    def forward_one_step(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        cache: List[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (including 1.2)
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)

        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            `y.shape` is (batch, maxlen_out, token)
        """
        x = self.embed(tgt)
        if cache is None:
            cache = [None] * len(self.decoders)
        new_cache = []
        for c, decoder in zip(cache, self.decoders):
            x, tgt_mask, memory, memory_mask = decoder(x, tgt_mask, memory, None, cache=c)
            new_cache.append(x)

        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.output_layer is not None:
            y = torch.log_softmax(self.output_layer(y), dim=-1)

        return y, new_cache

    def score(self, ys, state, x):
        """Score."""
        ys_mask = subsequent_mask(len(ys), device=x.device).unsqueeze(0)
        logp, state = self.forward_one_step(
            ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state
        )
        return logp.squeeze(0), state

    def batch_score(
        self, ys: torch.Tensor, states: List[Any], xs: torch.Tensor
    ) -> Tuple[torch.Tensor, List[Any]]:
        """Score new token batch.

        Args:
            ys (torch.Tensor): torch.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (torch.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).

        Returns:
            tuple[torch.Tensor, List[Any]]: Tuple of
                batchified scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.

        """
        # Merge states: transpose the state of [batch, layer] into [layer, batch].
        n_batch = len(ys)
        n_layers = len(self.decoders)
        if states[0] is None:
            batch_state = None
        else:
            batch_state = [
                torch.stack([states[b][i] for b in range(n_batch)]) for i in range(n_layers)
            ]

        # Batch decoding.
        ys_mask = subsequent_mask(ys.size(-1), device=xs.device).unsqueeze(0)
        logp, states = self.forward_one_step(ys, ys_mask, xs, cache=batch_state)

        # Transpose the state of [layer, batch] back into [batch, layer].
        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
        return logp, state_list


@tables.register("decoder_classes", "TransformerRWKVDecoder")
class TransformerRWKVDecoder(BaseTransformerDecoder):
    """Transformer decoder whose self-attention blocks are replaced by RWKV layers."""

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )
        attention_dim = encoder_output_size

        args = kwargs.get("rwkv_cfg", {})
        args = OmegaConf.create(args)
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim, src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
                layer_id=lnum,
                args=args,
            ),
        )

        if args.get("init_rwkv", True):
            print("init_rwkv")
            nn.init.uniform_(self.embed[0].weight, a=-1e-4, b=1e-4)

    # forward() and forward_one_step() are inherited unchanged from
    # BaseTransformerDecoder.
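

# Illustration only (not part of the original module): BaseTransformerDecoder.forward
# builds its target mask by combining a padding mask with a causal (subsequent) mask.
# The snippet below reproduces that logic with plain torch so the resulting
# (batch, maxlen_out, maxlen_out) mask is easy to inspect; the lengths used here are
# made-up example values.
if __name__ == "__main__":
    ys_in_lens = torch.tensor([4, 2])  # two decoder prefixes of length 4 and 2
    maxlen = int(ys_in_lens.max())

    # Padding mask: True where a position holds a real (non-padded) token, (B, 1, L).
    pad_mask = (torch.arange(maxlen)[None, :] < ys_in_lens[:, None])[:, None, :]
    # Causal mask: True on and below the diagonal, (1, L, L).
    causal = torch.tril(torch.ones(maxlen, maxlen, dtype=torch.bool)).unsqueeze(0)
    # Combined target mask handed to the decoder layers, (B, L, L).
    tgt_mask = pad_mask & causal

    print(tgt_mask.shape)  # torch.Size([2, 4, 4])
    print(tgt_mask[1])     # second utterance: only its first 2 positions are visible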