o
    i_X                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZmZ G dd de jjZe ddG dd deZ!dS )    N)ListTuple)tables)utils)repeat)DecoderLayer)	LayerNorm)PositionalEncoding)MultiHeadedAttention)make_pad_mask)BaseTransformerDecoder)PositionwiseFeedForward)"PositionwiseFeedForwardDecoderSANM)MultiHeadedAttentionSANMDecoderMultiHeadedAttentionCrossAttc                       sP   e Zd ZdZ		d fdd	ZdddZdd	d
ZdddZ	dddZ  Z	S )DecoderLayerSANMa  Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)


    TFc                    s   t t|   || _|| _|| _|| _t|| _|dur!t|| _	|dur*t|| _
tj|| _|| _|| _| jrNtj|| || _tj|| || _d| _g | _dS )z!Construct an DecoderLayer object.NF)superr   __init__size	self_attnsrc_attnfeed_forwardr   norm1norm2norm3torchnnDropoutdropoutnormalize_beforeconcat_afterLinearconcat_linear1concat_linear2reserve_attnattn_mat)selfr   r   r   r   dropout_rater   r    	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/paraformer_v2_community/decoder.pyr   0   s$   



zDecoderLayerSANM.__init__Nc                 C   s   |}| j r
| |}| |}|}| jr+| j r| |}| ||\}}|| | }| jdur_|}| j r:| |}| jrO| j|||dd\}	}
| j	
|
 n	| j|||dd}	|| |	 }|||||fS )"  Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor(#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        NTret_attnF)r   r   r   r   r   r   r   r   r$   r%   append)r&   tgttgt_maskmemorymemory_maskcacheresidualx_
x_src_attnr%   r*   r*   r+   forwardN   s(   




zDecoderLayerSANM.forwardc           
      C   st   |}|  |}| |}|}| jd ur&| |}| j|||d\}}|| }|}| |}| j|||dd\}}	|	S )Nr4   Tr-   )r   r   r   r   r   r   )
r&   r0   r1   r2   r3   r4   r5   r6   r8   r%   r*   r*   r+   get_attn_mat{   s   




zDecoderLayerSANM.get_attn_matc                 C   s   |}| j r
| |}| |}|}| jr2| j r| |}| jr!d}| j|||d\}}|| | }| jdurM|}| j rA| |}|| | ||| }|||||fS )r,   Nr:   )	r   r   r   r   r   trainingr   r   r   )r&   r0   r1   r2   r3   r4   r5   r6   r*   r*   r+   forward_one_step   s$   




z!DecoderLayerSANM.forward_one_stepr   c           	      C   s   |}| j r
| |}| |}|}| jr,| j r| |}| |d|\}}|| | }| jdurK|}| j r;| |}| j|||||\}}|| }||||fS )r,   N)	r   r   r   r   r   r   r   r   forward_chunk)	r&   r0   r2   
fsmn_cache	opt_cache
chunk_size	look_backr5   r6   r*   r*   r+   r>      s"   




zDecoderLayerSANM.forward_chunk)TF)NN)NNNr   )
__name__
__module____qualname____doc__r   r9   r;   r=   r>   __classcell__r*   r*   r(   r+   r      s    

-
+r   decoder_classes"ParaformerSANMDecoder_v2_communityc                1       s  e Zd ZdZdddddddddd	edd	dd
ddddddddfdedededededededededededededed ed!ed"ed#e	e d$ed%ed&ed'e
d(ed)ef0 fd*d+Z					dDd,ejd-ejd.ejd/ejd0ejd1ed2ed3eejejf fd4d5Zd6d7 Zd,ejd-ejd.ejd/ejfd8d9Zd,ejd-ejd.ejd/ejfd:d;Z	dEd<ejd=ejd>ed3eejejf fd?d@Z	dEd=ejdAejd<ejd>e	ej d3eeje	ej f f
dBdCZ  ZS )FParaformerSANMDecoderz
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2006.01713
       i      g?g        embedTF   r   N      )   decoderzseq2seq/decoder
vocab_sizeencoder_output_sizeattention_headslinear_units
num_blocksr'   positional_dropout_rateself_attention_dropout_ratesrc_attention_dropout_rateinput_layeruse_output_layerwo_input_layerr   r    att_layer_numkernel_size
sanm_shfit	lora_list	lora_rank
lora_alphalora_dropoutchunk_multiply_factor!tf2torch_tensor_name_prefix_torchtf2torch_tensor_name_prefix_tfc                    s  t  j||||
||
d | |rd | _n;|
dkr'tjtj| | _n*|
dkrJtjtj| tj tj	tj
 | || _ntd|
 
| _| jr\t | _|rgtj || _nd | _|| _|| _d u rzd d t| 	
fdd| _|| d	krd | _nt||  
fd
d| _td 
fdd| _|| _|| _|| _d S )N)rS   rT   r'   rX   r[   r\   pos_enc_classr   rM   linearz'only 'embed' or 'linear' is supported: rQ      c                    s8   t  t dt 	t 
S )Nr`   )r   r   r   r   lnumattention_dimrU   r    r'   r_   rV   rc   rd   ra   rb   r   r`   rY   rZ   r*   r+   <lambda>1  s&    
	z0ParaformerSANMDecoder.__init__.<locals>.<lambda>r   c                    s(   t  t ddd t S )Nr   rk   )r   r   r   rl   )ro   r    r'   r_   rV   r   rY   r*   r+   rp   J  s    
c                    s   t  d d t S N)r   r   rl   )ro   r    r'   rV   r   r*   r+   rp   Y  s    
)r   r   rM   r   r   
Sequential	Embeddingr!   r   r   ReLU
ValueErrorr   
after_normoutput_layerr^   rW   r   decoders	decoders2	decoders3rf   rg   re   )r&   rS   rT   rU   rV   rW   r'   rX   rY   rZ   r[   r\   r]   rh   r   r    r^   r_   r`   ra   rb   rc   rd   re   rf   rg   r(   rn   r+   r      sj   


$
zParaformerSANMDecoder.__init__hs_padhlens	ys_in_pad
ys_in_lens
chunk_maskreturn_hiddenreturn_bothreturnc                 C   sX  |}t j||jddddddf }	|}
t j||
jddddddf }|durL|| }|	d|dkrLtj||ddddddf fdd}| |}| ||	|
|\}}	}
}}| jdurp| ||	|
|\}}	}
}}| 	||	|
|\}}	}
}}| j
r| |}|	d}| jdur|du r| |}||fS |r| |}|||fS ||fS )@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        deviceNrQ   dimF)myutilssequence_maskr   r   r   catrM   rx   ry   rz   r   rv   sumrw   )r&   r{   r|   r}   r~   r   r   r   r0   r1   r2   r3   r6   r7   hiddenolensr*   r*   r+   r9   g  s.   ""(






zParaformerSANMDecoder.forwardc                 C   sd   t jtjt|gtjd|jddddddf }| j|d||d|d\}}|	d|fS )zScore.)dtyper   Nr   r:   )
r   r   r   tensorlenint32r   r=   	unsqueezesqueeze)r&   ysstater6   ys_masklogpr*   r*   r+   score  s   "zParaformerSANMDecoder.scorec                 C   s   |}t j||jdd d d d d f }|}t j||jdd d d d d f }| jd ||||\}}}}}	| jjd ||||}
|
S )Nr   r   rQ   )r   r   r   rx   modelr;   r&   r{   r|   r}   r~   r0   r1   r2   r3   r7   r%   r*   r*   r+   forward_asf2  s   ""z"ParaformerSANMDecoder.forward_asf2c                 C   s   |}t j||jdd d d d d f }|}t j||jdd d d d d f }| jd ||||\}}}}}	| jd ||||\}}}}}	| jd ||||\}}}}}	| jd ||||\}}}}}	| jd ||||\}}}}}	| jd ||||}
|
S )Nr   r   rQ   rj      rK      )r   r   r   rx   r;   r   r*   r*   r+   forward_asf6  s   ""z"ParaformerSANMDecoder.forward_asf6r2   r0   r4   c              	   C   s  |}|d du rt | j}| jdur|t | j7 }dg| }n|d }|d du r4t | j}dg| }n|d }t| jD ]"}| j| }	|	j|||| || |d |d d\}}||< ||< q=| j| j dkrt| j| j D ]}|| j }
| j| }	|	j||||
 d\}}||
< }qp| jD ]}	|	||\}}}}q| jr| 	|}| j
dur| 
|}||d< |d d	ks|d d
kr||d< |S )r   decode_fsmnNoptrA   decoder_chunk_look_back)r?   r@   rA   rB   rQ   )r?   r   r   )r   rx   ry   ranger^   r>   rW   rz   r   rv   rw   )r&   r2   r0   r4   r6   cache_layer_numr?   r@   irR   jr7   r*   r*   r+   r>     sL   



	






z#ParaformerSANMDecoder.forward_chunkr1   c                 C   st  |  |}|du rt| j}| jdur|t| j7 }dg| }g }t| jD ]}| j| }	|| }
|	j|||d|
d\}}}}}|| q&| j| j dkr{t| j| j D ]$}|| j }| j| }	|| }
|	j|||d|
d\}}}}}|| qV| j	D ]}	|	j|||ddd\}}}}}q~| j
r| |dddf }n|dddf }| jdurtj| |dd}||fS )a5  Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, maxlen_out, token)
        Nr:   rQ   r   r   )rM   r   rx   ry   r   r^   r=   r/   rW   rz   r   rv   rw   r   log_softmax)r&   r0   r1   r2   r4   r6   r   	new_cacher   rR   cr3   c_retr   r7   yr*   r*   r+   r=     sB   











z&ParaformerSANMDecoder.forward_one_step)NFFrq   )rC   rD   rE   rF   r	   intfloatstrboolr   tupler   r   Tensorr   r9   r   r   r   dictr>   r=   rG   r*   r*   r(   r+   rJ      s   
	
 	
6


JrJ   )"r   typingr   r   funasr.registerr   funasr.models.scamar   r   &funasr.models.transformer.utils.repeatr   !funasr.models.transformer.decoderr   $funasr.models.transformer.layer_normr   #funasr.models.transformer.embeddingr	   #funasr.models.transformer.attentionr
   *funasr.models.transformer.utils.nets_utilsr   r   3funasr.models.transformer.positionwise_feed_forwardr   ,funasr.models.sanm.positionwise_feed_forwardr   funasr.models.sanm.attentionr   r   r   Moduler   registerrJ   r*   r*   r*   r+   <module>   s$    
G