o
    iaH                     @   s   d dl mZ d dl mZ d dlZd dlZd dlmZ d dlZd dl	m
Z d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ G dd dejZeddG dd deZdS )    )List)TupleN)utils)BaseTransformerDecoder)MultiHeadedAttentionSANMDecoderMultiHeadedAttentionCrossAtt)PositionalEncoding)	LayerNorm)"PositionwiseFeedForwardDecoderSANM)repeat)tablesc                       sF   e Zd ZdZ		d fdd	ZdddZdd	d
Z	dddZ  ZS )DecoderLayerSANMa  Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)


    TFc                    s   t t|   || _|| _|| _|| _t|| _|dur!t|| _	|dur*t|| _
t|| _|| _|| _| jrMt|| || _t|| || _dS dS )z!Construct an DecoderLayer object.N)superr   __init__size	self_attnsrc_attnfeed_forwardr	   norm1norm2norm3nnDropoutdropoutnormalize_beforeconcat_afterLinearconcat_linear1concat_linear2)selfr   r   r   r   dropout_rater   r   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sanm/decoder.pyr   2   s"   


zDecoderLayerSANM.__init__Nc           	      C   s   |}| j r
| |}| |}|}| jr+| j r| |}| ||\}}|| | }| jdurF|}| j r:| |}|| | ||| }|||||fS "  Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor(#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        N)r   r   r   r   r   r   r   r   )	r   tgttgt_maskmemorymemory_maskcacheresidualx_r#   r#   r$   forwardN   s    




zDecoderLayerSANM.forwardc                 C   s   |}| j r
| |}| |}|}| jr2| j r| |}| jr!d}| j|||d\}}|| | }| jdurM|}| j rA| |}|| | ||| }|||||fS )r&   Nr+   )	r   r   r   r   r   trainingr   r   r   )r   r'   r(   r)   r*   r+   r,   r-   r#   r#   r$   forward_one_stepv   s$   




z!DecoderLayerSANM.forward_one_stepr   c           	      C   s   |}| j r
| |}| |}|}| jr,| j r| |}| |d|\}}|| | }| jdurK|}| j r;| |}| j|||||\}}|| }||||fS r%   )	r   r   r   r   r   r   r   r   forward_chunk)	r   r'   r)   
fsmn_cache	opt_cache
chunk_size	look_backr,   r-   r#   r#   r$   r3      s"   




zDecoderLayerSANM.forward_chunk)TFNN)NNNr   )	__name__
__module____qualname____doc__r   r/   r2   r3   __classcell__r#   r#   r!   r$   r      s    

(+r   decoder_classesFsmnDecoderc                +       sH  e Zd ZdZdddddddddedd	dd
dd	ddddfdedededededededededededededededededed ed!ed"ef* fd#d$Z			d7d%e
jd&e
jd'e
jd(e
jd)e
jd*e
jd+ee
je
jf fd,d-Z		d7d*e
jfd.d/Z			d8d0e
jd1e
jd2e
jd3e
jd*e
jd4ee
j d+ee
jee
j f fd5d6Z  ZS )9r?   z
    Author: Zhifu Gao, Shiliang Zhang, Ming Lei, Ian McLoughlin
    San-m: Memory equipped self-attention for end-to-end speech recognition
    https://arxiv.org/abs/2006.01713
       i      g?g        embedTF   Ndecoderzseq2seq/decoder
vocab_sizeencoder_output_sizeattention_headslinear_units
num_blocksr    positional_dropout_rateself_attention_dropout_ratesrc_attention_dropout_rateinput_layeruse_output_layerr   r   att_layer_numkernel_size
sanm_shfitconcat_embedsattention_dim!tf2torch_tensor_name_prefix_torchtf2torch_tensor_name_prefix_tfembed_tensor_name_prefix_tfc                    s  t  j|||
||d  d u r |
dkr%tjtj| | _n*|
dkrHtjtj| tj tj	tj
 | || _ntd|
 | _| jrZt | _|retj || _nd | _|| _|| _d u rxd d t| 	
fdd| _|| d	krd | _nt||  	fd
d| _td fdd| _|rtd fdd| _nd | _|| _|| _|| _|| _d S )N)rE   rF   r    rJ   rM   rN   pos_enc_classr   rB   linearz'only 'embed' or 'linear' is supported:       c              	      s4   t  t 	dt 
dt S )NrQ   )rF   )r   r   r   r
   lnumrS   rG   r   r    rF   rP   rH   r   rQ   rK   rL   r#   r$   <lambda>  s     
z&FsmnDecoder.__init__.<locals>.<lambda>r   c                    s(   t  t dd t S )Nr[   )r   r   r
   r\   )rS   r   r    rP   rH   r   rQ   rK   r#   r$   r_   ,  s    
c                    s   t  d d t S )Nr   r
   r\   )rS   r   r    rH   r   r#   r$   r_   >  s    
c              
      s(   t   d d t   dS )N)adimr`   r\   )rS   r   r    rF   rH   r   r#   r$   r_   K  s    )r   r   torchr   
Sequential	EmbeddingrB   r   r	   r   ReLU
ValueErrorr   
after_normoutput_layerrO   rI   r   decoders	decoders2	decoders3embed_concat_ffnrR   rT   rU   rV   )r   rE   rF   rG   rH   rI   r    rJ   rK   rL   rM   rN   rW   r   r   rO   rP   rQ   rR   rS   rT   rU   rV   r!   r^   r$   r      sv   




zFsmnDecoder.__init__hs_padhlens	ys_in_pad
ys_in_lens
chunk_maskpre_acoustic_embedsreturnc                 C   sl  |}t j||jddddddf }|}	t j||	jddddddf }
|durL|
| }
|d|
dkrLtj|
|
ddddddf fdd}
| |}|duro| jrotj||fdd}| |dddd\}}}}}| 	|||	|
\}}}	}
}| j
dur| 
|||	|
\}}}	}
}| |||	|
\}}}	}
}| jr| |}| jdur| |}|d}||fS )a@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        deviceNrY   dim)myutilssequence_maskru   r   rb   catrB   rR   rl   ri   rj   rk   r   rg   rh   sum)r   rm   rn   ro   rp   rq   rr   r'   r(   r)   r*   r-   r.   olensr#   r#   r$   r/   a  s,   ""(





zFsmnDecoder.forwardc                 C   sh   t jtjt|gtjd|jddddddf }| j|d||d|||d\}}|	d|fS )zScore.)dtypert   Nr   )r*   rr   r+   )
rz   r{   rb   tensorlenint32ru   r2   	unsqueezesqueeze)r   ysstater-   x_maskrr   ys_masklogpr#   r#   r$   score  s   	
zFsmnDecoder.scorer'   r(   r)   r*   r+   c                 C   s  |ddddf }d}|  |}|dur/| jr/tj||fdd}| |dddd\}}}}}|du rIt| j}	| jdurD|	t| j7 }	dg|	 }g }
t| j	D ]}| j| }|| }|j
|||||d\}}}}}|
| qP| j| j	 dkrt| j| j	 D ]$}|| j	 }| j| }|| }|j
|||||d\}}}}}|
| q| jD ]}|j
|||ddd\}}}}}q| jr| |dddf }n|dddf }| jdur| |}tj|dd}||
fS )a5  Forward one step.

        Args:
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, maxlen_out, token)
        Nrw   rx   r0   rY   )rB   rR   rb   r|   rl   r   ri   rj   rangerO   r2   appendrI   rk   r   rg   rh   log_softmax)r   r'   r(   r)   r*   rr   r+   r-   r.   cache_layer_num	new_cacheirD   cc_retjyr#   r#   r$   r2     sN   












zFsmnDecoder.forward_one_stepr8   )NNN)r9   r:   r;   r<   r   intfloatstrboolr   rb   Tensorr   r/   r   r   r2   r=   r#   r#   r!   r$   r?      s    
	
 
;
)typingr   r   loggingrb   torch.nnr   numpynpfunasr.models.scamar   rz   !funasr.models.transformer.decoderr   funasr.models.sanm.attentionr   r   #funasr.models.transformer.embeddingr   $funasr.models.transformer.layer_normr	   ,funasr.models.sanm.positionwise_feed_forwardr
   &funasr.models.transformer.utils.repeatr   funasr.registerr   Moduler   registerr?   r#   r#   r#   r$   <module>   s$    
0