o
    i3C                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ G dd de jjZG dd de jjZeddG dd deZeddG dd de jjZdS )    N)Tuple)tables)utils)repeat)	LayerNorm)PositionalEncoding)DecoderLayerSANMParaformerSANMDecoder)"PositionwiseFeedForwardDecoderSANM)MultiHeadedAttentionSANMDecoderMultiHeadedAttentionCrossAttc                       s.   e Zd Z		d fdd	Z	d	ddZ  ZS )
ContextualDecoderLayerTFc                    s   t t|   || _|| _|| _|| _t|| _|dur!t|| _	|dur*t|| _
tj|| _|| _|| _| jrPtj|| || _tj|| || _dS dS z!Construct an DecoderLayer object.N)superr   __init__size	self_attnsrc_attnfeed_forwardr   norm1norm2norm3torchnnDropoutdropoutnormalize_beforeconcat_afterLinearconcat_linear1concat_linear2)selfr   r   r   r   dropout_rater   r   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/contextual_paraformer/decoder.pyr      s"   


zContextualDecoderLayer.__init__Nc                 C   s   t |tr	|\}}|}| jr| |}| |}|}| jr"| |}| jr'd }| j|||d\}}|| | }|}	|}| jrD| 	|}| 
|||}|}
|| | }|||	|
fS )N)cache)
isinstancer   r   r   r   r   trainingr   r   r   r   )r!   tgttgt_maskmemorymemory_maskr'   _residualxx_self_attn
x_src_attnr%   r%   r&   forward5   s*   
	



zContextualDecoderLayer.forward)TFN__name__
__module____qualname__r   r3   __classcell__r%   r%   r#   r&   r      s    "r   c                       s*   e Zd Z	d fdd	ZdddZ  ZS )	ContextualBiasDecoderTc                    sD   t t|   || _|| _|durt|| _tj	|| _
|| _dS r   )r   r:   r   r   r   r   r   r   r   r   r   r   )r!   r   r   r"   r   r#   r%   r&   r   Y   s   

zContextualBiasDecoder.__init__Nc                 C   s@   |}| j d ur| jr| |}| |  |||}|||||fS r4   )r   r   r   r   )r!   r*   r+   r,   r-   r'   r0   r%   r%   r&   r3   i   s   

zContextualBiasDecoder.forward)T)NNr5   r%   r%   r#   r&   r:   X   s    r:   decoder_classesContextualParaformerDecoderc                !       s   e Zd ZdZdddddddddedd	dd
dfdedededededededededededededededef  fddZ				d)de
jd e
jd!e
jd"e
jd#e
jd$ed%ed&ee
je
jf fd'd(Z  ZS )*r<   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2006.01713
       i      g?g        embedTF   r   
vocab_sizeencoder_output_sizeattention_headslinear_units
num_blocksr"   positional_dropout_rateself_attention_dropout_ratesrc_attention_dropout_rateinput_layeruse_output_layerr   r   att_layer_numkernel_size
sanm_shfitc                    s  t  j||||
||d | |
dkrd | _|
dkr(tjtj| | _n*|
dkrKtjtj| tj tj	tj
 | || _ntd|
 | _| jr]t | _|rhtj || _nd | _|| _|| _d u r{d d t|d  	f
dd	| _tj	| _t t 	d
d| _tjj d  ddd| _t t dt 	t | _|| dkrd | _nt||  fdd	| _td fdd	| _d S )N)rA   rB   r"   rF   rI   rJ   pos_enc_classr   noner?   linearz'only 'embed' or 'linear' is supported:       c                    s0   t  t dt 	t S )NrM   )r   r   r   r
   lnum
attention_dimrC   r   r"   rL   rD   r   rM   rG   rH   r%   r&   <lambda>   s    
z6ContextualParaformerDecoder.__init__.<locals>.<lambda>T)r   r   r"   r   F)biasrS   r   c                    s(   t  t ddd t S )Nr   rS   )r   r   r
   rT   )rW   r   r"   rL   rD   r   rG   r%   r&   rX      s    
c                    s   t  d d t S r4   )r   r
   rT   )rW   r   r"   rD   r   r%   r&   rX      s    
)r   r   r?   r   r   
Sequential	Embeddingr   r   r   ReLU
ValueErrorr   
after_normoutput_layerrK   rE   r   decodersr   r:   r   bias_decoderConv1dbias_outputr   r   r
   last_decoder	decoders2	decoders3)r!   rA   rB   rC   rD   rE   r"   rF   rG   rH   rI   rJ   rN   r   r   rK   rL   rM   r#   rV   r&   r   z   s   




z$ContextualParaformerDecoder.__init__      ?hs_padhlens	ys_in_pad
ys_in_lenscontextual_info
clas_scalereturn_hiddenreturnc                 C   s  |}t j||jddddddf }	|}
t j||
jddddddf }|}| ||	|
|\}}	}
}}| ||	|
|\}}}}t|jd g 	|jd }t j||
jddddddf }| j
||	||d\}}	}}}| jdurtj||| gdd}| |dddd}|| | }| jdur| ||	|
|\}}	}
}}| ||	|
|\}}	}
}}| jr| |}|	d}| jdur|du r| |}||fS )	a@  Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        )deviceNrQ   r   r-   rR   dimF)myutilssequence_maskrp   r`   rd   r   Tensorshapeintr   ra   rc   cat	transposer   re   rf   r   r^   sumr_   )r!   rh   ri   rj   rk   rl   rm   rn   r*   r+   r,   r-   r0   r.   r1   r2   contextual_lengthcontextual_maskcxolensr%   r%   r&   r3      s2   """"




z#ContextualParaformerDecoder.forward)rg   F)r6   r7   r8   __doc__r   rx   floatstrboolr   r   rv   r   r3   r9   r%   r%   r#   r&   r<   r   s    
	
 	!ContextualParaformerDecoderExportc                
       sZ   e Zd Z			ddef fddZdd Zd	ejd
ejdejdejdejf
ddZ  Z	S )r      decoderTonnxc                    s<  t    ddlm} || _||dd| _ddlm} ddlm} ddl	m
}	 ddlm}
 t| jjD ]0\}}t|jtrC|
|j|_t|jtrO||j|_t|jtr[||j|_|	|| jj|< q3| jjd urt| jjD ]$\}}t|jtr|
|j|_t|jtr||j|_|	|| jj|< qpt| jjD ]\}}t|jtr|
|j|_|	|| jj|< q|j| _|j| _|| _t| jjjtr|| jjj| jj_| jj| _t| jjjtr|| jjj| jj_t| jjjtr|| jjj| jj_t| jjjtr|
| jjj| jj_| jj| _| jj| _| jj| _d S )	Nr   )ru   F)flip)%MultiHeadedAttentionSANMDecoderExport)"MultiHeadedAttentionCrossAttExport)DecoderLayerSANMExport)(PositionwiseFeedForwardDecoderSANMExport)r   r   funasr.utils.torch_functionru   modelmake_pad_maskfunasr.models.sanm.attentionr   r    funasr.models.paraformer.decoderr   3funasr.models.transformer.positionwise_feed_forwardr   	enumerater`   r(   r   r
   r   r   r   r   re   rf   r_   r^   
model_namera   rd   rc   r   )r!   r   max_seq_lenr   r   kwargsru   r   r   r   r   idr#   r%   r&   r   =  sd   







z*ContextualParaformerDecoderExport.__init__c                 C   sz   |d d d d d f }t |jdkr!d|d d d d d d f  }nt |jdkr5d|d d d d d f  }|d }||fS )NrR   rQ      g     )lenrw   )r!   maskmask_3d_btdmask_4d_bhltr%   r%   r&   prepare_mask  s   z.ContextualParaformerDecoderExport.prepare_maskrh   ri   rj   rk   
bias_embedc                 C   s|  |}|  |}| |\}}|}	|  |}
| |
\}}
|}| j|||	|
\}}}	}
}| |||	|
\}}}}t|jd g 	|jd }|  |}| |\}}|
ddd}| j||||d\}}}}}| jd urtj||gdd}| |
dd
dd}|| | }| jjd ur| j|||	|
\}}}	}
}| j|||	|
\}}}	}
}| |}| |}||fS )NrQ   r   rR   rq   rr   )r   r   r   r`   rd   r   rv   rw   rx   r   rz   	unsqueezera   rc   ry   r   re   rf   r^   r_   )r!   rh   ri   rj   rk   r   r*   r+   r.   r,   r-   r0   r1   r2   r|   r}   r~   r%   r%   r&   r3     s8   	

"



z)ContextualParaformerDecoderExport.forward)r   r   T)
r6   r7   r8   r   r   r   r   rv   r3   r9   r%   r%   r#   r&   r   ;  s&    G
) r   loggingnumpynptypingr   funasr.registerr   funasr.models.scamar   rt   &funasr.models.transformer.utils.repeatr   $funasr.models.transformer.layer_normr   #funasr.models.transformer.embeddingr   r   r   r	   ,funasr.models.sanm.positionwise_feed_forwardr
   r   r   r   r   Moduler   r:   registerr<   r   r%   r%   r%   r&   <module>   s&   @
 
I