o
    i,                     @   sZ  d dl mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
Z
d dlZd d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% eej&edkrd dl'm(Z( nedddZ(d dl)Z)d dl*Z*d dl+Z+G dd de%Z,dS )    )contextmanager)LooseVersion)Dict)List)Optional)Tuple)UnionN)ErrorCalculator)th_accuracy)add_sos_eos)LabelSmoothingLoss)CTC)
AbsDecoder)
AbsEncoder)AbsFrontend)AbsPreEncoder)
AbsSpecAug)AbsNormalize)force_gatherable)FunASRModelz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/mfcca/e2e_asr_mfcca.pyr      s   
r   c                (       s  e Zd ZdZ												d7d
edeeedf ee f de	e
 de	e de	e dedededddededededededededede	e f& fddZd ejd!ejd"ejd#ejd$eejeeejf ejf f
d%d&Zd ejd!ejd"ejd#ejd$eeejf f
d'd(Zd ejd!ejd$eejejf fd)d*Zd ejd!ejd$eejejf fd+d,Zd-ejd.ejd/ejd0ejfd1d2Zd-ejd.ejd/ejd0ejfd3d4Zd-ejd.ejd/ejd0ejfd5d6Z  ZS )8MFCCAz
    Author: Audio, Speech and Language Processing Group (ASLP@NPU), Northwestern Polytechnical University
    MFCCA:Multi-Frame Cross-Channel attention for multi-speaker ASR in Multi-party meeting scenario
    https://arxiv.org/abs/2210.05265
    N      ?        FT<space><blank>
vocab_size
token_list.frontendspecaug	normalizeencoderdecoderctcrnnt_decoder
ctc_weight	ignore_id
lsm_weight
mask_ratiolength_normalized_loss
report_cer
report_wer	sym_space	sym_blank
preencoderc                    s   d|
  krdksJ |
 J |
|	d u sJ dt    |d | _|d | _|| _|| _|
| _| | _|| _	|| _
|| _|| _|| _|| _|
dkrQd | _n|| _|
dkr\d | _n|| _|	| _t||||d| _|so|rzt|||||| _d S d | _d S )Nr         ?zNot implemented   )sizepadding_idx	smoothingnormalize_length)super__init__soseosr!   r+   r*   copyr"   r-   r#   r$   r%   r3   r&   r'   r(   r)   r   criterion_attr	   error_calculator)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   	__class__r   r   r;   0   sB   $






zMFCCA.__init__speechspeech_lengthstexttext_lengthsreturnc              	   C   s<  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|  dkr|ddkr| jdkrt }|| jkrtt d }|dkrq|ddddtdd| 	 j
f }n|ddddtdd f }|jd }|ddd| f }| ||\}}	| jdkrd\}
}}}n| ||	||\}
}}}| jd	krd
\}}n
| ||	||\}}| jdur| ||	||}| jd	kr|
}n| jdkr|}n| j| d| j |
  }t| |
dur|
 nd|dur| nd||||d}t|||f|j\}}}|||fS )zFrontend + Encoder + Decoder + Calc loss
        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
        r5   r            Nr4   )NNNNr   NN)lossloss_attloss_ctcacccerwercer_ctc)dimshaper6   r-   randommathceiltorchrandpermsortvaluesmaxencoder*   _calc_att_loss_calc_ctc_lossr)   _calc_rnnt_lossdictdetachr   device)rA   rD   rE   rF   rG   rate_numretain_channel
batch_sizeencoder_outencoder_out_lensrN   acc_attcer_attwer_attrO   rS   _rM   statsweightr   r   r   forwardt   sT   :$
, 







zMFCCA.forwardc                 C   s   |  ||\}}}||dS )N)featsfeats_lengths)_extract_feats)rA   rD   rE   rF   rG   rq   rr   channel_sizer   r   r   collect_feats   s   
zMFCCA.collect_featsc           	      C   s4  t d. | ||\}}}| jdur| jr| ||\}}| jdur+| ||\}}W d   n1 s5w   Y  | jdurG| ||\}}| |||\}}}|d|dksfJ | |df| dkr|d|	 ksJ | |	 f||fS |d|	 ksJ | |	 f||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FNr      rJ   r5   )
r   rs   r$   trainingr%   r3   r&   r6   rT   r]   )	rA   rD   rE   rq   rr   rt   rh   ri   rm   r   r   r   r^      s4   



zMFCCA.encodec                 C   sd   |  dksJ |j|d d d | f }| jd ur&| ||\}}}n||}}d}|||fS )Nr5   )rT   rU   r]   r#   )rA   rD   rE   rq   rr   rt   r   r   r   rs      s   


zMFCCA._extract_featsrh   ri   ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr5   r   )ignore_labelrL   )rT   )r   r<   r=   r+   r'   r?   r
   viewr!   rw   r@   argmaxcpu)rA   rh   ri   rx   ry   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outrm   rN   rj   rk   rl   ys_hatr   r   r   r_     s   
zMFCCA._calc_att_lossc                 C   sh   |  dkr|d}| ||||}d }| js0| jd ur0| j|j}| j| | dd}||fS )Nrv   r5   T)is_ctc)rT   meanr(   rw   r@   r|   datar}   )rA   rh   ri   rx   ry   rO   rS   r   r   r   r   r`   &  s   
zMFCCA._calc_ctc_lossc                 C   s   t r   )NotImplementedError)rA   rh   ri   rx   ry   r   r   r   ra   9  s   zMFCCA._calc_rnnt_loss)Nr   r   r   r   FTTr   r    N)__name__
__module____qualname____doc__intr   r   strr   r   r   r   r   r   r   r   floatboolr   r;   rY   Tensorr   rp   ru   r^   rs   r_   r`   ra   __classcell__r   r   rB   r   r   )   s    	
D
N


*


r   )T)-
contextlibr   distutils.versionr   typingr   r   r   r   r   loggingrY   funasr.metricsr	   funasr.metrics.compute_accr
   +funasr.models.transformer.utils.add_sos_eosr   "funasr.losses.label_smoothing_lossr   funasr.models.ctcr   !funasr.models.decoder.abs_decoderr   !funasr.models.encoder.abs_encoderr   funasr.frontends.abs_frontendr   'funasr.models.preencoder.abs_preencoderr   !funasr.models.specaug.abs_specaugr   funasr.layers.abs_normalizer   funasr.train_utils.device_funcsr   funasr.models.base_modelr   __version__torch.cuda.ampr   pdbrV   rW   r   r   r   r   r   <module>   s<    