o
    iq.                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$ e%ddG dd dej&j'Z(e%ddG dd dej&j'Z)dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_maskpad_list)load_audio_text_image_videoextract_fbankmodel_classes	FsmnKWSMTc                       sF  e Zd ZdZddddddddg ddfdee dee ded	ee d
edee dee dedededef fddZ	de
jde
jde
jde
jde
jde
jdee
jeee
jf e
jf fddZde
jde
jdee
je
jf fddZde
jde
jd e
jd!e
jfd"d#Zde
jde
jd e
jd!e
jfd$d%Z				d)d&efd'd(Z  ZS )*r   
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    Nh  r   specaugspecaug_conf	normalizenormalize_confencoderencoder_confctc_conf
input_size
vocab_size	ignore_idblank_idc                    s   t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|di |}| }| }t	d|	d |d|}t	d|	d |d|}|| _
|
| _|| _|| _|| _|| _|| _d | _d S )Nr   )odimencoder_output_size    )super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizeoutput_size2r	   r%   r$   r   r   r   ctcctc2error_calculator)selfr   r   r   r   r   r    r!   r"   r#   r$   r%   kwargsspecaug_classnormalize_classencoder_classr'   encoder_output_size2r2   r3   	__class__r)   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/fsmn_kws_mt/model.pyr+   "   s:   

zFsmnKWSMT.__init__speechspeech_lengthstexttext_lengthstext2text2_lengthsreturnc                 K   s  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}	}
}| |	|||\}}| |
|||\}}t }|durM| nd|d< ||d< |dur]| nd|d< ||d< d| d|  }||d	< ||d
< t| |d< t	|||f|j
\}}}|||fS )a'  Encoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
                text2: (Batch, Length)
                text2_lengths: (Batch,)
        r(   Nr   loss_ctccer_ctc	loss_ctc2cer_ctc2g      ?cercer2loss)lensizeshapeencode_calc_ctc_lossdictdetachtorchcloner   device)r5   r>   r?   r@   rA   rB   rC   r6   
batch_sizeencoder_outencoder_out2encoder_out_lensrE   rF   rG   rH   statsrK   weightr)   r)   r=   forwardS   s.   

zFsmnKWSMT.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | |\}}|}t|trC|d }t|trL|d }|||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r   trainingr   r   
isinstancetuple)r5   r>   r?   r6   rW   rX   rY   r)   r)   r=   rO      s   





zFsmnKWSMT.encoderW   rY   ys_padys_pad_lensc                 C   R   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS NT)is_ctc)r2   r]   r4   argmaxdatacpur5   rW   rY   r`   ra   rE   rF   ys_hatr)   r)   r=   rP         zFsmnKWSMT._calc_ctc_lossc                 C   rb   rc   )r3   r]   r4   re   rf   rg   rh   r)   r)   r=   _calc_ctc2_loss   rj   zFsmnKWSMT._calc_ctc2_losskeyc           "      K   s>  | d}ddlm} || j||d j|d jd| _|| j||d j|d jd| _i }	t	|t
jrb| dddkrb||}
}t|
jd	k rR|
d d d d d f }
|d ur\|d
}nQ|
jd }nKt }t||j| dd| dd|d}t }|| d|	d< t|| dd|d\}
}t }|| d|	d< |  |j |j d |	d< |
j|d d}
|j|d d}| |
|\}}}t	|tr|d }t	|tr|d }g }| dd urt| dst| d| _t|dD ]}||d || d d f }| j |}|d |d |d }}}|r7d| d t!| | jd || < d| d t!| }nd| jd || < d}||d || d d f }| j |}|d |d |d }}}|rd| d t!| | jd || < d| d t!| } nd| jd || < d} || || d}!|"|! q||	fS )Nkeywordsr   )KwsCtcPrefixDecoder)r2   rm   
token_listseg_dictr(   	data_typesoundfbank   r   fsi>  )ru   audio_fsrq   	tokenizerz0.3f	load_data)rq   frontendextract_feati  batch_data_timerU   )rU   
output_dirwriter   z	detected  detectrejecteddetect2)rl   r@   rB   )#r-   funasr.utils.kws_utilsrn   r2   ro   rp   kws_decoderr3   kws_decoder2r^   rS   TensorrL   rN   squeezetimeperf_counterr   ru   r   sumitemframe_shiftlfr_ntorO   r_   hasattrr   r}   rangerM   decodestrappend)"r5   data_indata_lengthsrl   rw   ry   r6   rm   rn   	meta_datar>   r?   time1audio_sample_listtime2time3rW   rX   rY   resultsixdetect_resultis_deteddet_keyword	det_scoredet_infox2detect_result2	is_deted2det_keyword2
det_score2	det_info2result_ir)   r)   r=   	inference   s   
	




 


""zFsmnKWSMT.inference)NNNN)__name__
__module____qualname____doc__r   r   r   intlistr+   rS   r   r   r\   rO   rP   rk   r   __classcell__r)   r)   r;   r=   r      s    	
1	
4
!

FsmnKWSMTConvertc                       sh   e Zd ZdZ						ddedee dee d	ed
edef fddZ	dd Z
dd Zdd Z  ZS )r   r   N      ?r   r   r   r    r!   
ctc_weightr"   r%   c           
         sB   t    tj|}|di |}| }	|| _|| _d | _d S )Nr)   )	r*   r+   r   r/   r-   r0   r%   r   r4   )
r5   r   r    r!   r   r"   r%   r6   r9   r'   r;   r)   r=   r+   6  s   


zFsmnKWSMTConvert.__init__c                 C   
   | j  S N)r   to_kaldi_netr5   r)   r)   r=   r   J     
zFsmnKWSMTConvert.to_kaldi_netc                 C   r   r   )r   to_kaldi_net2r   r)   r)   r=   r   M  r   zFsmnKWSMTConvert.to_kaldi_net2c                 C   s   | j |S r   )r   to_pytorch_net)r5   
kaldi_filer)   r)   r=   r   P  s   zFsmnKWSMTConvert.to_pytorch_net)NNNr   r   r   )r   r   r   r   r   r   r   floatr   r+   r   r   r   r   r)   r)   r;   r=   r   .  s0    )*r   rS   loggingtorch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.utils.datadir_writerr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   registernnModuler   r   r)   r)   r)   r=   <module>   s.   
  
