o
    i&                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$ e%ddG dd dej&j'Z(e%ddG dd dej&j'Z)dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_maskpad_list)load_audio_text_image_videoextract_fbankmodel_classesFsmnKWSc                       s$  e Zd ZdZ													d(dee dee d	ed
ee dedee dedee dededededef fddZ	de
jde
jde
jde
jdee
jeee
jf e
jf f
ddZde
jde
jdee
je
jf fddZde
jd e
jd!e
jd"e
jfd#d$Z				d)d%efd&d'Z  ZS )*r   
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    N      ?h  r   specaugspecaug_conf	normalizenormalize_confencoderencoder_confctcctc_conf
ctc_weight
input_size
vocab_size	ignore_idblank_idc                    s   t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|di |}| }|d u r>i }td||d|}|| _	|| _
|| _|	| _|| _|| _|| _|| _d | _d S N)odimencoder_output_size )super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizer	   r(   r&   r'   r$   r   r   r    r"   error_calculator)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   kwargsspecaug_classnormalize_classencoder_classr+   	__class__r,   P/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/fsmn_kws/model.pyr.   "   s4   

zFsmnKWS.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s   t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}| ||||\}	}
t }|	durB|	 nd|d< |
|d< | j|	 }|
|d< t	| |d< t
|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
           Nr   loss_ctccer_ctccerloss)lensizeshapeencode_calc_ctc_lossdictdetachr$   torchcloner   device)r5   r=   r>   r?   r@   r6   
batch_sizeencoder_outencoder_out_lensrC   rD   statsrF   weightr,   r,   r<   forwardT   s"   


zFsmnKWS.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | |}|}t|trA|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r   trainingr   r    
isinstancetuple)r5   r=   r>   r6   rR   rS   r,   r,   r<   rJ   ~   s   
	



zFsmnKWS.encoderR   rS   ys_padys_pad_lensc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r"   rW   r4   argmaxdatacpu)r5   rR   rS   rZ   r[   rC   rD   ys_hatr,   r,   r<   rK      s   zFsmnKWS._calc_ctc_losskeyc                 K   sj  | d}ddlm} || j||j|jd| _i }	t|tj	rN| dddkrN||}
}t
|
jdk r>|
d d d d d f }
|d urH|d	}nQ|
jd
 }nKt }t||j| dd| dd|d}t }|| d|	d< t|| dd|d\}
}t }|| d|	d< |  |j |j d |	d< |
j|d d}
|j|d d}| |
|\}}t|tr|d }g }| dd urt| dst| d| _t|dD ]Y}||d || d d f }| j|}|d |d
 |d }}}|rd| d t| | jd || < d| d t| }nd| jd || < d}|| |d}| | q||	fS )Nkeywordsr   )KwsCtcPrefixDecoder)r"   rb   
token_listseg_dict	data_typesoundfbank   r   rB   fsi>  )rj   audio_fsrf   	tokenizerz0.3f	load_data)rf   frontendextract_feati  batch_data_timerP   )rP   
output_dirwriter   z	detected  detectrejected)ra   r?   )!r0   funasr.utils.kws_utilsrc   r"   rd   re   kws_decoderrX   rN   TensorrG   rI   squeezetimeperf_counterr   rj   r   sumitemframe_shiftlfr_ntorJ   rY   hasattrr   rr   rangerH   decodestrappend)r5   data_indata_lengthsra   rl   rn   r6   rb   rc   	meta_datar=   r>   time1audio_sample_listtime2time3rR   rS   resultsixdetect_resultis_deteddet_keyword	det_scoredet_inforesult_ir,   r,   r<   	inference   sZ   
	
$ 

"zFsmnKWS.inference)NNNNNNNNr   r   r   r   r   )NNNN)__name__
__module____qualname____doc__r   r   r   floatintr.   rN   ry   r   rV   rJ   rK   listr   __classcell__r,   r,   r:   r<   r      s    	
2
*

FsmnKWSConvertc                       sl   e Zd ZdZ								ddedee d	ed
ee dedededef fddZ	dd Z
dd Z  ZS )r   r   Nr   r   r   r   r    r!   r"   r#   r$   r%   r&   r(   c	                    st   t    tj|}
|
di |}| }|d u ri }td||d|}|| _|| _|| _	|| _
|| _d | _d S r)   )r-   r.   r   r2   r0   r3   r	   r(   r&   r$   r    r"   r4   )r5   r    r!   r"   r#   r$   r%   r&   r(   r6   r9   r+   r:   r,   r<   r.      s"   

zFsmnKWSConvert.__init__c                 C   s
   | j  S N)r    to_kaldi_net)r5   r,   r,   r<   r     s   
zFsmnKWSConvert.to_kaldi_netc                 C   s   | j |S r   )r    to_pytorch_net)r5   
kaldi_filer,   r,   r<   r     s   zFsmnKWSConvert.to_pytorch_net)NNNNr   r   r   r   )r   r   r   r   r   r   r   r   r   r.   r   r   r   r,   r,   r:   r<   r      s:    	 )*r{   rN   loggingtorch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.utils.datadir_writerr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   registernnModuler   r   r,   r,   r,   r<   <module>   s,   
 
V