o
    iI%                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$ e%ddG dd dej&j'Z(dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)	to_device)DatadirWriter)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_mask)load_audio_text_image_videoextract_fbankmodel_classesSanmKWSc                       s8  e Zd ZdZ															d.d	ee d
ee dedee dedee dedee dededededededef fddZ	de
jde
jde
jde
jdee
jeee
jf e
jf f
dd Zde
jde
jdee
je
jf fd!d"Zd#e
jd$e
jd%e
jd&e
jfd'd(Z				d/d)efd*d+Zd,d- Z  ZS )0r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    N      ?h  r         specaugspecaug_conf	normalizenormalize_confencoderencoder_confctcctc_conf
ctc_weight
input_size
vocab_size	ignore_idblank_idsoseosc                    s   t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|
i|}| }|d u r@i }td||d|}|| _	|d urS|n|d | _
|d ur^|n|d | _|| _|| _|	| _|| _|| _|| _|| _d | _d S )Nr&   )odimencoder_output_sizer    )super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizer	   r)   r*   r+   r'   r(   r%   r   r   r!   r#   error_calculator)selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   kwargsspecaug_classnormalize_classencoder_classr-   	__class__r.   P/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sanm_kws/model.pyr0   #   s0   

zSanmKWS.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s   t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}| ||||\}	}
t }|	durB|	 nd|d< |
|d< |
|d< |	}t| |d< ||d< t	|||f|j
\}}}|||fS )	zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   loss_ctccer_ctccerloss
batch_size)lensizeshapeencode_calc_ctc_lossdictdetachtorchcloner   device)r7   r?   r@   rA   rB   r8   rH   encoder_outencoder_out_lensrD   rE   statsrG   weightr.   r.   r>   forwardZ   s$   

zSanmKWS.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}t|trC|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r   trainingr   r!   
isinstancetuple)r7   r?   r@   r8   rS   rT   _r.   r.   r>   rL      s   



zSanmKWS.encoderS   rT   ys_padys_pad_lensc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r#   rX   r6   argmaxdatacpu)r7   rS   rT   r\   r]   rD   rE   ys_hatr.   r.   r>   rM      s   zSanmKWS._calc_ctc_losskeyc                 K   s~  | d}ddlm} || j||j|jd| _i }	t|tj	rN| dddkrN||}
}t
|
jdk r>|
d d d d d f }
|d urH|d	}nQ|
jd
 }nKt }t||j| dd| dd|d}t }|| d|	d< t|| dd|d\}
}t }|| d|	d< |  |j |j d |	d< |
j|d d}
|j|d d}| ddr|
 }
| |
|\}}t|tr|d }g }| dd urt| dst| d| _t|dD ]Y}||d || d d f }| j|}|d |d
 |d }}}|r#d| d t | | jd || < d| d t | }nd| jd || < d}|| |d}|!| q||	fS ) Nkeywordsr   )KwsCtcPrefixDecoder)r#   rd   
token_listseg_dict	data_typesoundfbank   r   r   fsi>  )rl   audio_fsrh   	tokenizerz0.3f	load_data)rh   frontendextract_feati  batch_data_timerR   )rR   fp16F
output_dirwriterr   z	detected  detectrejected)rc   rA   )"r2   funasr.utils.kws_utilsre   r#   rf   rg   kws_decoderrY   rP   TensorrI   rK   squeezetimeperf_counterr   rl   r   sumitemframe_shiftlfr_ntohalfrL   rZ   hasattrr   ru   rangerJ   decodestrappend)r7   data_indata_lengthsrc   rn   rp   r8   rd   re   	meta_datar?   r@   time1audio_sample_listtime2time3rS   rT   resultsixdetect_resultis_deteddet_keyword	det_scoredet_inforesult_ir.   r.   r>   	inference   st   
	






"zSanmKWS.inferencec                 K   s2   ddl m} d|vrd|d< |dd| i|}|S )Nr   )export_rebuild_modelmax_seq_leni   modelr.   )export_metar   )r7   r8   r   modelsr.   r.   r>   export  s
   zSanmKWS.export)NNNNNNNNr   r   r   r   r   r   r   )NNNN)__name__
__module____qualname____doc__r   r   r   floatintr0   rP   r{   r   rW   rL   rM   listr   r   __classcell__r.   r.   r<   r>   r      s    	
7
,


Q))r}   rP   loggingtorch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.datadir_writerr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   r   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   funasr.utils.load_utilsr   r   registernnModuler   r.   r.   r.   r>   <module>   s(   
