o
    i?                     @   s2  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% e	ej&e	dkrd dl'm(Z( nedddZ(e)ddG dd deZ*dS )    N)DictTuple)contextmanager)LooseVersion)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)SanmKWS)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_maskpad_list)load_audio_text_image_videoextract_fbankz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sanm_kws_streaming/model.pyr       s   
r   model_classesSanmKWSStreamingc                       s   e Zd ZdZ fddZdejdejdejdejdeejee	ejf ejf f
d	d
Z
	ddejdejdedeejejf fddZi fdefddZ				ddefddZddddi fdedefddZdd Z  ZS )r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    c                    s   t  j|i | d S r   )super__init__)selfargskwargs	__class__r   r   r   -   s   zSanmKWSStreaming.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   sB  | d}t| dkr|dddf }t| dkr%|dddf }|jd }t| jdrD| jj| j|}| j	|||d\}	}
n| 	||\}	}
t| jdr_| jjj
|	|
dd\}}n|	|
}}| ||||\}}t }|dury| nd|d< ||d	< |}||d
< t| |d< t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        decoding_ind   Nr   overlap_chunk_cls)ind)
chunk_outsloss_ctccer_ctccerloss)getlensizeshapehasattrencoderr*   random_choicetrainingencoderemove_chunk_calc_ctc_lossdictdetachtorchcloner   device)r   r#   r$   r%   r&   r    r(   
batch_sizer+   encoder_outencoder_out_lensencoder_out_ctcencoder_out_lens_ctcr-   r.   statsr0   weightr   r   r   forward4   s4   



zSanmKWSStreaming.forwardNcachec                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | jj|||d d\}}}t|trH|d }|t	|
dgfS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr6   )rI   r   r)   )r   specaugr8   	normalizer6   forward_chunk
isinstancetupler>   tensorr3   )r   r#   r$   rI   r    rB   rC   _r   r   r   encode_chunkl   s   




zSanmKWSStreaming.encode_chunkc                 K   s   | dg d}| dd}| dd}d}|d d }|d	 d
 |d	 d  }dt|d|ft|dfd d ||dd t||d |d  |fdd}	|	|d< d |d |d}
|
|d< i |d< td|d< |S )N
chunk_size)r   
      encoder_chunk_look_backr   decoder_chunk_look_backr)   encoder_confoutput_sizefrontend_confn_melslfr_mF   )	start_idx
cif_hidden
cif_alphasrB   rC   rR   rU   
last_chunkoptfeats
tail_chunkr6   )decode_fsmnrV   ra   rR   decoderfrontendprev_samples)r1   r>   zerosempty)r   rI   r    rR   rU   rV   rA   enc_output_size
feats_dimscache_encodercache_decoderr   r   r   
init_cache   s8   zSanmKWSStreaming.init_cachekeyc                 K   s  | di }|j|d d}|j|d d}| dd}| j||||d\}	}
t|	tr0|	d }	|d d	 }|d }|
d |d |d
  |d  krPJ td|
d |d |d
  |d  kri|d |d
  }n(|
d |d |d
  kr~|d |d
  }n|
d |d kr|
d }nJ td|d d }|d urtj||	d d ||d d f fd
d}n|	d d ||d d f }||d d< |d d d ur|d d d  || 7  < n|
|d d< || |d d d< |rm| dd urt| dst	| d| _
g }t|dD ]^}||d |d d | d d f }| j|}|d |d
 |d }}}|rRd| d t| | j
d || < d| d t| }nd| j
d || < d}|| |d}|| q|S d S )NrI   r@   )r@   is_finalF)rI   rp   r   r6   rR   r)   r\   zimpossible case 1 !zimpossible case 2 !rB   )dimrC   
output_dirwriterz	detected  detectrejected)ro   r%   )r1   torQ   rM   rN   printr>   catr5   r
   rs   ranger3   kws_decoderdecodestrappend)r   r#   r$   ro   	tokenizerrf   r    rI   rp   rB   rC   rR   real_start_posreal_end_posencoder_out_accumresultsixdetect_resultis_deteddet_keyword	det_scoredet_inforesult_ir   r   r   generate_chunk   s^   	

  
*""zSanmKWSStreaming.generate_chunkc           "      K   s  | d}ddlm}	 |	| j||j|jd| _i }
|d }t|d d }t|d d }t|dkr<| j	|fi | t
 }d	| d	d
i}t||j| dd| dd||d}|d	 }t
 }|| d|
d< t|dksuJ dt|d |d f}t||k rtd|t| |d| }t|g| dd||d d
d\}}||d  }tt|| }t|D ]}|dkr||d d d d |d d d d f< d
|d	< ||| |d |  }|d	 rt|dk rd|d d< |d d }tj|jd gtjd|j}nt|g| dd||d |d	 d\}}t
 }|| d|
d< |  |j |j d |
d< | j||f||||d|}|d u sRJ q||| d  }t|dk r||d	< d|d d< |d d }tj|jd gtjd|j}| j||f||||d|}nt||kr||d	< t|g| dd||d |d	 d\}}| j||f||||d|}nkt||kr,t||k r,d
|d	< t|g| dd||d |d	 d\}}| j||f||||d|} | d u sJ ||d	< d|d d< |d d }tj|jd gtjd|j}| j||f||||d|}|}!|r:| j	|fi | | d rNt| d!sNt| d | _ |!|
fS )"Nkeywordsr   )KwsCtcPrefixDecoder)ctcr   
token_listseg_dictrR   r)   i  r\   rp   Ffsi>  	data_typesound)r   audio_fsr   r   rI   z0.3f	load_datazbatch_size must be set 1rg   z,key: {}, audio is too short for inference {}rf   )r   rf   rI   rp   r6   rb   Trc   )dtypeextract_feati  batch_data_time)ro   r   rI   rf   rr   rs   )!r1   funasr.utils.kws_utilsr   r   r   r   r{   intr2   rn   timeperf_counterr   r   r>   ry   rx   formatr   rz   rO   r4   int64rw   r@   sumitemframe_shiftlfr_nr   r5   r
   rs   )"r   data_indata_lengthsro   r   rf   rI   r    r   r   	meta_datarR   chunk_stride_samplesfirst_chunk_padding_samplestime1cfgaudio_sample_list	_is_finaltime2audio_sampleaudio_sample_prefeat_prefeat_pre_lengthsaudio_chunksr   audio_sample_ir#   r$   time3results_chunk_itail_audio_sampleresults_chunk_tailresults_chunkresultr   r   r   	inference   s@  





&

	

	



zSanmKWSStreaming.inferencec                 K   s"   ddl m} |dd| i|}|S )Nr)   )export_rebuild_modelmodelr   )export_metar   )r   r    r   modelsr   r   r   export  s   zSanmKWSStreaming.exportr   )NNNN)__name__
__module____qualname____doc__r   r>   Tensorr   r   r}   rH   r<   rQ   rn   listr   r   r   __classcell__r   r   r!   r   r   %   sX    
<
 &
L
 ?)T)+r   r>   loggingtypingr   r   
contextlibr   distutils.versionr   funasr.registerr   funasr.models.ctc.ctcr   funasr.utilsr   funasr.metrics.compute_accr	   funasr.utils.datadir_writerr
   funasr.models.sanm_kws.modelr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   __version__torch.cuda.ampr   registerr   r   r   r   r   <module>   s2   
