o
    i                     @   sH  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZmZ G d	d
 d
e	ZG dd de	ZG dd de	ZG dd de	ZG dd dZG dd deZG dd deZ G dd deZ!G dd deZ"e#ddG dd dej$Z%dS )    N)nn)Enum)	dataclass)tables)ListTupleDictAnyOptional)DatadirWriter)load_audio_text_image_videoextract_fbankc                   @      e Zd ZdZdZdZdS )VadStateMachine         N)__name__
__module____qualname__ kVadInStateStartPointNotDetectedkVadInStateInSpeechSegmentkVadInStateEndPointDetected r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/fsmn_vad_streaming/model.pyr          r   c                   @   r   )
FrameStater   r   N)r   r   r   kFrameStateInvalidkFrameStateSpeechkFrameStateSilr   r   r   r   r      r   r   c                   @   s$   e Zd ZdZdZdZdZdZdZdS )AudioChangeStater   r   r   r         N)	r   r   r   kChangeStateSpeech2SpeechkChangeStateSpeech2SilkChangeStateSil2SilkChangeStateSil2SpeechkChangeStateNoBeginkChangeStateInvalidr   r   r   r   r!   #   s    r!   c                   @   s   e Zd ZdZdZdS )VadDetectModer   r   N)r   r   r   kVadSingleUtteranceDetectModekVadMutipleUtteranceDetectModer   r   r   r   r*   ,   s    r*   c                ;   @   s   e Zd ZdZdejjddddddddd	d
ddddddddddd
dgdddddfdededededededededed ed!e	d"ed#ed$ed%ed&ed'ed(ed)ed*ed+e	d,e	d-ed.e
e d/e	d0e	d1ed2ed3ef:d4d5Zd6S )7VADXOptions
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
    https://arxiv.org/abs/1803.05030
    >  r   i   i  T            ?r   d   `     r"         Yg333333?g-C6?gg333333?F
      sample_ratedetect_modesnr_modemax_end_silence_timemax_start_silence_timedo_start_point_detectiondo_end_point_detectionwindow_size_mssil_to_speech_time_thresspeech_to_sil_time_thresspeech_2_noise_ratio	do_extendlookback_time_start_pointlookahead_time_end_pointmax_single_segment_timenn_eval_block_sizedcd_block_size	snr_thresnoise_frame_num_used_for_snrdecibel_thresspeech_noise_thresfe_prior_thressilence_pdf_numsil_pdf_idsspeech_noise_thresh_lowspeech_noise_thresh_highoutput_frame_probsframe_in_msframe_length_msc                 K   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d S N)r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   )selfr9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   kwargsr   r   r   __init__8   s:   !
zVADXOptions.__init__N)r   r   r   __doc__r*   r,   valueintboolfloatr   rY   r   r   r   r   r-   1   s    	
r-   c                   @   s    e Zd ZdZdd Zdd ZdS )E2EVadSpeechBufWithDoar.   c                 C   (   d| _ d| _g | _d| _d| _d| _d S Nr   Fstart_msend_msbuffercontain_seg_start_pointcontain_seg_end_pointdoarW   r   r   r   rY         
zE2EVadSpeechBufWithDoa.__init__c                 C   r`   ra   rb   ri   r   r   r   Reset   rj   zE2EVadSpeechBufWithDoa.ResetN)r   r   r   rZ   rY   rk   r   r   r   r   r_   x   s    r_   c                   @   s   e Zd ZdZdd ZdS )E2EVadFrameProbr.   c                 C   s"   d| _ d| _d| _d| _d| _d S )N        r   )
noise_probspeech_probscoreframe_id	frm_stateri   r   r   r   rY      s
   
zE2EVadFrameProb.__init__N)r   r   r   rZ   rY   r   r   r   r   rl      s    rl   c                	   @   sn   e Zd ZdZdedededefddZdd
dZdefddZi fdedede	de
fddZdefddZd	S )WindowDetectorr.   r@   sil_to_speech_timespeech_to_sil_timeframe_size_msc                 C   s   || _ || _|| _|| _t|| | _d| _dg| j | _d| _t	j
| _t	j
| _t|| | _t|| | _d| _d| _d| _d S Nr   )r@   rt   ru   rv   r\   win_size_framewin_sum	win_statecur_win_posr   r    pre_frame_statecur_frame_statesil_to_speech_frmcnt_thresspeech_to_sil_frmcnt_thresvoice_last_frame_countnoise_last_frame_counthydre_frame_count)rW   r@   rt   ru   rv   r   r   r   rY      s   
zWindowDetector.__init__returnNc                 C   s@   d| _ d| _dg| j | _tj| _tj| _d| _d| _	d| _
d S rw   )r{   ry   rx   rz   r   r    r|   r}   r   r   r   ri   r   r   r   rk      s   
zWindowDetector.Resetc                 C   
   t | jS rV   )r\   rx   ri   r   r   r   
GetWinSize      
zWindowDetector.GetWinSize
frameStateframe_countcachec                 C   s   t j}|t jkrd}n|t jkrd}ntjS |  j| j| j 8  _|  j|7  _|| j| j< | jd | j | _| j	t jkrJ| j| j
krJt j| _	tjS | j	t jkr]| j| jkr]t j| _	tjS | j	t jkrftjS | j	t jkrotjS tjS )Nr   r   )r   r    r   r!   r)   ry   rz   r{   rx   r|   r~   r'   r   r%   r&   r$   )rW   r   r   r   r}   r   r   r   DetectOneFrame   s.   

zWindowDetector.DetectOneFramec                 C   r   rV   )r\   rv   ri   r   r   r   FrameSizeMs   r   zWindowDetector.FrameSizeMs)r   N)r   r   r   rZ   r\   rY   rk   r   r   dictr!   r   r   r   r   r   r   rs      s0    



#rs   c                   @   s   e Zd Zdd ZdS )Statsc                 C   s   d| _ d| _d| _d| _d| _tj| _d| _d| _	d| _
d| _|| _d| _d| _d| _g | _d| _g | _|| _|| _d | _d| _g | _d | _d | _d | _d| _d S )Nr   r   r6   FT)data_buf_start_framefrm_cntlatest_confirmed_speech_framelastest_confirmed_silence_framecontinous_silence_frame_countr   r   vad_state_machineconfirmed_start_frameconfirmed_end_framenumber_end_time_detected	sil_framerP   noise_average_decibelpre_end_silence_detectednext_segoutput_data_bufoutput_data_buf_offsetframe_probsmax_end_sil_frame_cnt_threshrM   scoresmax_time_outdecibeldata_bufdata_buf_allwaveformlast_drop_frames)rW   rP   r   rM   r   r   r   rY      s4   
zStats.__init__N)r   r   r   rY   r   r   r   r   r      s    r   model_classesFsmnVADStreamingc                       sF  e Zd ZdZ			dFdedee deeef f fddZi fde	fd	d
Z
i fde	ddfddZi fdejde	ddfddZi fdede	ddfddZi fdededededede	ddfddZi fdede	fddZi fdede	ddfddZd i fd!ed"ede	ddfd#d$Zi fd%ed"ed&ede	ddf
d'd(Zi fd)ed*ede	ddfd+d,Zi fde	defd-d.Zi fde	defd/d0Zi fd1ede	fd2d3Zi d fdejd4ejde	d5efd6d7Zi fde	fd8d9Z					dGd:ede	fd;d<Zd=d> Z i fde	defd?d@Z!i fde	defdAdBZ"i fdCe#d*ed)ede	ddf
dDdEZ$  Z%S )Hr   r.   Nencoderencoder_confvad_post_argsc                    sD   t    tdi || _tj|}|di |}|| _|| _d S )Nr   )	superrY   r-   vad_optsr   encoder_classesgetr   r   )rW   r   r   r   rX   encoder_class	__class__r   r   rY      s   

zFsmnVADStreaming.__init__r   c                 C   s*  d|d _ d|d _d|d _d|d _d|d _tj|d _|d   d|d _	g |d _
|d jr|d jd jdks@J t|d jd j| jj }||d j }||d _|d j|t| jj| jj d  d  |d _|d j|d  |d _|d jd d |d d d f |d _d S d S )Nr   statsr   windows_detectorT  )r   r   r   r   r   r   r   r   rk   r   r   r   rg   r\   rd   r   rT   r   r   r9   r   r   )rW   r   drop_framesreal_drop_framesr   r   r   ResetDetection/  s,   









*zFsmnVADStreaming.ResetDetectionr   c                 C   s  t | jj| jj d }t | jj| jj d }|d jd u r2|d jd |d _|d j|d _nt	|d j|d jd f|d _|d j
 }td|jd | d |}|d|d d tjf t| f }dttjt|ddd  }| }|d j| d S )Nr   r   r   r   r7   )axisgư>)r\   r   rU   r9   rT   r   r   r   torchcatnumpynparangeshapenewaxislog10sumsquaretolistr   extend)rW   r   frame_sample_lengthframe_shift_lengthwaveform_numpyoffsetsframesdecibel_numpyr   r   r   ComputeDecibelF  s    

$"zFsmnVADStreaming.ComputeDecibelfeatsc                 C   s   | j ||d dd}|jd |jd ksJ d|jd | j_|d  j|jd 7  _|d jd u r;||d _d S tj|d j|fdd|d _d S )Nr   r   cpur   z1The shape between feats and scores does not matchr   )dim)	r   tor   r   rH   r   r   r   r   )rW   r   r   r   r   r   r   ComputeScores^  s   "zFsmnVADStreaming.ComputeScores	frame_idxc                 C   s   |d j |k rLt|d jt| jj| jj d krC|d  j d7  _ |d j|d j |d j t| jj| jj d  d  |d _|d j |k sd S d S )Nr   r   r   )	r   lenr   r\   r   rT   r9   r   r   )rW   r   r   r   r   r   PopDataBufTillFramej  s   
z$FsmnVADStreaming.PopDataBufTillFrame	start_frmr   first_frm_is_start_pointlast_frm_is_end_pointend_point_is_sent_endc                 C   s  | j ||d t|| jj | jj d }|r6tdt| jj| jj d | jj| jj d  }|t|7 }|rBt|t|d j}t|d j|k rOt	d t|d j
dksZ|r|d j
t  |d j
d   || jj |d j
d _|d j
d j|d j
d _d|d j
d _|d j
d }	|	j|| jj krt	d d}
|r|}
nt|| jj | jj d }
|
t|d jkrt	d t|d j}
t|d j}d|	_|	j|| jj krt	d	 |d  j|7  _|| | jj |	_|rd
|	_|rd
|	_d S d S )Nr   r   r   r   zerror in calling pop data_buf
r   zwarning
zAVAD data_to_pop is bigger than cache["stats"].data_buf.size()!!!
z'Something wrong with the VAD algorithm
T)r   r\   r   r9   rT   maxrU   r   r   printr   appendr_   rk   rc   rd   rh   r   rf   rg   )rW   r   r   r   r   r   r   expected_sample_numberextra_samplecur_segdata_to_popr   r   r   PopDataToOutputBufu  s^   	
z#FsmnVADStreaming.PopDataToOutputBufvalid_framec                 C   s0   ||d _ |d jtjkr| j||d d S d S )Nr   r   )r   r   r   r   r   rW   r   r   r   r   r   OnSilenceDetected  s   
z"FsmnVADStreaming.OnSilenceDetectedc                 C   s$   ||d _ | j|dddd|d d S )Nr   r   Fr   )r   r   r   r   r   r   OnVoiceDetected  s   
z FsmnVADStreaming.OnVoiceDetectedFstart_framefake_resultc                 C   sh   | j jr	 |d jdkrtd n||d _|s0|d jtjkr2| j|d jdddd|d d S d S d S )Nr   r   not reset vad properly
r   TFr   )r   r>   r   r   r   r   r   r   )rW   r   r   r   r   r   r   OnVoiceStart  s   


zFsmnVADStreaming.OnVoiceStart	end_frameis_last_framec                 C   s   t |d jd |D ]	}| j||d q
| jjr	 |d jdkr%td n||d _|s?d|d _| j|d jddd||d |d  j	d7  _	d S )	Nr   r   r   r   r   r   FT)
ranger   r   r   r?   r   r   r   r   r   )rW   r   r   r   r   tr   r   r   
OnVoiceEnd  s   


zFsmnVADStreaming.OnVoiceEndis_final_framecur_frm_idxc                 C   s*   |r| j |dd|d tj|d _d S d S )NFTr   r   )r   r   r   r   )rW   r   r   r   r   r   r   MaybeOnVoiceEndIfLastFrame  s   z+FsmnVADStreaming.MaybeOnVoiceEndIfLastFramec                 C   s   t | j|d| jj S )Nr   )r\   LatencyFrmNumAtStartPointr   rT   )rW   r   r   r   r   
GetLatency  s   zFsmnVADStreaming.GetLatencyc                 C   s0   |d   }| jjr|t| jj| jj 7 }|S )Nr   )r   r   rD   r\   rE   rT   )rW   r   vad_latencyr   r   r   r     s   z*FsmnVADStreaming.LatencyFrmNumAtStartPointr   c                    s  t j} d j }| d j }|| jjk r%t j}| j|d d |S d}d}t d j	| jj
ks6J t d j	dkrt d jdksJJ 	 t d j	dkrdt fdd d j	D }n d jd   d j	d   }t|| jj }d	}|| }t|}	| jjrt }
||
_|	|
_||
_|
_ d j|
 t|	t| d j kr|| jjkr|| jjkrt j}|S t j}|S t j} d jd
k r| d _|S | d j| jjd   | jj  d _|S )Nr   Fr   rm   r   r   c                 3   s*    | ]} d  j d  |  V  qdS )r   r   N)r   item).0
sil_pdf_idr   r   r   r   	<genexpr>  s   ( z1FsmnVADStreaming.GetFrameState.<locals>.<genexpr>r2   gX)r   r   r   r   r   rL   r    r   r   rP   rO   r   r   r   mathlogrC   rS   rl   rn   ro   rp   rq   r   r   exprM   rJ   r   rK   )rW   r   r   frame_statecur_decibelcur_snr	sum_scorern   total_scorero   
frame_probr   r   r   GetFrameState  sZ    $



zFsmnVADStreaming.GetFrameStater   is_finalc                 K   s  ||d _ |dd}| j|d | j||d |s!| j|d n| j|d g }td|jd D ]}g }	t|d j	dkrt|d j
t|d j	D ]}
|r|d j	|
 jsXqK|d jsf|d j	|
 jsfqK|d jrs|d j	|
 jnd}|d j	|
 jr|d j	|
 j}d|d _|d  j
d7  _
nd}d|d _||g}n,|s|d j	|
 jr|d j	|
 jsqK|d j	|
 j|d j	|
 jg}|d  j
d7  _
|	| qK|	r||	 q1|S )	Nr   is_streaming_inputTr   r   r   r   F)r   r   r   r   DetectCommonFramesDetectLastFramesr   r   r   r   r   rf   r   rg   rc   rd   r   )rW   r   r   r   r  rX   r  segments	batch_numsegment_batchirc   rd   segmentr   r   r   forward$  sb   




zFsmnVADStreaming.forwardc                 K   s   i |d< t d|d< i |d< |dd ur|d| j_t| jj| jj| jj| jj	}|
  t| jj| jj| jj | jjd}||d< ||d< |S )	Nfrontendr   prev_samplesr   r<   )rP   r   rM   r   r   )r   emptyr   r   r<   rs   r@   rA   rB   rT   rk   r   rP   rM   )rW   r   rX   r   r   r   r   r   
init_cachel  s,   zFsmnVADStreaming.init_cachekeyc                  K   s  |d u ri }t |dkr| j|fi | i }|dd}	t|	|j d }
t }|	dkr4|ddn|dd}|rB|d	dn|d	d}||d
}t||j|dd|dd||d}|d	 }|d }t }|| d|d< t |dks~J dt	|d |d f}tt ||
 t| }tt ||
 dt|  }g }t
|D ]z}|o||d k|d	< |||
 |d |
  }t|g|dd||d |d	 d\}}t }|| d|d< |  |j |j d |d< |j|d d}|j|d d}||d d |d	 ||d}| jd"i |}t |dkr#|j|  q|dkr0|| d  ntd|d< |r?| | d }|dd ur_t| dsWt|d| _| jd d }g }|d |d }|| |d urz||d! |d < ||fS )#Nr   
chunk_sizer4   r   i:  r  FTr  )r  r  fsr/   	data_typesound)r  audio_fsr  	tokenizerr   z0.3f	load_datar   zbatch_size must be set 1r  r  )r  r  r   r  extract_featbatch_data_timedevice)r  	waveforms)r   r   r  r   r  
output_dirwriter
best_recog)r  r[   textr   )r   r  r   r\   r  timeperf_counterr   r   r   r   r   r   r   frame_shiftlfr_nr   r  r   r  hasattrr   r  r   ) rW   data_indata_lengthsr  r  r  r   rX   	meta_datar  chunk_stride_samplestime1r  r  cfgaudio_sample_list	_is_finaltime2audio_samplenmr  r
  audio_sample_ispeechspeech_lengthstime3batch
segments_iibest_writerresultsresult_ir   r   r   	inference  s   








&


zFsmnVADStreaming.inferencec                 K   s"   ddl m} |dd| i|}|S )Nr   )export_rebuild_modelmodelr   )export_metar<  )rW   rX   r<  modelsr   r   r   export  s   zFsmnVADStreaming.exportc                 C   s~   |d j tjkr
dS t| jjd ddD ](}tj}| j|d j	d | |d j
 |d}| j||d j	d | d|d qdS )Nr   r   r   r   r   Fr   r   r   r   r   rH   r   r   r  r   r   r   rW   r   r
  r   r   r   r   r    s   "z#FsmnVADStreaming.DetectCommonFramesc                 C   s   |d j tjkr
dS t| jjd ddD ];}tj}| j|d j	d | |d j
 |d}|dkrA| j||d j	d | d|d q| j||d j	d d|d qdS )Nr   r   r   r   r   FTrA  rB  r   r   r   r     s   "z!FsmnVADStreaming.DetectLastFramescur_frm_statec                 C   sN  t j}|t jkrtd| jjkrt j}nt j}n|t jkr!t j}|d j|||d}| jj	}t
j|kr|d j}d|d _d|d _d}	|d jtjkr}t|d j|| j|d }	| j|	|d tj|d _t|	d |d D ]	}
| j|
|d qqn|d jtjkrt|d jd |D ]	}
| j|
|d q||d j d | jj| kr| j|dd|d tj|d _n|s| j||d n| j|||d n	 nt
j|kr)d|d _|d jtjkrn|d jtjkr&||d j d | jj| kr| j|dd|d tj|d _ny|s| j||d nm| j|||d nc	 n`t
j|kr|d|d _|d jtjkry||d j d | jj| krcd|d _ | j|dd|d tj|d _n&|so| j||d n| j|||d n	 nt
j!|kr|d  jd7  _|d jtjkr| jj"t#j$j%kr|d j| | jj&ks|r|d j'dkrt|d j(d |D ]
}
| j)|
|d q| jdd|d | jddd|d tj|d _n|| j|dkr| j)|| j|d |d n|d jtjkr|d j| |d j*krAt+|d j*| }| jj,r/|t+| jj-| 8 }|d8 }td|}| j|| dd|d tj|d _nH||d j d | jj| kra| j|dd|d tj|d _n(| jj,r|s|d jt+| jj-| kr~| j||d n
| j|||d n	 |d jtjkr| jj"t#j.j%kr| j/|d d S d S d S )	Nr2   r   r   r   r   Fr   T)0r   r   r   r   fabsr   rN   r    r   rT   r!   r'   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   r   r   r   r%   r$   r   r&   r:   r*   r+   r[   r=   r   r   r   r   r\   rD   rF   r,   r   )rW   rC  r   r   r   tmp_cur_frm_statestate_changefrm_shift_in_mssilence_frame_countr   r   lookback_framer   r   r   r     s   

















zFsmnVADStreaming.DetectOneFrame)NNN)NNNNN)&r   r   r   rZ   strr
   r   r	   rY   r   r   r   r   Tensorr   r\   r   r]   r   r   r   r   r   r   r   r   r  tensorr  r  listr;  r@  r  r  r   r   __classcell__r   r   r   r   r     s    

: 

;
H
e)&osjsonr!  r   r   r   r   r   enumr   dataclassesr   funasr.registerr   typingr   r   r   r	   r
   funasr.utils.datadir_writerr   funasr.utils.load_utilsr   r   r   r   r!   r*   r-   objectr_   rl   rs   r   registerModuler   r   r   r   r   <module>   s0   	GU
$