o
    i]U                  :   @   s0  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 G dd dZ8de9de:de:de;de9de;de;d e;d!e:d"e:d#e:d$e;d%e;d&ee;e9f d'e	e
e9e9e9f  d(ee9 d)e9d*e9d+ee9 d,ee9 d-ee9 d.ee9 d/ee9 d0ee9 d1e<d2e;d3e<d4e;d5e;f:d6d7Z=d8d9 Z>d=d:d;Z?e@d<kre?  dS dS )>    N)Path)ListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)ContextualBlockConformerEncoder)!ContextualBlockTransformerEncoder)DatadirWriter)ASRTask)LMTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)BatchBeamSearchOnline)
Hypothesis)TooShortUttError)BatchScorerInterface)CTCPrefixScorer)LengthBonus)get_commandline_argsc                !   @   s  e Zd ZdZ																	
		d*deeef deeef deeef deeef dedededededededededededef ddZ	dd Z
	
d+d ejd!efd"d#Ze 	$d,d eejejf d!ed%eeee ee ee ef  fd&d'Zd(d) ZdS )-Speech2TextStreaminga  Speech2TextStreaming class

    Details in "Streaming Transformer ASR with Blockwise Synchronous Beam Search"
    (https://arxiv.org/abs/2006.14941)

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2TextStreaming("asr_config.yml", "asr.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

    Ncpu           float32         ?      ?Fr   asr_train_configasr_model_filelm_train_configlm_file
token_typebpemodeldevicemaxlenratiominlenratio
batch_sizedtype	beam_size
ctc_weight	lm_weightpenaltynbestc           "      C   s  t  sJ i }t|||\}}|jtt|d  t|jt	s)t|jt
s)J |j}t|j|jd}|j}|j||tt|d |d urUt|||\}}|j|d< td| |||d}d|v seJ d|jv slJ d	|jv ssJ d
|jv szJ |
dksJ t||||j|jt|||dkrd nd|||d}dd |j D }t|dksJ td |j|tt|d  | D ]}t|tjjr|j|tt|d  qtd|  td| d|  |d u r|j }|d u r|j!}|d u rd } n|dkr|d urt"||d} nd } nt"|d} t#|d}!td|   || _$|| _%|!| _&| | _'|| _(|| _)|	| _*|| _+|| _,|| _-d|j.v rO|j.d | _/nd| _/d|j.v r_|j.d | _0nd| _0d|j.v rw|j.d d urw|j.d | _1n| j/| _1| 2  d S ) Nr0   )ctceos)decoderr7   length_bonuslmr%   )r9   r7   r;   r:   encoder_conf
look_aheadhop_size
block_sizer!   full)r1   weightsscorerssosr8   
vocab_size
token_listpre_beam_score_keydisable_repetition_detectiondecoder_text_length_limitencoded_feat_length_limitc                 S   s   g | ]\}}t |ts|qS  )
isinstancer   .0kvrJ   rJ   W/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/asr_inference_streaming.py
<listcomp>   s    z1Speech2TextStreaming.__init__.<locals>.<listcomp>r   z1BatchBeamSearchOnline implementation is selected.)r,   r0   zBeam_search: zDecoding device=z, dtype=bpe)r*   r+   )r*   )rE   zText tokenizer: n_ffti   
hop_length   
win_length)3r   r   build_model_from_filetogetattrtorchevalrK   encoderr   r
   r9   r   r7   r8   rE   updater   lenr   r;   dictr<   r   rC   full_scorersitemslogginginfovaluesnnModuler*   r+   r   r   	asr_modelasr_train_args	converter	tokenizerbeam_searchr-   r.   r,   r0   r5   frontend_confrS   rT   rV   reset)"selfr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   rG   rH   rI   rB   rg   rh   r9   r7   rE   r;   lm_train_argsrA   rk   	non_batchscorerrj   ri   rJ   rJ   rP   __init__4   s   








zSpeech2TextStreaming.__init__c                 C   s   d | _ d | _| j  d S N)frontend_statesencoder_statesrk   rm   )rn   rJ   rJ   rP   rm      s   zSpeech2TextStreaming.resetspeechis_finalc                 C   s  |d ur|d }t j||gdd}|d| jkrdnd}|sH|r9t j| j|d |jd}t j||gdd}nd }d }d| i}	|||	fS |rO|}
d }nF|d| j| j  | j }|d| j| j  | j }|dd| j| j || j  }
|d|d| j| j  | | j| j |  }|
	d
tt | j}
|
jdgt j|
dd}|
|d	}t|| jd
}| jjdi |\}}| jjd ur| j||\}}|r|d u rni|dtt| j| j d |dtt| j| j d  }nD|d u r|dd|dtt| j| j d  }n&|dtt| j| j d |ddtt| j| j d   }|jdgt j|dd}|rXd }	nd|i}	|||	fS )Nwaveform_bufferr   )dimFTr6   r!   )r0   
fill_value)rv   speech_lengths)r,      rJ   )rZ   catsizerV   zerosr0   clonerT   narrow	unsqueezerX   rY   new_fulllongr   r,   rg   _extract_feats	normalizemathceil)rn   rv   prev_statesrw   bufhas_enough_samplespadfeatsfeats_lengthsnext_statesspeech_to_processrx   n_frames
n_residuallengthsbatchrJ   rJ   rP   apply_frontend   s   





z#Speech2TextStreaming.apply_frontendTreturnc           	      C   s   t  sJ t|tjrt|}| j|| j|d\}}| _|durD| jj	||| j
|dd\}}| _
| j|d | j| j|d}| |}ng }|rL|   |S )zInference

        Args:
            data: Input speech data
        Returns:
            text, token, token_int, hyp

        rw   NT)rw   
infer_moder   )xr-   r.   rw   )r   rK   npndarrayrZ   tensorr   rt   rg   r\   ru   rk   r-   r.   assemble_hypsrm   )	rn   rv   rw   r   r   enc_
nbest_hypsretrJ   rJ   rP   __call__$  s2   

zSpeech2TextStreaming.__call__c                 C   s   |d | j  }g }|D ]<}t|tsJ t||jdd  }ttdd |}| j	|}| j
d ur<| j
|}nd }|||||f qt|sNJ |S )Nr!   c                 S   s   | dkS )Nr   rJ   r   rJ   rJ   rP   <lambda>Z      z4Speech2TextStreaming.assemble_hyps.<locals>.<lambda>)r5   rK   r   typeyseqtolistlistfilterri   
ids2tokensrj   tokens2textappendr	   )rn   hypsr   resultshyp	token_inttokentextrJ   rJ   rP   r   P  s   
z"Speech2TextStreaming.assemble_hyps)NNNNNr   r    r    r!   r"   r#   r$   r%   r    r!   Fr   r   )NF)T)__name__
__module____qualname____doc__r   r   strfloatintrr   rm   rZ   Tensorboolr   no_gradr   r   r   r   r   r   r   r   rJ   rJ   rJ   rP   r   %   s    



	

 
Y+r   
output_dirr-   r.   r/   r0   r1   ngpuseedr2   r3   r4   r5   num_workers	log_leveldata_path_and_name_and_typekey_filer&   r'   r(   r)   word_lm_train_configword_lm_filer*   r+   allow_variable_data_keyssim_chunk_lengthrG   rI   rH   c           /      C   sD  t  sJ |dkrtd|d urtd|dkrtdtj|dd |dkr+d}nd}t| td2i d	|d
|d|d|d|d|d|d|d|d|d|d|d|	d|
d|d|d|d|}tj|||||t|j	dt
|j	d|dd	}t| } |D ]\}!}"t|"tsJ t|"tdd |!D sJ |!ttt|" }#t|!|#ksJ t|! d |# d!d" |" D }"t|" dksJ z=|d#kr|d2i |"}$n/|"d$ }%tt|%| D ]}&||%|&| |&d |  dd% q||%|&d | t|% dd&}$W n1 tyE }' z$td'|! d(|'  td)i i g d*}(d(d+gd,g|(gg| }$W Y d }'~'nd }'~'ww |!d# })ttd|d |$D ]9\}*\}+},}-}(| |* d- }.d(|,|.d. |)< d(tt|-|.d/ |)< t|(j|.d0 |)< |+d ur|+|.d1 |)< qTqW d    d S 1 sw   Y  d S )3Nr!   z!batch decoding is not implementedzWord LM is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudar   r&   r'   r(   r)   r*   r+   r,   r-   r.   r0   r1   r2   r3   r4   r5   rG   rH   rI   FT)r0   r/   r   r   preprocess_fn
collate_fnr   	inferencec                 s   s    | ]}t |tV  qd S rs   )rK   r   )rM   srJ   rJ   rP   	<genexpr>  s    zinference.<locals>.<genexpr>z != c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswithrL   rJ   rJ   rP   
<dictcomp>  s   $ zinference.<locals>.<dictcomp>r   rv   )rv   rw   r   z
Utterance  r    )scorescoresstatesr   z<space>r|   
best_recogr   r   r   r   rJ   )r   NotImplementedErrorrb   basicConfigr   r   r   build_streaming_iteratorbuild_preprocess_fnrh   build_collate_fnr   rK   r_   r   allr^   nextiterrd   ra   keysranger   warningr   zipjoinmapr   r   )/r   r-   r.   r/   r0   r1   r   r   r2   r3   r4   r5   r   r   r   r   r&   r'   r(   r)   r   r   r*   r+   r   r   rG   rI   rH   r,   speech2textloaderwriterr   r   _bsr   rv   ier   keynr   r   r   ibest_writerrJ   rJ   rP   r   i  s   
	
""$
$r   c                  C   s8  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# |jd$tdd%d | d&}|jd'tdd |jd(tdd |jd)td  |jd*td  |jd+td  |jd,td  | d-}|jd.tdd/d |jd0tdd1d |jd2td3d4d |jd5td6d7d |jd8td6d9d |jd:td6d;d |jd<td=d>d |jd?td@dAd |jdBt
d"d# |jdCtddDd |jdEtddFd | dG}|jdHt	d g dIdJd	 |jdKt	d dLd | S )MNzASR Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S rs   )upperr   rJ   rJ   rP   r     r   zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirT)r   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r   r   z--seedzRandom seedz--dtyper"   )float16r"   float64z	Data type)r   r   r   z--num_workersr!   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer   )r   r   actionz
--key_file)r   z--allow_variable_data_keysF)r   r   z--sim_chunk_lengthz_The length of one chunk, to which speech will be divided for evalution of streaming processing.zThe model configuration relatedz--asr_train_configz--asr_model_filez--lm_train_configz	--lm_filez--word_lm_train_configz--word_lm_filezBeam-search relatedz--batch_sizezThe batch size for inferencez--nbestzOutput N-best hypothesesz--beam_sizer#   z	Beam sizez	--penaltyr    zInsertion penaltyz--maxlenratiozInput length ratio to obtain max output length. If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengthsz--minlenratioz.Input length ratio to obtain min output lengthz--ctc_weightr$   zCTC weight in joint decodingz--lm_weightr%   zRNNLM weightz--disable_repetition_detectionz--encoded_feat_length_limitz@Limit the lengths of the encoded featureto input to the decoder.z--decoder_text_length_limitz5Limit the lengths of the textto input to the decoder.zText converter relatedz--token_type)charrR   NzIThe token type for ASR model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training args)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   r   add_argument_groupr   r   r   r   )parsergrouprJ   rJ   rP   
get_parser  s   


	
r	  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigrJ   )	printr   sysstderrr	  
parse_argsvarspopr   )cmdr  argskwargsrJ   rJ   rP   mainp  s   
r  __main__rs   )Ar  rb   r   r  pathlibr   typingr   r   r   r   r   numpyr   rZ   	typeguardr   r	   6espnet2.asr.encoder.contextual_block_conformer_encoderr
   8espnet2.asr.encoder.contextual_block_transformer_encoderr   espnet2.fileio.datadir_writerr   espnet2.tasks.asrr   espnet2.tasks.lmr   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   $espnet.nets.batch_beam_search_onliner   espnet.nets.beam_searchr   3espnet.nets.pytorch_backend.transformer.subsamplingr   espnet.nets.scorer_interfacer   espnet.nets.scorers.ctcr    espnet.nets.scorers.length_bonusr   espnet.utils.cli_utilsr   r   r   r   r   r   r   r	  r  r   rJ   rJ   rJ   rP   <module>   s     F	


  

	
