o
    iQ                  8   @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 G dd dZ6de7de8de8de9de7de9de9de9d e8d!e8d"e9d#e9d$ee9e7f d%e	e
e7e7e7f  d&ee7 d'e7d(e7d)ee7 d*ee7 d+ee7 d,ee7 d-ee7 d.ee7 d/e:d0e9d1e:d2e9d3e9f8d4d5Z;d6d7 Z<d;d8d9Z=e>d:kre=  dS dS )<    N)Path)ListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)ContextualBlockConformerEncoder)!ContextualBlockTransformerEncoder)DatadirWriter)LMTask)STTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)BatchBeamSearchOnline)
Hypothesis)TooShortUttError)BatchScorerInterface)LengthBonus)get_commandline_argsc                   @   s  e Zd ZdZ																	
	
d(deeef deeef deeef deeef dededededededededededefddZ	dd Z
		d)dejdefd d!Ze 	"d*deejejf ded#eeee ee ee ef  fd$d%Zd&d' ZdS )+Speech2TextStreaminga  Speech2TextStreaming class

    Details in "Streaming Transformer ASR with Blockwise Synchronous Beam Search"
    (https://arxiv.org/abs/2006.14941)

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2TextStreaming("asr_config.yml", "asr.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

    Ncpu           float32         ?Fr   st_train_configst_model_filelm_train_configlm_file
token_typebpemodeldevicemaxlenratiominlenratio
batch_sizedtype	beam_size	lm_weightpenaltynbestc                  C   s  t  sJ i }t|||\}}|jtt|d  t|jt	s)t|jt
s)J |j}|j}|j|tt|d |d urLt|||\}}|j|d< td||d}d|v sYJ d|jv s`J d|jv sgJ d	|jv snJ |
d
kstJ t||||j|jt||d|||d}dd |j D }t|dksJ td |j|tt|d  | D ]}t|tjjr|j|tt|d  qtd|  td| d|  |d u r|j}|d u r|j}|d u rd }n|dkr|d urt ||d}nd }nt |d}t!|d}td|  || _"|| _#|| _$|| _%|| _&|| _'|	| _(|| _)|| _*|| _+d|j,v r;|j,d | _-nd| _-d|j,v rK|j,d | _.nd| _.d|j,v rc|j,d d urc|j,d | _/n| j-| _/| 0  d S )N)r.   )decoderlength_bonuslmr#   )r3   r5   r4   encoder_conf
look_aheadhop_size
block_sizer    full)r/   weightsscorerssoseos
vocab_size
token_listpre_beam_score_keydisable_repetition_detectiondecoder_text_length_limitencoded_feat_length_limitc                 S   s   g | ]\}}t |ts|qS  )
isinstancer   .0kvrE   rE   V/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/st_inference_streaming.py
<listcomp>   s    z1Speech2TextStreaming.__init__.<locals>.<listcomp>r   z1BatchBeamSearchOnline implementation is selected.)r*   r.   zBeam_search: zDecoding device=z, dtype=bpe)r(   r)   )r(   )r@   zText tokenizer: n_ffti   
hop_length   
win_length)1r   r   build_model_from_filetogetattrtorchevalrF   encoderr   r
   r3   r@   updater   lenr   r5   dictr6   r   r=   r>   full_scorersitemslogginginfovaluesnnModuler(   r)   r   r   st_modelst_train_args	converter	tokenizerbeam_searchr+   r,   r*   r.   r2   frontend_confrN   rO   rQ   reset) selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   rB   rC   rD   r<   rb   rc   r3   r@   r5   lm_train_argsr;   rf   	non_batchscorerre   rd   rE   rE   rK   __init__3   s   






zSpeech2TextStreaming.__init__c                 C   s   d | _ d | _| j  d S N)frontend_statesencoder_statesrf   rh   )ri   rE   rE   rK   rh      s   zSpeech2TextStreaming.resetspeechis_finalc                 C   sP  |d ur|d }t j||gdd}|r|}d }nF|d| j| j  | j }|d| j| j  | j }|dd| j| j || j  }|d|d| j| j  | | j| j |  }|dt	t | j
}|jdgt j|dd}	||	d}
t|
| jd}
| jjd	i |
\}}| jjd ur| j||\}}|r|d u rnh|dtt| j| j d |dtt| j| j d  }nC|d u r|dd|dtt| j| j d  }n&|dtt| j| j d |ddtt| j| j d   }|jdgt j|dd}|rd }nd|i}|||fS )
Nwaveform_bufferr   )dimr    )r.   
fill_value)rq   speech_lengths)r*      rE   )rU   catsizerQ   rO   narrowclone	unsqueezerS   rT   r.   new_fulllongr   r*   rb   _extract_feats	normalizemathceil)ri   rq   prev_statesrr   bufspeech_to_processrs   n_frames
n_residuallengthsbatchfeatsfeats_lengthsnext_statesrE   rE   rK   apply_frontend   s~   



z#Speech2TextStreaming.apply_frontendTreturnc           	      C   s   t  sJ t|tjrt|}| j|| j|d\}}| _| jj	||| j
|dd\}}| _
| j|d | j| j|d}| |}|rE|   |S )zInference

        Args:
            data: Input speech data
        Returns:
            text, token, token_int, hyp

        rr   T)rr   
infer_moder   )xr+   r,   rr   )r   rF   npndarrayrU   tensorr   ro   rb   rW   rp   rf   r+   r,   assemble_hypsrh   )	ri   rq   rr   r   r   enc_
nbest_hypsretrE   rE   rK   __call__  s.   


zSpeech2TextStreaming.__call__c                 C   s   |d | j  }g }|D ]<}t|tsJ t||jdd  }ttdd |}| j	|}| j
d ur<| j
|}nd }|||||f qt|sNJ |S )Nr    c                 S   s   | dkS )Nr   rE   r   rE   rE   rK   <lambda>F      z4Speech2TextStreaming.assemble_hyps.<locals>.<lambda>)r2   rF   r   typeyseqtolistlistfilterrd   
ids2tokensre   tokens2textappendr	   )ri   hypsr   resultshyp	token_inttokentextrE   rE   rK   r   <  s   
z"Speech2TextStreaming.assemble_hyps)NNNNNr   r   r   r    r!   r"   r#   r   r    Fr   r   )NF)T)__name__
__module____qualname____doc__r   r   strfloatintrm   rh   rU   Tensorboolr   no_gradr   r   r   r   r   r   r   r   rE   rE   rE   rK   r   $   s    



	

 
N'r   
output_dirr+   r,   r-   r.   r/   ngpuseedr0   r1   r2   num_workers	log_leveldata_path_and_name_and_typekey_filer$   r%   r&   r'   word_lm_train_configword_lm_filer(   r)   allow_variable_data_keyssim_chunk_lengthrB   rD   rC   c           .      C   s`  t  sJ |dkrtd|d urtd|dkrtdtj|dd |dkr+d}nd}t| td1i d	|d
|d|d|d|d|d|d|d|d|d|d|d|	d|
d|d|d|}tj|||||t|j	dt
|j	d|dd	}t| }|D ]\} }!t|!tsJ t|!tdd | D sJ | ttt|! }"t| |"ksJ t|  d|" d d! |! D }!t|! dksJ zN|d"kr|d1i |!}#n@|!d# }$t|$| dkrtt|$| D ]}%||$|%| |%d |  dd$ q||$|%d | t|$ dd%}#n|d1i |!}#W n1 tyS }& z$td&|  d'|&  td(i i g d)}'d'd*gd+g|'gg|
 }#W Y d }&~&nd }&~&ww | d" }(ttd|
d |#D ]9\})\}*}+},}'||) d, }-d'|+|-d- |(< d'tt|,|-d. |(< t|'j|-d/ |(< |*d ur|*|-d0 |(< qbqW d    d S 1 sw   Y  d S )2Nr    z!batch decoding is not implementedzWord LM is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudar   r$   r%   r&   r'   r(   r)   r*   r+   r,   r.   r/   r0   r1   r2   rB   rC   rD   FT)r.   r-   r   r   preprocess_fn
collate_fnr   	inferencec                 s   s    | ]}t |tV  qd S rn   )rF   r   )rH   srE   rE   rK   	<genexpr>  s    zinference.<locals>.<genexpr>z != c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswithrG   rE   rE   rK   
<dictcomp>  s   $ zinference.<locals>.<dictcomp>r   rq   )rq   rr   r   z
Utterance  r   )scorescoresstatesr   z<space>rw   
best_recogr   r   r   r   rE   )r   NotImplementedErrorr]   basicConfigr   r   r   build_streaming_iteratorbuild_preprocess_fnrc   build_collate_fnr   rF   rZ   r   allrY   nextiterr_   r\   keysranger   warningr   zipjoinmapr   r   ).r   r+   r,   r-   r.   r/   r   r   r0   r1   r2   r   r   r   r   r$   r%   r&   r'   r   r   r(   r)   r   r   rB   rD   rC   r*   speech2textloaderwriterr   r   _bsr   rq   ier   keynr   r   r   ibest_writerrE   rE   rK   r   U  s   
	
""$
$r   c                  C   s&  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# |jd$tdd%d | d&}|jd'tdd |jd(tdd |jd)td  |jd*td  |jd+td  |jd,td  | d-}|jd.tdd/d |jd0tdd1d |jd2td3d4d |jd5td6d7d |jd8td6d9d |jd:td6d;d |jd<td=d>d |jd?t
d"d# |jd@tddAd |jdBtddCd | dD}|jdEt	d g dFdGd	 |jdHt	d dId | S )JNzST Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S rn   )upperr   rE   rE   rK   r     r   zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirT)r   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r   r   z--seedzRandom seedz--dtyper!   )float16r!   float64z	Data type)r   r   r   z--num_workersr    z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer   )r   r   actionz
--key_file)r   z--allow_variable_data_keysF)r   r   z--sim_chunk_lengthz_The length of one chunk, to which speech will be divided for evalution of streaming processing.zThe model configuration relatedz--st_train_configz--st_model_filez--lm_train_configz	--lm_filez--word_lm_train_configz--word_lm_filezBeam-search relatedz--batch_sizezThe batch size for inferencez--nbestzOutput N-best hypothesesz--beam_sizer"   z	Beam sizez	--penaltyr   zInsertion penaltyz--maxlenratiozInput length ratio to obtain max output length. If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengthsz--minlenratioz.Input length ratio to obtain min output lengthz--lm_weightr#   zRNNLM weightz--disable_repetition_detectionz--encoded_feat_length_limitz@Limit the lengths of the encoded featureto input to the decoder.z--decoder_text_length_limitz5Limit the lengths of the textto input to the decoder.zText converter relatedz--token_type)charrM   NzHThe token type for ST model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training args)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   r   add_argument_groupr   r   r   r   )parsergrouprE   rE   rK   
get_parser  s   


	
r  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigrE   )	printr   sysstderrr  
parse_argsvarspopr   )cmdr   argskwargsrE   rE   rK   mainY  s   
r  __main__rn   )?r   r]   r   r  pathlibr   typingr   r   r   r   r   numpyr   rU   	typeguardr   r	   6espnet2.asr.encoder.contextual_block_conformer_encoderr
   8espnet2.asr.encoder.contextual_block_transformer_encoderr   espnet2.fileio.datadir_writerr   espnet2.tasks.lmr   espnet2.tasks.str   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   $espnet.nets.batch_beam_search_onliner   espnet.nets.beam_searchr   3espnet.nets.pytorch_backend.transformer.subsamplingr   espnet.nets.scorer_interfacer    espnet.nets.scorers.length_bonusr   espnet.utils.cli_utilsr   r   r   r   r   r   r   r  r  r   rE   rE   rE   rK   <module>   s     3	


 

{
	
