o
    iQE                  8   @   s$  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 G dd dZ5de6de7de7de8de6de8de8de8de7d e7d!e7d"e8d#e8d$ee8e6f d%e	e
e6e6e6f  d&ee6 d'ee6 d(ee6 d)ee6 d*ee6 d+ee6 d,ee6 d-ee6 d.ee6 d/ee6 d0ee6 d1e9d2e9f8d3d4Z:d5d6 Z;d:d7d8Z<e=d9kre<  dS dS );    N)Path)AnyListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)DatadirWriter)
EnhS2TTask)LMTask)STTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)BatchBeamSearch)
BeamSearch
Hypothesis)TooShortUttError)BatchScorerInterface)LengthBonus)get_commandline_argsc                '   @   s  e Zd ZdZ																	
			d)deeef deeef deeef deeef dedeeef dedededededededededededede	f&dd Z
e d!eejejf d"eeee ee ee ef  fd#d$Ze	d*d%ee d&ee fd'd(ZdS )+Speech2Texta  Speech2Text class

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("st_config.yml", "st.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

    Nfullcpu           float32         ??Fst_train_configst_model_filelm_train_configlm_filengram_scorer
ngram_file
token_typebpemodeldevicemaxlenratiominlenratio
batch_sizedtype	beam_size	lm_weightngram_weightpenaltynbestenh_s2t_taskc           %   
   C   s  t  sJ |s	tnt}i }||||	\}}|r |jg dd |jtt|d  |j	}|j
}|j|tt|d |d urNt|||	\}}|j|d< |d urn|dkrbddlm} |||}ndd	lm} |||}nd }||d
< td|||d}t||||j|jt||dd} |dkrdd | j D }!t|!dkrt| _td n	td|! d | j|	tt|d  | D ]}"t|"tj j!r|"j|	tt|d  qtd|   td|	 d|  |d u r|j"}|d u r|j#}|d u rd }#n|dkr|d urt$||d}#nd }#nt$|d}#t%|d}$td|#  || _&|| _'|$| _(|#| _)| | _*|
| _+|| _,|	| _-|| _.|| _/d S )N)ctcdecodereosjoint_networksos
token_listuse_transducer_decoder)inherite_s2t_attrs)r3   )r;   length_bonuslmr   r   )NgramFullScorer)NgramPartScorerngramr%   )r;   rC   rF   rB   )r4   weightsscorersr>   r<   
vocab_sizer?   pre_beam_score_keyr"   c                 S   s   g | ]\}}t |ts|qS  )
isinstancer   .0kvrK   rK   L/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/st_inference.py
<listcomp>   s    z(Speech2Text.__init__.<locals>.<listcomp>z+BatchBeamSearch implementation is selected.zAs non-batch scorers z2 are found, fall back to non-batch implementation.)r/   r3   zBeam_search: zDecoding device=z, dtype=bpe)r-   r.   )r-   )r?   zText tokenizer: )0r	   r   r   build_model_from_fileinherite_attributestogetattrtorchevalr;   r?   updater   lenr   rC   espnet.nets.scorers.ngramrD   rE   dictr   r>   r<   full_scorersitemsr   	__class__logginginfowarningvaluesrL   nnModuler-   r.   r   r   st_modelst_train_args	converter	tokenizerbeam_searchr0   r1   r/   r3   r8   )%selfr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   taskrH   rg   rh   r;   r?   rC   lm_train_argsrD   rF   rE   rG   rk   	non_batchscorerrj   ri   rK   rK   rQ   __init__*   s   









zSpeech2Text.__init__speechreturnc                 C   sP  t  sJ t|tjrt|}|dtt| j	}|j
dgtj|dd}||d}t|| jd}| jjdi |\}}t|dksLJ t|| j|d | j| jd}|d| j }g }|D ]<}t|tspJ t||jdd  }	ttd	d
 |	}	| j|	}
| jdur| j|
}nd}| ||
|	|f qct!|sJ |S )zInference

        Args:
            data: Input speech data
        Returns:
            text, token, token_int, hyp

        r   r"   )r3   
fill_value)rr   speech_lengths)r/   )xr0   r1   Nc                 S   s   | dkS )Nr   rK   rv   rK   rK   rQ   <lambda>       z&Speech2Text.__call__.<locals>.<lambda>rK   )"r	   rL   npndarrayrX   tensor	unsqueezerV   rW   r3   new_fulllongsizer   r/   rg   encoder[   rk   r0   r1   r8   r   typeyseqtolistlistfilterri   
ids2tokensrj   tokens2textappendr
   )rl   rr   lengthsbatchenc_
nbest_hypsresultshyp	token_inttokentextrK   rK   rQ   __call__   s2   



zSpeech2Text.__call__	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a   Build Speech2Text instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.
        Returns:
            Speech2Text: Speech2Text instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.rK   )espnet_model_zoo.downloaderr   ImportErrorra   errorrZ   download_and_unpackr   )r   r   r   drK   rK   rQ   from_pretrained   s   zSpeech2Text.from_pretrained)NNNNr   NNNr    r!   r!   r"   r#   r$   r%   r&   r!   r"   FN)__name__
__module____qualname____doc__r   r   strfloatintboolrq   rX   no_gradTensorr{   r|   r   r   r   r   r   staticmethodr   r   rK   rK   rK   rQ   r      s    




	

 :r   
output_dirr0   r1   r2   r3   r4   ngpuseedr5   r6   r7   r8   num_workers	log_leveldata_path_and_name_and_typekey_filer'   r(   r)   r*   word_lm_train_configword_lm_filer,   r   r-   r.   allow_variable_data_keysr9   c           -      C   s  t  sJ |dkrtd|d urtd|dkrtdtj|dd |dkr+d}nd}t| td/i d	|d
|d|d|d|d|d|d|d|d|d|d|d|d|	d|
d|d|}tjd/d|i|}tj	|||||t
|jdt|jd|dd	}t| } |D ]\}!}"t|"tsJ t|"tdd |!D sJ |!ttt|" }#t|!|#ksJ t|! d |# d!d" |" D }"z	|d/i |"}$W n1 ty }% z$td#|! d$|%  td%i i g d&}&d$d'gd(g|&gg| }$W Y d }%~%nd }%~%ww |!d) }'ttd|d |$D ]9\}(\})}*}+}&| |( d* },d$|*|,d+ |'< d$tt|+|,d, |'< t|&j|,d- |'< |)d urS|)|,d. |'< qqW d    d S 1 sbw   Y  d S )0Nr"   z!batch decoding is not implementedzWord LM is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudar    r'   r(   r)   r*   r,   r-   r.   r/   r0   r1   r3   r4   r5   r6   r7   r8   r9   r   FT)r3   r2   r   r   preprocess_fn
collate_fnr   	inferencec                 s   s    | ]}t |tV  qd S r   )rL   r   )rN   srK   rK   rQ   	<genexpr>n  s    zinference.<locals>.<genexpr>z != c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswithrM   rK   rK   rQ   
<dictcomp>q  s   $ zinference.<locals>.<dictcomp>z
Utterance  r!   )scorescoresstatesr   z<space>   r   
best_recogr   r   r   r   rK   )r	   NotImplementedErrorra   basicConfigr   r]   r   r   r   build_streaming_iteratorbuild_preprocess_fnrh   build_collate_fnr   rL   r   allr[   nextiterrd   r_   r   rc   r   ziprangejoinmapr   r   )-r   r0   r1   r2   r3   r4   r   r   r5   r6   r7   r8   r   r   r   r   r'   r(   r)   r*   r   r   r,   r   r-   r.   r   r9   r/   speech2text_kwargsspeech2textloaderwriterkeysr   _bsr   er   keynr   r   r   ibest_writerrK   rK   rQ   r     s   
	

""$
$r   c                  C   s,  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# | d$}|jd%td&d' |jd(td)d' |jd*td+d' |jd,td-d' |jd.td/d' |jd0td1d' |jd2td3d' |jd4td5d' |jd6t
d"d7d | d8}|jd9tdd:d |jd;tdd<d |jd=td>d?d |jd@tdAdBd |jdCtdAdDd |jdEtdAdFd |jdGtdHdId |jdJtdKdLd | dM}|jdNt	d g dOdPd	 |jdQt	d dRd | S )SNzST Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upperrx   rK   rK   rQ   ry     rz   zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirT)r   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r   r   z--seedzRandom seedz--dtyper#   )float16r#   float64z	Data type)r   r   r   z--num_workersr"   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer   )r   r   actionz
--key_file)r   z--allow_variable_data_keysF)r   r   zThe model configuration relatedz--st_train_configzST training configuration)r   r   z--st_model_filezST model parameter filez--lm_train_configzLM training configurationz	--lm_filezLM parameter filez--word_lm_train_configzWord LM training configurationz--word_lm_filezWord LM parameter filez--ngram_filezN-gram parameter filez--model_tagz[Pretrained model tag. If specify this option, *_train_config and *_file will be overwrittenz--enh_s2t_taskzenhancement and asr joint modelzBeam-search relatedz--batch_sizezThe batch size for inferencez--nbestzOutput N-best hypothesesz--beam_sizer$   z	Beam sizez	--penaltyr!   zInsertion penaltyz--maxlenratiozInput length ratio to obtain max output length. If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengths.If maxlenratio<0.0, its absolute value is interpretedas a constant max output lengthz--minlenratioz.Input length ratio to obtain min output lengthz--lm_weightr%   zRNNLM weightz--ngram_weightr&   zngram weightzText converter relatedz--token_type)charrS   NzHThe token type for ST model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training args)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   r   add_argument_groupr   r   r   r   )parsergrouprK   rK   rQ   
get_parser  s   



r   c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigrK   )	printr   sysstderrr   
parse_argsvarspopr   )cmdr   argsr   rK   rK   rQ   main  s   
r   __main__r   )>r   ra   r   pathlibr   typingr   r   r   r   r   r   numpyr{   rX   	typeguardr	   r
   espnet2.fileio.datadir_writerr   espnet2.tasks.enh_s2tr   espnet2.tasks.lmr   espnet2.tasks.str   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.nets.batch_beam_searchr   espnet.nets.beam_searchr   r   3espnet.nets.pytorch_backend.transformer.subsamplingr   espnet.nets.scorer_interfacer    espnet.nets.scorers.length_bonusr   espnet.utils.cli_utilsr   r   r   r   r   r   r   r   r   r   rK   rK   rK   rQ   <module>   s     s	


z 

	
