o
    iKd                  F   @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6mZ d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ G dd dZAdeBdeCdeCd eDd!eBd"eDd#eDd$eDd%eCd&eCd'eCd(eCd)eDd*eDd+eeDeBf d,eeeBeBeBf  d-e
eB d.e
eB d/e
eB d0e
eB d1e
eB d2e
eB d3e
eB d4e
eB d5e
eB d6e
eB d7e
eB d8eEd9e
eF d:eEd;eEd<eEd=eEd>e	eB d?eBfFd@dAZGdBdC ZHdGdDdEZIeJdFkrJeI  dS dS )H    N)LooseVersion)Path)AnyListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)BeamSearchTransducer)ExtendedHypothesis)
Hypothesis)DatadirWriter)ASRTask)
EnhS2TTask)LMTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)BatchBeamSearch)BatchBeamSearchOnlineSim)
BeamSearchr   )TooShortUttError)BatchScorerInterface)CTCPrefixScorer)LengthBonus)get_commandline_argsc                5   @   sd  e Zd ZdZdddddddddddddddd	d
ddddddddgdfdeeef deeef dedeeef deeef dedeeef dededededede	dede	deded ed!ed"e	d#e
d$e
d%e
d&e
d'ee d(ef4d)d*Ze d+eejejf d,eeee ee ee	 eeeef f  fd-d.Zd/ejfd0d1Ze	d6d2ee d3ee fd4d5ZdS )7Speech2Texta  Speech2Text class

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

    Nfullcpu           float32         ?      ??FLinearqint8asr_train_configasr_model_filetransducer_conflm_train_configlm_filengram_scorer
ngram_file
token_typebpemodeldevicemaxlenratiominlenratio
batch_sizedtype	beam_size
ctc_weight	lm_weightngram_weightpenaltynbest	streamingenh_s2t_taskquantize_asr_modelquantize_lmquantize_modulesquantize_dtypec           .   
   C   s  t  sJ |s	tnt}|s|r|dkrtjtdk rtdtdd |D }tt|}i }|	|||
\}}|rA|j
g dd |jtt|d  |r\td	 tjj|||d
}|j}t|j|jd} |j}!|j|| tt|!d |d urt	|||
\}"}#|rtd tjj|"||d
}"|"j|d< |d ur|dkrddlm}$ |$||!}%nddlm}& |&||!}%nd }%|%|d< |jrtd'|j|j |d|v r|d nd ||!d|}'d }(nd }'t!d| ||||d})t"||)||j#|jt|!|!|dkrd ndd}(|dkr8dd |(j$% D }*t|*dkr/|r&t&|(_'|((| td nt)|(_'td n	t*d|* d |(j|
tt|d  |+ D ]}+t,|+tj-j.r^|+j|
tt|d  qHtd|(  td |
 d!|  |d u r{|j/}|	d u r|j0}	|d u rd },n|d"kr|	d urt1||	d#},nd },nt1|d$},t2|!d%}-td&|,  || _3|| _4|-| _5|,| _6|(| _7|'| _8|| _9|| _:|
| _;|| _<|| _=|| _>d S )(Nfloat16z1.5.0zrfloat16 dtype for dynamic quantization is not supported with torch version < 1.5.0. Switch to qint8 dtype instead.c                 S   s   g | ]}t tj|qS  )getattrtorchnn).0qrJ   rJ   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/asr_inference.py
<listcomp>]   s    z(Speech2Text.__init__.<locals>.<listcomp>)ctcdecodereosjoint_networksos
token_listuse_transducer_decoder)inherite_s2t_attrs)r<   z%Use quantized asr model for decoding.)qconfig_specr<   )rR   rT   )rS   rR   length_bonuszUse quantized lm for decoding.lmr$   r   )NgramFullScorer)NgramPartScorerngram)rS   rU   r=   r\   r?   rW   r+   )rS   rR   r\   r_   r[   )r=   weightsscorersrV   rT   
vocab_sizerW   pre_beam_score_keyr'   c                 S   s   g | ]\}}t |ts|qS rJ   )
isinstancer   rN   kvrJ   rJ   rP   rQ      s    z4BatchBeamSearchOnlineSim implementation is selected.z+BatchBeamSearch implementation is selected.zAs non-batch scorers z2 are found, fall back to non-batch implementation.)r8   r<   zBeam_search: zDecoding device=z, dtype=bpe)r6   r7   )r6   )rW   zText tokenizer: rJ   )?r
   r   r   rL   __version__r   
ValueErrorsetrK   build_model_from_fileinherite_attributestoevallogginginfoquantizationquantize_dynamicrS   r    rR   rT   rW   updater!   lenr   r\   espnet.nets.scorers.ngramr]   r^   rX   r   rU   dictr   rV   full_scorersitemsr   	__class__set_streaming_configr   warningvaluesrd   rM   Moduler6   r7   r   r   	asr_modelasr_train_args	converter	tokenizerbeam_searchbeam_search_transducerr9   r:   r8   r<   rB   rD   ).selfr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   taskra   r   r   rS   rR   rW   r\   lm_train_argsr]   r_   r^   r   r   r`   	non_batchscorerr   r   rJ   rJ   rP   __init__3   s  





	










zSpeech2Text.__init__speechreturnc                 C   s  t  sJ t|tjrt|}|dtt| j	}|j
dgtj|dd}||d}tdt|d  t|| jd}| jjd
i |\}}| jrt| jjdd}t||kseJ t||fg }t|dD ]3\}}	td| t|	tr|	d }	t|	dksJ t|	| |	d }
t|
sJ ||
 ql|S t|tr|d }t|dksJ t|| |d }t|sJ |S )zInference

        Args:
            data: Input speech data
        Returns:
            text, token, token_int, hyp

        r   r'   )r<   
fill_value)r   speech_lengthszspeech length: )r8   num_spkz=== [EnhASR] Speaker {} ===NrJ   )r
   rd   npndarrayrL   tensor	unsqueezern   rK   r<   new_fulllongsizerp   rq   strr   r8   r   encoderD   	enh_modelru   	enumerateformattuple_decode_single_sampler   append)r   r   lengthsbatchenc_r   resultsspkenc_spkretrJ   rJ   rP   __call__   s8   




zSpeech2Text.__call__r   c           
   
   C   sf  | j rGtdt|jd   |  |}|d }td|jd td|jt|j d tdd| j	
|jdd   d	  n
| j|| j| jd
}|d | j }g }|D ]T}t|ttfskJ t|| jjrqd nd}t|jtr|jd| }n	|jd|  }ttdd |}| j	
|}| jd ur| j|}	nd }	||	|||f q\|S )Nzencoder output length: r   ztotal log probability: z.2fznormalized log probability: zbest hypo:  r'   
)xr9   r:   c                 S   s   | dkS )Nr   rJ   r   rJ   rJ   rP   <lambda>b      z3Speech2Text._decode_single_sample.<locals>.<lambda>)r   rp   rq   r   shapescoreru   yseqjoinr   
ids2tokensr   r9   r:   rB   rd   r   TransHypothesistyper   rX   listtolistfilterr   tokens2textr   )
r   r   
nbest_hypsbestr   hyplast_pos	token_inttokentextrJ   rJ   rP   r   B  s:   
"

z!Speech2Text._decode_single_sample	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a!  Build Speech2Text instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            Speech2Text: Speech2Text instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.rJ   )espnet_model_zoo.downloaderr   ImportErrorrp   errorrt   download_and_unpackr#   )r   r   r   drJ   rJ   rP   from_pretrainedo  s   zSpeech2Text.from_pretrainedN)__name__
__module____qualname____doc__r	   r   r   rw   floatintboolr   r   rL   no_gradTensorr   r   r   r   r   ExtTransHypothesisr   r   r   staticmethodr   r   rJ   rJ   rJ   rP   r#   '   s    




	

 MB-r#   
output_dirr9   r:   r;   r<   r=   ngpuseedr>   r?   r@   rA   rB   num_workers	log_leveldata_path_and_name_and_typekey_filer/   r0   r2   r3   word_lm_train_configword_lm_filer5   r   r6   r7   allow_variable_data_keysr1   rC   rD   rE   rF   rG   rH   c#           6         s  t  sJ |dkrtd|d urtd|dkrtdtj|dd |dkr+d}#nd}#t| td:i d	|d
|d|d|d|d|d|d|d|#d|d|d|d|d|d|	d|
d|d|d|d|d|d| d|!d |"}$tjd:d!|i|$}%tj	|||||t
|%jd"t|%jd"|d#d$	}&t| ;}'|&D ].\}(})t|)tsJ t|)td%d& |(D sJ |(ttt|) }*t|(|*ksJ t|( d'|* d(d) |) D })z	|%d:i |) W nG ty9 }+ z:td*|( d+|+  td,i i g d-},d+d.gd/g|,gg|  |r/t|%jjd0d}- fd1d2t|-D  W Y d }+~+nd }+~+ww |(d3 }.|rt dD ]K\}/}0ttd|d |0D ];\}1\}2}3}4},|'|1 d4|/  }5d+|3|5d5 |.< d+t t!|4|5d6 |.< t!|,j"|5d7 |.< |2d ur|2|5d8 |.< qTqFqttd|d  D ]9\}1\}2}3}4},|'|1 d9 }5d+|3|5d5 |.< d+t t!|4|5d6 |.< t!|,j"|5d7 |.< |2d ur|2|5d8 |.< qqW d    d S 1 sw   Y  d S );Nr'   z!batch decoding is not implementedzWord LM is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelr   cudar%   r/   r0   r1   r2   r3   r5   r6   r7   r8   r9   r:   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   r   FT)r<   r;   r   r   preprocess_fn
collate_fnr   	inferencec                 s   s    | ]}t |tV  qd S r   )rd   r   )rN   srJ   rJ   rP   	<genexpr>  s    zinference.<locals>.<genexpr>z != c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswithre   rJ   rJ   rP   
<dictcomp>  s   $ zinference.<locals>.<dictcomp>z
Utterance  r&   )r   scoresstatesr   z<space>   r   c                    s   g | ]} qS rJ   rJ   )rN   r   r   rJ   rP   rQ     s    zinference.<locals>.<listcomp>r   best_recog_spkr   r   r   r   
best_recogrJ   )#r
   NotImplementedErrorrp   basicConfigr   rw   r#   r   r   build_streaming_iteratorbuild_preprocess_fnr   build_collate_fnr   rd   r   allru   nextiterr}   ry   r   r|   r   rK   r   r   ranger   zipr   mapr   r   )6r   r9   r:   r;   r<   r=   r   r   r>   r?   r@   rA   rB   r   r   r   r   r/   r0   r2   r3   r   r   r5   r   r6   r7   r   r1   rC   rD   rE   rF   rG   rH   r8   speech2text_kwargsspeech2textloaderwriterkeysr   _bser   r   keyr   r   nr   r   r   ibest_writerrJ   r   rP   r     s   
%	
"	

$r   c                  C   s  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# | d$}|jd%td&d' |jd(td)d' |jd*td+d' |jd,td-d' |jd.td/d' |jd0td1d' |jd2td3d' |jd4td5d' |jd6t
d"d7d | d8}|jd9t
d"d:d |jd;t
d"d<d |jd=td>d?gd@dA |jdBtdCdDdCgdEd	 | dF}|jdGtddHd |jdItddJd |jdKtdLdMd |jdNtdOdPd |jdQtdOdRd |jdStdOdTd |jdUtdVdWd |jdXtdYdZd |jd[td\d]d |jd^t
d"d# |jd_d d`da | db}|jdct	d g ddded	 |jdft	d dgd | S )hNzASR Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upperr   rJ   rJ   rP   r   9  r   zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr	  DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirT)r   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r  r  z--seedzRandom seedz--dtyper(   )rI   r(   float64z	Data type)r  r  r  z--num_workersr'   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer   )r   r  actionz
--key_file)r   z--allow_variable_data_keysF)r   r  zThe model configuration relatedz--asr_train_configzASR training configuration)r   r  z--asr_model_filezASR model parameter filez--lm_train_configzLM training configurationz	--lm_filezLM parameter filez--word_lm_train_configzWord LM training configurationz--word_lm_filezWord LM parameter filez--ngram_filezN-gram parameter filez--model_tagz[Pretrained model tag. If specify this option, *_train_config and *_file will be overwrittenz--enh_s2t_taskzenhancement and asr joint modelzQuantization relatedz--quantize_asr_modelz(Apply dynamic quantization to ASR model.z--quantize_lmz!Apply dynamic quantization to LM.z--quantize_modules*r-   zList of modules to be dynamically quantized.
        E.g.: --quantize_modules=[Linear,LSTM,GRU].
        Each specified module should be an attribute of 'torch.nn', e.g.:
        torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...)r   nargsr  r  z--quantize_dtyper.   rI   zDtype for dynamic quantization.zBeam-search relatedz--batch_sizezThe batch size for inferencez--nbestzOutput N-best hypothesesz--beam_sizer)   z	Beam sizez	--penaltyr&   zInsertion penaltyz--maxlenratiozInput length ratio to obtain max output length. If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengths.If maxlenratio<0.0, its absolute value is interpretedas a constant max output lengthz--minlenratioz.Input length ratio to obtain min output lengthz--ctc_weightr*   zCTC weight in joint decodingz--lm_weightr+   zRNNLM weightz--ngram_weightr,   zngram weightz--streamingz--transducer_confz1The keyword arguments for transducer beam search.)r  r  zText converter relatedz--token_type)charrh   NzIThe token type for ASR model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training args)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   r   add_argument_groupr   r   r   r   )parsergrouprJ   rJ   rP   
get_parser/  sJ  





r  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigrJ   )	printr"   sysstderrr  
parse_argsvarspopr   )cmdr  argsr   rJ   rJ   rP   main  s   
r*  __main__r   )Kr  rp   r#  distutils.versionr   pathlibr   typingr   r   r   r   r   r	   numpyr   rL   torch.quantization	typeguardr
   r   -espnet2.asr.transducer.beam_search_transducerr   r   r   r   r   espnet2.fileio.datadir_writerr   espnet2.tasks.asrr   espnet2.tasks.enh_s2tr   espnet2.tasks.lmr   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.nets.batch_beam_searchr   (espnet.nets.batch_beam_search_online_simr   espnet.nets.beam_searchr   3espnet.nets.pytorch_backend.transformer.subsamplingr   espnet.nets.scorer_interfacer   espnet.nets.scorers.ctcr     espnet.nets.scorers.length_bonusr!   espnet.utils.cli_utilsr"   r#   r   r   r   r   rw   r   r  r*  r   rJ   rJ   rJ   rP   <module>   s      i	

 !"#
 " 
?
	
