o
    i5\                  D   @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4mZ d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> G dd dZ?de@deAdeAdeBd e@d!eBd"eBd#eBd$eAd%eAd&eAd'eAd(eBd)eBd*eeBe@f d+eee@e@e@f  d,e
e@ d-e
e@ d.e
e@ d/e
e@ d0e
e@ d1e
e@ d2e
e@ d3e
e@ d4e
e@ d5e
e@ d6e
e@ d7eCd8e
eD d9eCd:eCd;eCd<e	e@ d=e@fDd>d?ZEd@dA ZFdEdBdCZGeHdDkrBeG  dS dS )F    N)LooseVersion)Path)AnyListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)BeamSearchTransducer)ExtendedHypothesis)
Hypothesis)DatadirWriter)LMTask)SLUTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)BatchBeamSearch)BatchBeamSearchOnlineSim)
BeamSearchr   )TooShortUttError)BatchScorerInterface)CTCPrefixScorer)LengthBonus)get_commandline_argsc                3   @   sX  e Zd ZdZdddddddddddddddd	d
dddddddgdfdeeef deeef dedeeef deeef dedeeef dededededede	dede	deded ed!ed"e	d#e
d$e
d%e
d&ee d'ef2d(d)Ze 	d3d*eejejf d+ejd,eeee ee ee	 eeeef f  fd-d.Ze	d3d/ee d0ee fd1d2ZdS )4Speech2Understanda/  Speech2Understand class

    Examples:
        >>> import soundfile
        >>> speech2understand = Speech2Understand("slu_config.yml", "slu.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2understand(audio)
        [(text, token, token_int, hypothesis object), ...]

    Nfullcpu           float32         ?      ??FLinearqint8slu_train_configslu_model_filetransducer_conflm_train_configlm_filengram_scorer
ngram_file
token_typebpemodeldevicemaxlenratiominlenratio
batch_sizedtype	beam_size
ctc_weight	lm_weightngram_weightpenaltynbest	streamingquantize_asr_modelquantize_lmquantize_modulesquantize_dtypec           -   
   C   s  t  sJ t}|s|r|dkrtjtdk rtdtdd |D }tt|}i }||||
\}}|j	tt|d
  |rNtd tjj|||d}|j}t|j|jd	}|j} |j||tt| d
 |d urt|||
\}!}"|rtd tjj|!||d}!|!j|d< |d ur|dkrddlm}# |#|| }$nddlm}% |%|| }$nd }$|$|d< |jrtd%|j|j|d|v r|d nd || d|}&d }'nd }&td| ||||d}(t ||(||j!|jt| | |dkrd ndd}'|dkr*dd |'j"# D })t|)dkr!|rt$|'_%|'&| td nt'|'_%td n	t(d|) d |'j	|
tt|d
  |) D ]}*t*|*tj+j,rP|*j	|
tt|d
  q:td|'  td|
 d|  |d u rm|j-}|	d u ru|j.}	|d u r}d }+n|d kr|	d urt/||	d!}+nd }+nt/|d"}+t0| d#},td$|+  || _1|| _2|,| _3|+| _4|'| _5|&| _6|| _7|| _8|
| _9|| _:|| _;d S )&Nfloat16z1.5.0zrfloat16 dtype for dynamic quantization is not supported with torch version < 1.5.0. Switch to qint8 dtype instead.c                 S   s   g | ]}t tj|qS  )getattrtorchnn).0qrH   rH   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/slu_inference.py
<listcomp>[   s    z.Speech2Understand.__init__.<locals>.<listcomp>)r;   z%Use quantized asr model for decoding.)qconfig_specr;   )ctceos)decoderrQ   length_bonuszUse quantized lm for decoding.lmr#   r   )NgramFullScorer)NgramPartScorerngram)rS   joint_networkr<   rU   r>   
token_listr*   )rS   rQ   rU   rX   rT   )r<   weightsscorerssosrR   
vocab_sizerZ   pre_beam_score_keyr&   c                 S   s   g | ]\}}t |ts|qS rH   )
isinstancer   rL   kvrH   rH   rN   rO      s    z4BatchBeamSearchOnlineSim implementation is selected.z+BatchBeamSearch implementation is selected.zAs non-batch scorers z2 are found, fall back to non-batch implementation.)r7   r;   zBeam_search: zDecoding device=z, dtype=bpe)r5   r6   )r5   )rZ   zText tokenizer: rH   )<r
   r   rJ   __version__r   
ValueErrorsetrI   build_model_from_filetoevallogginginfoquantizationquantize_dynamicrS   r   rQ   rR   rZ   updater    lenr   rU   espnet.nets.scorers.ngramrV   rW   use_transducer_decoderr   rY   dictr   r]   full_scorersitemsr   	__class__set_streaming_configr   warningvaluesr`   rK   Moduler5   r6   r   r   	asr_modelasr_train_args	converter	tokenizerbeam_searchbeam_search_transducerr8   r9   r7   r;   rA   )-selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   taskr\   r{   r|   rS   rQ   rZ   rU   lm_train_argsrV   rX   rW   r   r   r[   	non_batchscorerr~   r}   rH   rH   rN   __init__2   s   





	










zSpeech2Understand.__init__speech
transcriptreturnc              
   C   s  t  sJ t|tjrt|}|dtt| j	}|j
dgtj|dd}|du r?||d}tdt|d  n |dtt| j	}|j
dgtj|dd}||||d}t|| jd}| jjdi |\}}t|trz|d }t|dksJ t|| jrtd	t|d jd   | |d }|d }	td
|	jd td|	jt|	j d tdd| j|	jdd  d  n| j|d | j| jd}|d| j  }g }
|D ]V}t|t!t"fsJ t#|| jj$rdnd}t|jt%r|jd| }n	|jd| & }t%t'dd |}| j|}| j(dur3| j()|}nd}|
*||||f qt+|
sFJ |
S )zInference

        Args:
            data: Input speech data
        Returns:
            text, token, token_int, hyp

        r   r&   )r;   
fill_valueN)r   speech_lengthszspeech length: )r   r   transcript_padtranscript_pad_lens)r7   zencoder output length: ztotal log probability: z.2fznormalized log probability: zbest hypo:  
)xr8   r9   c                 S   s   | dkS )Nr   rH   r   rH   rH   rN   <lambda>G      z,Speech2Understand.__call__.<locals>.<lambda>rH   ),r
   r`   npndarrayrJ   tensor	unsqueezeri   rI   r;   new_fulllongsizerk   rl   strr   r7   r{   encodetuplerp   r   shapescoreyseqjoinr}   
ids2tokensr   r8   r9   rA   r   TransHypothesistyperr   listtolistfilterr~   tokens2textappendr   )r   r   r   lengthsbatchtranscript_lengthsenc_
nbest_hypsbestresultshyplast_pos	token_inttokentextrH   rH   rN   __call__   sh   



"zSpeech2Understand.__call__	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a3  Build Speech2Understand instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            Speech2Understand: Speech2Understand instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.rH   )espnet_model_zoo.downloaderr   ImportErrorrk   errorro   download_and_unpackr"   )r   r   r   drH   rH   rN   from_pretrainedU  s   z!Speech2Understand.from_pretrainedN)__name__
__module____qualname____doc__r	   r   r   rs   floatintboolr   r   rJ   no_gradTensorr   r   r   r   r   ExtTransHypothesisr   r   staticmethodr   r   rH   rH   rH   rN   r"   &   s    




	

 ?dr"   
output_dirr8   r9   r:   r;   r<   ngpuseedr=   r>   r?   r@   rA   num_workers	log_leveldata_path_and_name_and_typekey_filer.   r/   r1   r2   word_lm_train_configword_lm_filer4   r   r5   r6   allow_variable_data_keysr0   rB   rC   rD   rE   rF   c"           3      C   s  t  sJ |dkrtd|d urtd|dkrtdtj|dd |dkr+d}"nd}"t| td5i d	|d
|d|d|d|d|d|d|d|"d|d|d|d|d|d|	d|
d|d|d|d|d|d| d|!}#tjd5d |i|#}$tj	|||||t
|$jd!t|$jd!|d"d#	}%t| }&|%D ]\}'}(t|(tsJ t|(td$d% |'D sJ |'ttt|( })t|'|)ksJ t|' d&|) d'd( |( D }(z	|$d5i |(}*W n1 ty }+ z$td)|' d*|+  td+i i g d,},d*d-gd.g|,gg| }*W Y d }+~+nd }+~+ww |'d/ }-ttd|d |*D ]9\}.\}/}0}1},|&|. d0 }2d*|0|2d1 |-< d*tt|1|2d2 |-< t|,j|2d3 |-< |/d ure|/|2d4 |-< q-qW d    d S 1 stw   Y  d S )6Nr&   z!batch decoding is not implementedzWord LM is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudar$   r.   r/   r0   r1   r2   r4   r5   r6   r7   r8   r9   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   r   FT)r;   r:   r   r   preprocess_fn
collate_fnr   	inferencec                 s   s    | ]}t |tV  qd S r   )r`   r   )rL   srH   rH   rN   	<genexpr>  s    zinference.<locals>.<genexpr>z != c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswithra   rH   rH   rN   
<dictcomp>  s   $ zinference.<locals>.<dictcomp>z
Utterance  r%   )r   scoresstatesr   z<space>   r   
best_recogr   r   r   r   rH   )r
   NotImplementedErrorrk   basicConfigr   rs   r"   r   r   build_streaming_iteratorbuild_preprocess_fnr|   build_collate_fnr   r`   r   allrp   nextiterry   ru   r   rx   r   zipranger   mapr   r   )3r   r8   r9   r:   r;   r<   r   r   r=   r>   r?   r@   rA   r   r   r   r   r.   r/   r1   r2   r   r   r4   r   r5   r6   r   r0   rB   rC   rD   rE   rF   r7   speech2understand_kwargsspeech2understandloaderwriterkeysr   _bsr   er   keynr   r   r   ibest_writerrH   rH   rN   r   t  s   
$	

""$
$r   c                  C   s  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# | d$}|jd%td&d' |jd(td)d' |jd*td+d' |jd,td-d' |jd.td/d' |jd0td1d' |jd2td3d' |jd4td5d' | d6}|jd7t
d"d8d |jd9t
d"d:d |jd;td<d=gd>d? |jd@tdAdBdAgdCd	 | dD}|jdEtddFd |jdGtddHd |jdItdJdKd |jdLtdMdNd |jdOtdMdPd |jdQtdMdRd |jdStdTdUd |jdVtdWdXd |jdYtdZd[d |jd\t
d"d# |jd]d d^d_ | d`}|jdat	d g dbdcd	 |jddt	d ded | S )fNzASR Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upperr   rH   rH   rN   r     r   zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirT)r   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r  r  z--seedzRandom seedz--dtyper'   )rG   r'   float64z	Data type)r  r  r  z--num_workersr&   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer   )r   r	  actionz
--key_file)r   z--allow_variable_data_keysF)r   r  zThe model configuration relatedz--slu_train_configzASR training configuration)r   r  z--slu_model_filezASR model parameter filez--lm_train_configzLM training configurationz	--lm_filezLM parameter filez--word_lm_train_configzWord LM training configurationz--word_lm_filezWord LM parameter filez--ngram_filezN-gram parameter filez--model_tagz[Pretrained model tag. If specify this option, *_train_config and *_file will be overwrittenzQuantization relatedz--quantize_asr_modelz(Apply dynamic quantization to ASR model.z--quantize_lmz!Apply dynamic quantization to LM.z--quantize_modules*r,   zList of modules to be dynamically quantized.
        E.g.: --quantize_modules=[Linear,LSTM,GRU].
        Each specified module should be an attribute of 'torch.nn', e.g.:
        torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...)r   nargsr  r  z--quantize_dtyper-   rG   zDtype for dynamic quantization.zBeam-search relatedz--batch_sizezThe batch size for inferencez--nbestzOutput N-best hypothesesz--beam_sizer(   z	Beam sizez	--penaltyr%   zInsertion penaltyz--maxlenratiozInput length ratio to obtain max output length. If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengths.If maxlenratio<0.0, its absolute value is interpretedas a constant max output lengthz--minlenratioz.Input length ratio to obtain min output lengthz--ctc_weightr)   zCTC weight in joint decodingz--lm_weightr*   zRNNLM weightz--ngram_weightr+   zngram weightz--streamingz--transducer_confz1The keyword arguments for transducer beam search.)r  r  zText converter relatedz--token_type)charrd   NzIThe token type for ASR model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training args)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   r   add_argument_groupr   r   r   r   )parsergrouprH   rH   rN   
get_parser  s>  





r  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigrH   )	printr!   sysstderrr  
parse_argsvarspopr   )cmdr  argsr   rH   rH   rN   main  s   
r!  __main__r   )Ir  rk   r  distutils.versionr   pathlibr   typingr   r   r   r   r   r	   numpyr   rJ   torch.quantization	typeguardr
   r   -espnet2.asr.transducer.beam_search_transducerr   r   r   r   r   espnet2.fileio.datadir_writerr   espnet2.tasks.lmr   espnet2.tasks.slur   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.nets.batch_beam_searchr   (espnet.nets.batch_beam_search_online_simr   espnet.nets.beam_searchr   3espnet.nets.pytorch_backend.transformer.subsamplingr   espnet.nets.scorer_interfacer   espnet.nets.scorers.ctcr    espnet.nets.scorers.length_bonusr    espnet.utils.cli_utilsr!   r"   r   r   r   r   rs   r   r  r!  r   rH   rH   rH   rN   <module>   s      P	

 !"
 	 
9
	
