o
    ig                  C   @   sX  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- dDde.fddZ/de	e. dej0fddZ1dej0de	e	e.  fddZ2G dd dZ3de4de5de5de.d e4d!e.d"e.d#e.d$e5d%e5d&e5d'e.d(e.d)ee.e4f d*eee4e4e4f  d+e
e4 d,e
e4 d-e
e4 d.e
e4 d/e
e4 d0e
e4 d1e
e4 d2e
e4 d3e
e4 d4e
e4 d5e6d6e6d7e6d8e6d9e.d:e.d;e.d<e
e4 fBd=d>Z7d?d@ Z8dDdAdBZ9e:dCkr*e9  dS dS )E    N)Path)AnyDictListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)DatadirWriter)nbest_am_lm_scores)ASRTask)LMTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)get_commandline_argstotal_elementsc                    s\    d dkrdg    fddt dt D }|dur,t||kr,||t|  |S )a#  convert indices to split_size

    During decoding, the api torch.tensor_split should be used.
    However, torch.tensor_split is only available with pytorch >= 1.8.0.
    So torch.split is used to pass ci with pytorch < 1.8.0.
    This fuction is used to prepare input for torch.split.
    r   c                    s    g | ]} |  |d    qS )    ).0iindicesr   P/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/asr_inference_k2.py
<listcomp>'   s     z)indices_to_split_size.<locals>.<listcomp>r   N)rangelensumappend)r   r   
split_sizer   r   r    indices_to_split_size   s   
r'   tokensreturnc                 C   s   d| v sJ dt | }|}d}t|D ];}t|D ]*}||kr0|| d| d| |  d7 }q|| d| d| |  d| |  d7 }q|| d| d7 }q|| 7 }tjj|dd	}t|}|S )
a	  Build CTC topology.

    A token which appears once on the right side (i.e. olabels) may
    appear multiple times on the left side (ilabels), possibly with
    epsilons in between.
    When 0 appears on the left side, it represents the blank symbol;
    when it appears on the right side, it indicates an epsilon. That
    is, 0 has two meanings here.
    Args:
      tokens:
        A list of tokens, e.g., phones, characters, etc.
    Returns:
      Returns an FST that converts repeated tokens to a single token.
    r   z%We assume 0 is ID of the blank symbol  z 0 0.0
z 0.0
z -1 -1 0.0
r   )num_aux_labels)r#   r"   k2Fsafrom_strarc_sort)r(   
num_statesfinal_statearcsr   jansr   r   r    build_ctc_topo/   s    *

r6   
best_pathsc                 C   s   t | jtjr*| jd}| j | }|d}|d}t||	 }n| j d}t|| j}|d}|j
dksEJ | S )a  Extract the texts from the best-path FSAs.

     Args:
         best_paths:  a k2.Fsa with best_paths.arcs.num_axes() == 3, i.e.
                  containing multiple FSAs, which is expected to be the result
                  of k2.shortest_path (otherwise the returned values won't
                  be meaningful).  Must have the 'aux_labels' attribute, as
                a ragged tensor.
    Return:
        Returns a list of lists of int, containing the label sequences we
        decoded.
    r   r      )
isinstance
aux_labelsr-   RaggedTensorremove_values_leqr3   shapecomposeremove_axisvaluesnum_axestolist)r7   r:   	aux_shaper   r   r    	get_textsQ   s   


rD   c                C   @   sp  e Zd ZdZ																	
										
	
								d=deeef deeef deeef deeef dedededededededededed ed!ed"e	d#ed$ed%ed&ed'ed(ed)e	d*e
e d+e	d,e	d-ed.ed/ed0ed1ed2efBd3d4Ze d5eeeejejf f d6eee
e ee ee ef  fd7d8Ze	d>d9e
e d:e
e fd;d<ZdS )?k2Speech2Texta  Speech2Text class

    Examples:
        >>> import soundfile
        >>> speech2text = k2Speech2Text("asr_config.yml", "asr.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech = np.expand_dims(audio, 0) # shape: [batch_size, speech_length]
        >>> speech_lengths = np.array([audio.shape[0]]) # shape: [batch_size]
        >>> batch = {"speech": speech, "speech_lengths", speech_lengths}
        >>> speech2text(batch)
        [(text, token, token_int, score), ...]

    Ncpu        r   float32         ?      ?F      '  T    d   asr_train_configasr_model_filelm_train_configlm_file
token_typebpemodeldevicemaxlenratiominlenratio
batch_sizedtype	beam_size
ctc_weight	lm_weightpenaltynbest	streamingsearch_beam_sizeoutput_beam_sizemin_active_statesmax_active_states
blank_biaslattice_weightis_ctc_decodinglang_diruse_fgram_rescoringuse_nbest_rescoring	am_weightdecoder_weightnnlm_weight	num_pathsnbest_batch_sizenll_batch_sizec"           )      C   s  t  sJ t|||\}"}#|"jtt|d  |"j}$|d ur,t|||\}%}&|%| _	|| _
|| _|| _| j
s<J d| j
rMttttt|$| _| j|| _|d u r[|#j}|d u rb|#j}|d u rid }'n|dkr{|d urxt||d}'nd }'nt|d}'t|$d}(td|'  td|  |"| _|#| _|(| _|'| _|| _|| _|| _ || _!|| _"|| _#|| _$|| _%|| _&|| _'|| _(|| _)| | _*|!| _+d S )	Nr\   0Currently, only ctc_decoding graph is supported.bpe)rV   rW   )rV   )
token_listzText tokenizer: zRunning on : ),r
   r   build_model_from_filetogetattrtorchevalrv   r   lmri   rk   rl   r-   r0   r6   listr"   r#   decode_graphrV   rW   r   r   logginginfo	asr_modelasr_train_args	converter	tokenizerrX   r\   rc   rd   re   rf   rg   rh   rm   rn   ro   rp   rq   rr   ))selfrR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   r   r   rv   r|   lm_train_argsr   r   r   r   r    __init__   sh   
$


zk2Speech2Text.__init__batchr)   c           *      C   s  t  sJ t|d tjrt|d |d< t|d tjr't|d |d< t|| jd}| jj	di |\}}tj
jj| jj|dd}|dddddf  | j7  < |d}td|d tj}tj|gtjdd }| d tj}tj|||gd	d}	|	tj}	t||	}
t| j|
| j| j| j| j }| j!| j"9  _!g }| j#rt$|| j%| j| j&\}}}}}}td
d |D | j}t'|}g }|D ]#}|(ttj|tj)dtj| jj*g|+ t,|  tj)dg qt-|tj)| j}|.d|tj)| j}|.d|tj)| j}| j/||||| j0 }d||| jj*k< | j1/||| j0\}}|j2d	d }| j3| | j4|  | j5|  }t6|7 |dd}t8||}g }g } d}!|D ]4}"|"9 dkr n*|!t:|" }#|#t,|k sJ ||# }$|!|"9 7 }!|(|$ | (|"' +  qzt,|t,|ksJ ntj;|dd}%|%j<ddd7 } t=|%}t,| t,|ksJ t>|| D ]"\}&}'| j?@|&}(| jAdusJ | jAB|(})|(|)|(|&|'f qtC|s	J |S )zInference

        Args:
            batch: Input speech data and corresponding lengths
        Returns:
            text, token, token_int, hyp

        speechspeech_lengths)rX   r8   )dimNr   rs   r   c                 S   s   g | ]}t |qS r   )r#   )r   hypr   r   r    r!   I  s    z*k2Speech2Text.__call__.<locals>.<listcomp>)r   T)use_double_scoresF)r   log_semiringr   )Dr
   r9   npndarrayrz   tensorr   rX   r   encodenn
functionallog_softmaxctcctc_lorg   sizearange	unsqueezetrx   int32zerosrF   catr-   DenseFsaVecintersect_dense_prunedr~   rc   rd   re   rf   scoresrh   rl   r   rp   rq   maxr%   long	ignore_iditemr#   stackindex_selectbatchify_nllrr   r|   r$   rm   rn   ro   r'   rB   splitnelementargmaxshortest_pathget_tot_scoresrD   zipr   
ids2tokensr   tokens2textr   )*r   r   encencoder_out_lenslogp_encoder_outputr[   sequence_idxstart_frame
num_framessupervision_segmentsdense_fsa_veclatticesresults	am_scores	lm_scores	token_idsnew2oldpath_to_seq_mapseq_to_path_splitsys_pad_lensmax_token_lengthys_pad_listr   ys_padencoder_outdecoder_scoresnnlm_nll	x_lengthsnnlm_scoresbatch_tot_scoresr&   hypsr   processed_seqs
tot_scoresbest_seq_idxbest_token_seqsr7   	token_intscoretokentextr   r   r    __call__   s   
 

zk2Speech2Text.__call__	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a#  Build k2Speech2Text instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            Speech2Text: Speech2Text instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.r   )espnet_model_zoo.downloaderr   ImportErrorr   errorupdatedownload_and_unpackrE   )r   r   r   dr   r   r    from_pretrained  s   zk2Speech2Text.from_pretrained) NNNNNrF   rG   rG   r   rH   rI   rJ   rK   rG   r   FrL   rL   rM   rN   rG   rK   TNFFrK   rJ   rK   rO   rP   rQ   N)__name__
__module____qualname____doc__r	   r   strfloatintboolr   r   rz   no_gradr   Tensorr   r   r   r   r   staticmethodr   r   r   r   r   r    rE   s   s    



	
 !"
e ArE   
output_dirrY   rZ   r[   r\   r]   ngpuseedr^   r_   r`   ra   num_workers	log_leveldata_path_and_name_and_typekey_filerR   rS   rT   rU   word_lm_train_configword_lm_filer   rV   rW   allow_variable_data_keysrb   ri   rl   rp   rq   rr   	k2_configc!           7      C   s  |sJ dt  sJ |dkrtdtj|dd |dkr!d}!nd}!t| t| }"t|"}#W d    n1 s;w   Y  td0i d|d	|d
|d|d|d|d|!d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|}$td0i |$|#}$t	j
d0d|i|$}%tj|||||t|%jdt|%jd|dd 	}&t| }'tj }(t|&D ]\})\}*}+|)d! d"krtd#|) d$ t|+tsJ t|+td%d& |*D sJ |*ttt|+ },t|*|,ksJ t|* d'|, |%|+}-t|-D ]9\}.\}/}0}1}2|*|. }3|'d( }4d)|0|4d* |3< d)tt|1|4d+ |3< t|2|4d, |3< |/d urF|/|4d- |3< qqtj }5|5|( }6td.|6j d/ W d    d S 1 shw   Y  d S )1Nrt   r   z%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudarF   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r\   r]   r^   r_   r`   ra   rb   ri   rl   rp   rq   rr   r   FT)r\   r[   r   r   preprocess_fn
collate_fnr   	inference
   r   zProcessing z batchc                 s   s    | ]}t |tV  qd S r   )r9   r   )r   sr   r   r    	<genexpr>0  s    zinference.<locals>.<genexpr>z != 1best_recogr+   r   r   r   r   zDecoding duration is z secondsr   ) r
   NotImplementedErrorr   basicConfigr   openyaml	safe_loaddictrE   r   r   build_streaming_iteratorbuild_preprocess_fnr   build_collate_fnr   datetimenow	enumerater   r9   typeallr#   nextiterr@   joinmapr   seconds)7r   rY   rZ   r[   r\   r]   r   r   r^   r_   r`   ra   r   r   r   r   rR   rS   rT   rU   r   r   r   rV   rW   r   rb   ri   rl   rp   rq   rr   r   rX   k2_config_filedict_k2_configspeech2text_kwargsspeech2textloaderwriterstart_decoding_time	batch_idxkeysr   _bsr   key_idxr   r   r   r   keybest_writerend_decoding_timedecoding_durationr   r   r    r     s   #

	


$

$r   c                  C   s  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# | d$}|jd%td&d' |jd(td)d' |jd*td+d' |jd,td-d' |jd.td/d' |jd0td1d' |jd2td3d' | d4}|jd5tdd6d |jd7tdd8d |jd9td:d;d |jd<td=d>d |jd?td=d@d |jdAtd=dBd |jdCtdDdEd |jdFtdGdHd |jdIt
d"d# | dJ}|jdKt	d g dLdMd	 |jdNt	d dOd |jdPt
ddQd |jdRt
d"d# |jdStdTdUd |jdVtdWdXd |jdYtdZd[d |jd\td]d' | S )^NzASR Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upper)xr   r   r    <lambda>Q  s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr'  DEBUGNOTSETzThe verbose level of logging)r  defaultchoiceshelpz--output_dirT)r  requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r  r-  r/  z--seedzRandom seedz--dtyperH   )float16rH   float64z	Data type)r-  r.  r/  z--num_workersr   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer%   )r  r0  actionz
--key_file)r  z--allow_variable_data_keysF)r  r-  zThe model configuration relatedz--asr_train_configzASR training configuration)r  r/  z--asr_model_filezASR model parameter filez--lm_train_configzLM training configurationz	--lm_filezLM parameter filez--word_lm_train_configzWord LM training configurationz--word_lm_filezWord LM parameter filez--model_tagz[Pretrained model tag. If specify this option, *_train_config and *_file will be overwrittenzBeam-search relatedz--batch_sizezThe batch size for inferencez--nbestzOutput N-best hypothesesz--beam_sizerL   z	Beam sizez	--penaltyrG   zInsertion penaltyz--maxlenratiozInput length ratio to obtain max output length. If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengthsz--minlenratioz.Input length ratio to obtain min output lengthz--ctc_weightrJ   zCTC weight in joint decodingz--lm_weightrK   zRNNLM weightz--streamingzText converter relatedz--token_type)charru   NzIThe token type for ASR model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training argsz--is_ctc_decodingz"Use ctc topology as decoding graphz--use_nbest_rescoringz--num_pathsrO   z&The third argument for k2.random_pathsz--nbest_batch_sizerP   z<batchify nbest list when computing am/lm scores to avoid OOMz--nll_batch_sizerQ   z4batch_size when computing nll during nbest rescoringz--k2_configz Config file for decoding with k2)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   r   add_argument_groupr   r   r   r   )parsergroupr   r   r    
get_parserG  s&  


	
r<  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigr   )	printr   sysstderrr<  
parse_argsvarspopr   )cmdr:  argsr   r   r   r    main  s   
rG  __main__r   );r6  r	  r   r@  pathlibr   typingr   r   r   r   r   r   r	   r-   numpyr   rz   r  	typeguardr
   r   espnet2.fileio.datadir_writerr   espnet2.fst.lm_rescorer   espnet2.tasks.asrr   espnet2.tasks.lmr   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.utils.cli_utilsr   r   r'   r.   r6   rD   rE   r   r   r   r   r<  rG  r   r   r   r   r    <module>   s   $""  V	

 !
  
&
	
