o
    i.                  "   @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ G dd dZ,de-de.de-de.de.de.dee.e-f de	e
e-e-e-f  dee- de-de-dee- dee- d ee- d!e/d"e.d#e0f"d$d%Z1d&d' Z2d+d(d)Z3e4d*kre3  dS dS ),    N)Path)AnyListOptionalSequenceTupleUnion)check_argument_typescheck_return_type)MaskCTCInference)DatadirWriter)ASRTask)build_tokenizer)TokenIDConverter)	to_device)set_all_random_seed)config_argparse)str2boolstr2triple_strstr_or_none)
Hypothesis)TooShortUttError)get_commandline_argsc                   @   s   e Zd ZdZ								ddeeef d	eeef d
ededededededefddZ	e
 dee
jejf deeee ee ee ef  fddZe	ddee dee fddZdS )Speech2Texta  Speech2Text class

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

    Ncpu   float32
   Gz?asr_train_configasr_model_file
token_typebpemodeldevice
batch_sizedtypemaskctc_n_iterationsmaskctc_threshold_probabilityc
                 C   s   t  sJ t|||\}
}|
jtt|d  |
j}t|
||	d}|j|tt|d  |d u r6|j	}|d u r=|j
}|d u rDd }n|dkrV|d urSt||d}nd }nt|d}t|d}td|  |
| _|| _|| _|| _|| _|| _|| _d S )	N)r%   )	asr_modeln_iterationsthreshold_probability)r#   r%   bpe)r!   r"   )r!   )
token_listzText tokenizer: )r	   r   build_model_from_filetogetattrtorchevalr,   r   r!   r"   r   r   logginginfor(   asr_train_argss2t	converter	tokenizerr#   r%   )selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r4   r,   r5   r7   r6    r9   U/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/asr_inference_maskctc.py__init__&   s@   



zSpeech2Text.__init__speechreturnc                 C   s8  t  sJ t|tjrt|}|dtt| j	}|j
dgtj|dd}||d}t|| jd}| jjd
i |\}}t|trI|d }t|dksUJ t|| |d }t|tsgJ t||jdd  }ttdd |}| j|}| jd	ur| j|}	nd	}	|	|||fg}
t|
sJ |
S )zInference

        Args:
            data: Input speech data
        Returns:
            text, token, token_int, hyp

        r   r   )r%   
fill_value)r<   speech_lengths)r#   c                 S   s   | dkS )Nr   r9   xr9   r9   r:   <lambda>       z&Speech2Text.__call__.<locals>.<lambda>Nr9   )r	   
isinstancenpndarrayr0   tensor	unsqueezer.   r/   r%   new_fulllongsizer   r#   r(   encodetuplelenr5   r   typeyseqtolistlistfilterr6   
ids2tokensr7   tokens2textr
   )r8   r<   lengthsbatchenc_hyp	token_inttokentextresultsr9   r9   r:   __call__\   s,   




zSpeech2Text.__call__	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a!  Build Speech2Text instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            Speech2Text: Speech2Text instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.r9   )espnet_model_zoo.downloaderrc   ImportErrorr2   errorupdatedownload_and_unpackr   )ra   rb   rc   dr9   r9   r:   from_pretrained   s   zSpeech2Text.from_pretrained)NNNr   r   r   r   r   N)__name__
__module____qualname____doc__r   r   strintfloatr;   r0   no_gradTensorrF   rG   r   r   r   r   r`   staticmethodr   rj   r9   r9   r9   r:   r      sT    

	

66r   
output_dirr$   r%   ngpuseednum_workers	log_leveldata_path_and_name_and_typekey_filer   r    ra   r!   r"   allow_variable_data_keysr&   r'   c           !      C   sH  t  sJ |dkrtd|dkrtdtj|dd |dkr#d}nd}t| t|	|
|||||||d	}tjdd	|i|}tj	|||||t
|jd
t|jd
|dd	}t| }|D ]\}}t|tsnJ t|tdd |D s{J |ttt| }t||ksJ t| d| dd | D }z	|di |}W n. ty } z"td| d|  tdi i g d}ddgdg|gg}W Y d }~nd }~ww |d }|d \}}}}|d } d|| d |< dtt|| d |< t|j| d |< |d ur|| d |< q_W d    d S 1 sw   Y  d S )Nr   z!batch decoding is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatcudar   )	r   r    r!   r"   r#   r$   r%   r&   r'   ra   FT)r%   r$   r|   ry   preprocess_fn
collate_fnr}   	inferencec                 s   s    | ]}t |tV  qd S rk   )rE   rp   ).0sr9   r9   r:   	<genexpr>   s    zinference.<locals>.<genexpr>z != c                 S   s$   i | ]\}}| d s||d qS )_lengthsr   )endswith)r   kvr9   r9   r:   
<dictcomp>   s   $ zinference.<locals>.<dictcomp>z
Utterance  g        )scorescoresstatesrQ   z<space>   r   1best_recogr]   r\   r   r^   r9   )r	   NotImplementedErrorr2   basicConfigr   dictr   rj   r   build_streaming_iteratorbuild_preprocess_fnr4   build_collate_fnr   rE   rP   allrO   nextitervaluesitemsr   warningr   joinmaprp   r   )!rv   r$   r%   rw   rx   ry   rz   r{   r|   r   r    ra   r!   r"   r}   r&   r'   r#   speech2text_kwargsspeech2textloaderwriterkeysrX   _bsr_   er[   keyr^   r]   r\   ibest_writerr9   r9   r:   r      s   

"
$r   c                  C   sl  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | d}|jdtddd |jdt	d  |jd!t
d"d# | d$}|jd%tdd |jd&tdd |jd'td(d) | d*}|jd+tdd,d |jd-td.d# |jd/td0d# | d1}|jd2t	d g d3d4d	 |jd5t	d d6d | S )7NzASR Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S rk   )upperrA   r9   r9   r:   rC      rD   zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)rP   defaultchoiceshelpz--output_dirT)rP   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)rP   r   r   z--seedzRandom seedz--dtyper   )float16r   float64z	Data type)r   r   r   z--num_workersr   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typeappend)rP   r   actionz
--key_file)rP   z--allow_variable_data_keysF)rP   r   zThe model configuration relatedz--asr_train_configz--asr_model_filez--model_tagz[Pretrained model tag. If specify this option, *_train_config and *_file will be overwritten)rP   r   zDecoding relatedz--batch_sizezThe batch size for inferencez--maskctc_n_iterationsr   z--maskctc_threshold_probabilityr   zText converter relatedz--token_type)charr+   NzIThe token type for ASR model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training args)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentrp   rq   add_argument_groupr   r   r   rr   )parsergroupr9   r9   r:   
get_parser  s   



r   c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigr9   )	printr   sysstderrr   
parse_argsvarspopr   )cmdr   argsrb   r9   r9   r:   mainm  s   
r   __main__rk   )5r   r2   r   pathlibr   typingr   r   r   r   r   r   numpyrF   r0   	typeguardr	   r
   espnet2.asr.maskctc_modelr   espnet2.fileio.datadir_writerr   espnet2.tasks.asrr   espnet2.text.build_tokenizerr   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.nets.beam_searchr   3espnet.nets.pytorch_backend.transformer.subsamplingr   espnet.utils.cli_utilsr   r   rp   rq   boolrr   r   r   r   rl   r9   r9   r9   r:   <module>   s|     
	

d
W

