"""Inference class definition for Transducer models."""

from __future__ import annotations

import argparse
import logging
import math
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from packaging.version import parse as V
from typeguard import check_argument_types, check_return_type

from espnet2.asr_transducer.beam_search_transducer import (
    BeamSearchTransducer,
    Hypothesis,
)
from espnet2.asr_transducer.utils import TooShortUttError
from espnet2.fileio.datadir_writer import DatadirWriter
from espnet2.tasks.asr_transducer import ASRTransducerTask
from espnet2.tasks.lm import LMTask
from espnet2.text.build_tokenizer import build_tokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.torch_utils.device_funcs import to_device
from espnet2.torch_utils.set_all_random_seed import set_all_random_seed
from espnet2.utils import config_argparse
from espnet2.utils.types import str2bool, str2triple_str, str_or_none
from espnet.utils.cli_utils import get_commandline_args


class Speech2Text:
    """Speech2Text class for Transducer models.

    Args:
        asr_train_config: ASR model training config path.
        asr_model_file: ASR model path.
        beam_search_config: Beam search keyword arguments.
        lm_train_config: Language Model training config path.
        lm_file: Language Model path.
        token_type: Type of token units.
        bpemodel: BPE model path.
        device: Device to use for inference.
        beam_size: Size of beam during search.
        dtype: Data type.
        lm_weight: Language model weight.
        quantize_asr_model: Whether to apply dynamic quantization to ASR model.
        quantize_modules: List of module names to apply dynamic quantization on.
        quantize_dtype: Dynamic quantization data type.
        nbest: Number of final hypotheses.
        streaming: Whether to perform chunk-by-chunk inference.
        chunk_size: Number of frames in chunk AFTER subsampling.
        left_context: Number of frames in left context AFTER subsampling.
        right_context: Number of frames in right context AFTER subsampling.
        display_partial_hypotheses: Whether to display partial hypotheses.

    """

    def __init__(
        self,
        asr_train_config: Union[Path, str] = None,
        asr_model_file: Union[Path, str] = None,
        beam_search_config: Dict[str, Any] = None,
        lm_train_config: Union[Path, str] = None,
        lm_file: Union[Path, str] = None,
        token_type: str = None,
        bpemodel: str = None,
        device: str = "cpu",
        beam_size: int = 5,
        dtype: str = "float32",
        lm_weight: float = 1.0,
        quantize_asr_model: bool = False,
        quantize_modules: List[str] = None,
        quantize_dtype: str = "qint8",
        nbest: int = 1,
        streaming: bool = False,
        chunk_size: int = 16,
        left_context: int = 32,
        right_context: int = 0,
        display_partial_hypotheses: bool = False,
    ) -> None:
        """Construct a Speech2Text object."""
        super().__init__()

        assert check_argument_types()

        asr_model, asr_train_args = ASRTransducerTask.build_model_from_file(
            asr_train_config, asr_model_file, device
        )

        if quantize_asr_model:
            if quantize_modules is not None:
                if not all([q in ["LSTM", "Linear"] for q in quantize_modules]):
                    raise ValueError(
                        "Only 'Linear' and 'LSTM' modules are currently supported "
                        "by PyTorch and in --quantize_modules"
                    )

                q_config = set([getattr(torch.nn, q) for q in quantize_modules])
            else:
                q_config = {torch.nn.Linear}

            if quantize_dtype == "float16" and (V(torch.__version__) < V("1.5.0")):
                raise ValueError(
                    "float16 dtype for dynamic quantization is not supported with "
                    "torch version < 1.5.0. Switching to qint8 dtype instead."
                )
            q_dtype = getattr(torch, quantize_dtype)

            asr_model = torch.quantization.quantize_dynamic(
                asr_model, q_config, dtype=q_dtype
            ).eval()
        else:
            asr_model.to(dtype=getattr(torch, dtype)).eval()

        if lm_train_config is not None:
            lm, lm_train_args = LMTask.build_model_from_file(
                lm_train_config, lm_file, device
            )
            lm_scorer = lm.lm
        else:
            lm_scorer = None

        # Build beam search over the Transducer decoder and joint network.
        if beam_search_config is None:
            beam_search_config = {}

        beam_search = BeamSearchTransducer(
            asr_model.decoder,
            asr_model.joint_network,
            beam_size,
            lm=lm_scorer,
            lm_weight=lm_weight,
            nbest=nbest,
            **beam_search_config,
        )

        token_list = asr_model.token_list

        if token_type is None:
            token_type = asr_train_args.token_type
        if bpemodel is None:
            bpemodel = asr_train_args.bpemodel

        if token_type is None:
            tokenizer = None
        elif token_type == "bpe":
            if bpemodel is not None:
                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
            else:
                tokenizer = None
        else:
            tokenizer = build_tokenizer(token_type=token_type)

        converter = TokenIDConverter(token_list=token_list)
        logging.info(f"Text tokenizer: {tokenizer}")

        self.asr_model = asr_model
        self.asr_train_args = asr_train_args
        self.device = device
        self.dtype = dtype
        self.nbest = nbest

        self.converter = converter
        self.tokenizer = tokenizer
        self.beam_search = beam_search

        self.streaming = streaming
        self.chunk_size = max(chunk_size, 0)
        self.left_context = max(left_context, 0)
        self.right_context = max(right_context, 0)

        if not streaming or chunk_size == 0:
            self.streaming = False
            self.asr_model.encoder.dynamic_chunk_training = False

        self.n_fft = asr_train_args.frontend_conf.get("n_fft", 512)
        self.hop_length = asr_train_args.frontend_conf.get("hop_length", 128)

        if asr_train_args.frontend_conf.get("win_length", None) is not None:
            self.frontend_window_size = asr_train_args.frontend_conf["win_length"]
        else:
            self.frontend_window_size = self.n_fft

        self.window_size = self.chunk_size + self.right_context
        self._raw_ctx = self.asr_model.encoder.get_encoder_input_raw_size(
            self.window_size, self.hop_length
        )

        self.last_chunk_length = (
            self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
        ) * self.hop_length

        self.reset_inference_cache()

    def reset_inference_cache(self) -> None:
        """Reset Speech2Text parameters."""
        self.frontend_cache = None

        self.asr_model.encoder.reset_streaming_cache(
            self.left_context, device=self.device
        )
        self.beam_search.reset_inference_cache()

        self.num_processed_frames = torch.tensor([[0]], device=self.device)

    def apply_frontend(
        self, speech: torch.Tensor, is_final: bool = True
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward frontend.

        Args:
            speech: Speech data. (S)
            is_final: Whether speech corresponds to the final (or only) chunk of data.

        Returns:
            feats: Features sequence. (1, T_in, F)
            feats: Features sequence. (1, T_in, F)
            feats_lengths: Features sequence length. (1, T_in, F)

        """
        if self.frontend_cache is not None:
            speech = torch.cat(
                [self.frontend_cache["waveform_buffer"], speech], dim=0
            )

        if is_final:
            if self.streaming and speech.size(0) < self.last_chunk_length:
                pad = torch.zeros(
                    self.last_chunk_length - speech.size(0), dtype=speech.dtype
                )
                speech = torch.cat([speech, pad], dim=0)

            speech_to_process = speech
            waveform_buffer = None
        else:
            n_frames = (
                speech.size(0) - (self.frontend_window_size - self.hop_length)
            ) // self.hop_length
            n_residual = (
                speech.size(0) - (self.frontend_window_size - self.hop_length)
            ) % self.hop_length

            speech_to_process = speech.narrow(
                0,
                0,
                (self.frontend_window_size - self.hop_length)
                + n_frames * self.hop_length,
            )
            waveform_buffer = speech.narrow(
                0,
                speech.size(0)
                - (self.frontend_window_size - self.hop_length)
                - n_residual,
                (self.frontend_window_size - self.hop_length) + n_residual,
            ).clone()

        # Compute features on the (1, S) waveform chunk, as done during training.
        speech_to_process = speech_to_process.unsqueeze(0).to(
            getattr(torch, self.dtype)
        )
        lengths = speech_to_process.new_full(
            [1], dtype=torch.long, fill_value=speech_to_process.size(1)
        )

        batch = {"speech": speech_to_process, "speech_lengths": lengths}
        batch = to_device(batch, device=self.device)

        feats, feats_lengths = self.asr_model._extract_feats(**batch)

        if self.asr_model.normalize is not None:
            feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)

        # Number of feature frames affected by the STFT window overlap at a chunk edge.
        n_ctx_frames = math.ceil(
            math.ceil(self.frontend_window_size / self.hop_length) / 2
        )

        # Drop the feature frames that overlap with the previous/next chunk.
        if is_final:
            if self.frontend_cache is not None:
                feats = feats.narrow(
                    1, n_ctx_frames, feats.size(1) - n_ctx_frames
                )
        else:
            if self.frontend_cache is None:
                feats = feats.narrow(1, 0, feats.size(1) - n_ctx_frames)
            else:
                feats = feats.narrow(
                    1, n_ctx_frames, feats.size(1) - 2 * n_ctx_frames
                )

        feats_lengths = feats.new_full(
            [1], dtype=torch.long, fill_value=feats.size(1)
        )

        if is_final:
            self.frontend_cache = None
        else:
            self.frontend_cache = {"waveform_buffer": waveform_buffer}

        return feats, feats_lengths
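
    # apply_frontend keeps the trailing (win_length - hop_length) samples plus any
    # residual in self.frontend_cache["waveform_buffer"] so that consecutive chunks
    # overlap consistently with the STFT frontend. A sketch of the expected call
    # pattern (the chunk boundary n is illustrative only):
    #
    #     feats, feats_len = speech2text.apply_frontend(speech[:n], is_final=False)
    #     feats, feats_len = speech2text.apply_frontend(speech[n:], is_final=True)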
zSpeech2Text.apply_frontendTUnion[torch.Tensor, np.ndarray]List[Hypothesis]c                 C  sx   t |tjrt|}| j||d\}}| jjj||| j	| j
| jd}| j|d |d}|  j	| j7  _	|r:|   |S )zSpeech2Text streaming call.

        Args:
            speech: Chunk of speech data. (S)
            is_final: Whether speech corresponds to the final chunk of data.

        Returns:
            nbest_hypothesis: N-best hypothesis.

        """
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        feats, feats_length = self.apply_frontend(speech, is_final=is_final)

        enc_out = self.asr_model.encoder.chunk_forward(
            feats,
            feats_length,
            self.num_processed_frames,
            left_context=self.left_context,
            right_context=self.right_context,
        )
        nbest_hyps = self.beam_search(enc_out[0], is_final=is_final)

        self.num_processed_frames += self.chunk_size

        if is_final:
            self.reset_inference_cache()

        return nbest_hyps
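
    # A chunk-by-chunk decoding sketch, mirroring what inference() does below. It
    # assumes a Speech2Text instance built with streaming=True and a 1-D waveform
    # `speech` (numpy array or torch.Tensor):
    #
    #     chunk = speech2text._raw_ctx            # raw samples consumed per step
    #     _end = 0
    #     for i in range(len(speech) // chunk):
    #         _end = (i + 1) * chunk
    #         speech2text.streaming_decode(speech[i * chunk : _end], is_final=False)
    #     final_hyps = speech2text.streaming_decode(speech[_end:], is_final=True)
    #     text = speech2text.hypotheses_to_results(final_hyps)[0][0]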
zSpeech2Text.streaming_decodec                 C  sR   t  sJ t|tjrt|}| |\}}| j||\}}| 	|d }|S )zSpeech2Text call.

        Args:
            speech: Speech data. (S)

        Returns:
            nbest_hypothesis: N-best hypothesis.

        """
        assert check_argument_types()

        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        feats, feats_lengths = self.apply_frontend(speech)

        enc_out, _ = self.asr_model.encoder(feats, feats_lengths)

        nbest_hyps = self.beam_search(enc_out[0])

        return nbest_hyps
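
    # Offline decoding of a single utterance. A sketch that assumes `soundfile` is
    # available and the audio matches the sampling rate used during training (paths
    # are placeholders):
    #
    #     import soundfile
    #
    #     speech2text = Speech2Text(
    #         asr_train_config="exp/asr_train/config.yaml",
    #         asr_model_file="exp/asr_train/valid.loss.ave.pth",
    #         beam_size=10,
    #     )
    #     speech, rate = soundfile.read("utt1.wav")
    #     nbest_hyps = speech2text(speech)
    #     text, tokens, token_ids, hyp = speech2text.hypotheses_to_results(nbest_hyps)[0]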
zSpeech2Text.__call__r   	List[Any]c                 C  sl   g }|D ]/}t tdd |j}| j|}| jdur"| j|}nd}|||||f t|s3J q|S )zBuild partial or final results from the hypotheses.

        Args:
            nbest_hyps: N-best hypothesis.

        Returns:
            results: Results containing different representations of the hypotheses.

        """
        results = []

        for hyp in nbest_hyps:
            token_int = list(filter(lambda x: x != 0, hyp.yseq))

            token = self.converter.ids2tokens(token_int)

            if self.tokenizer is not None:
                text = self.tokenizer.tokens2text(token)
            else:
                text = None

            results.append((text, token, token_int, hyp))

            assert check_return_type(results)

        return results

    @staticmethod
    def from_pretrained(
        model_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ) -> Speech2Text:
        """Build Speech2Text instance from the pretrained model.

        Args:
            model_tag: Model tag of the pretrained models.

        Return:
            : Speech2Text instance.

        """
        if model_tag is not None:
            try:
                from espnet_model_zoo.downloader import ModelDownloader
            except ImportError:
                logging.error(
                    "`espnet_model_zoo` is not installed. "
                    "Please install via `pip install -U espnet_model_zoo`."
                )
                raise

            d = ModelDownloader()
            kwargs.update(**d.download_and_unpack(model_tag))

        return Speech2Text(**kwargs)
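

# Building from a pretrained model distributed through `espnet_model_zoo`; the tag
# below is a placeholder for any Transducer model tag:
#
#     speech2text = Speech2Text.from_pretrained(
#         model_tag="<user_name>/<transducer_model_tag>",
#         beam_size=10,
#         nbest=5,
#     )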


def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    lm_weight: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    asr_train_config: Optional[str],
    asr_model_file: Optional[str],
    beam_search_config: Optional[dict],
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    model_tag: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    key_file: Optional[str],
    allow_variable_data_keys: bool,
    quantize_asr_model: Optional[bool],
    quantize_modules: Optional[List[str]],
    quantize_dtype: Optional[str],
    streaming: Optional[bool],
    chunk_size: Optional[int],
    left_context: Optional[int],
    right_context: Optional[int],
    display_partial_hypotheses: bool,
) -> None:
    """Transducer model inference.

    Args:
        output_dir: Output directory path.
        batch_size: Batch decoding size.
        dtype: Data type.
        beam_size: Beam size.
        ngpu: Number of GPUs.
        seed: Random number generator seed.
        lm_weight: Weight of language model.
        nbest: Number of final hypotheses.
        num_workers: Number of workers.
        log_level: Level of verbose for logs.
        data_path_and_name_and_type: Path, data name and type for the decoding data.
        asr_train_config: ASR model training config path.
        asr_model_file: ASR model path.
        beam_search_config: Beam search keyword arguments.
        lm_train_config: Language Model training config path.
        lm_file: Language Model path.
        model_tag: Model tag.
        token_type: Type of token units.
        bpemodel: BPE model path.
        key_file: Key file path.
        allow_variable_data_keys: Whether to allow variable data keys.
        quantize_asr_model: Whether to apply dynamic quantization to ASR model.
        quantize_modules: List of module names to apply dynamic quantization on.
        quantize_dtype: Dynamic quantization data type.
        streaming: Whether to perform chunk-by-chunk inference.
        chunk_size: Number of frames in chunk AFTER subsampling.
        left_context: Number of frames in left context AFTER subsampling.
        right_context: Number of frames in right context AFTER subsampling.
        display_partial_hypotheses: Whether to display partial hypotheses.

    """
    assert check_argument_types()

    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random seed
    set_all_random_seed(seed)

    # 2. Build Speech2Text
    speech2text_kwargs = dict(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        beam_search_config=beam_search_config,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        dtype=dtype,
        beam_size=beam_size,
        lm_weight=lm_weight,
        nbest=nbest,
        quantize_asr_model=quantize_asr_model,
        quantize_modules=quantize_modules,
        quantize_dtype=quantize_dtype,
        streaming=streaming,
        chunk_size=chunk_size,
        left_context=left_context,
        right_context=right_context,
    )
    speech2text = Speech2Text.from_pretrained(
        model_tag=model_tag,
        **speech2text_kwargs,
    )

    # 3. Build data iterator
    loader = ASRTransducerTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTransducerTask.build_preprocess_fn(
            speech2text.asr_train_args, False
        ),
        collate_fn=ASRTransducerTask.build_collate_fn(
            speech2text.asr_train_args, False
        ),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Decode and write N-best results
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys

            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
            assert len(batch.keys()) == 1, len(batch.keys())

            try:
                if speech2text.streaming:
                    speech = batch["speech"]

                    _steps = len(speech) // speech2text._raw_ctx
                    _end = 0

                    for i in range(_steps):
                        _end = (i + 1) * speech2text._raw_ctx

                        speech2text.streaming_decode(
                            speech[i * speech2text._raw_ctx : _end], is_final=False
                        )

                    final_hyps = speech2text.streaming_decode(
                        speech[_end : len(speech)], is_final=True
                    )
                else:
                    final_hyps = speech2text(**batch)

                results = speech2text.hypotheses_to_results(final_hyps)
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")

                hyp = Hypothesis(score=0.0, yseq=[], dec_state=None)
                results = [[" ", ["<space>"], [2], hyp]] * nbest

            key = keys[0]
            for n, (text, token, token_int, hyp) in zip(
                range(1, nbest + 1), results
            ):
                ibest_writer = writer[f"{n}best_recog"]

                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if text is not None:
                    ibest_writer["text"][key] = text


def get_parser():
    """Get Transducer model inference parser."""
    parser = config_argparse.ArgumentParser(
        description="ASR Transducer Decoding",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--log_level",
        type=lambda x: x.upper(),
        default="INFO",
        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
        help="The verbose level of logging",
    )

    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument(
        "--ngpu",
        type=int,
        default=0,
        help="The number of gpus. 0 indicates CPU mode",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed")
    parser.add_argument(
        "--dtype",
        default="float32",
        choices=["float16", "float32", "float64"],
        help="Data type",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=1,
        help="The number of workers used for DataLoader",
    )

    group = parser.add_argument_group("Input data related")
    group.add_argument(
        "--data_path_and_name_and_type",
        type=str2triple_str,
        required=True,
        action="append",
    )
    group.add_argument("--key_file", type=str_or_none)
    group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)

    group = parser.add_argument_group("The model configuration related")
    group.add_argument(
        "--asr_train_config", type=str, help="ASR training configuration"
    )
    group.add_argument("--asr_model_file", type=str, help="ASR model parameter file")
    group.add_argument("--lm_train_config", type=str, help="LM training configuration")
    group.add_argument("--lm_file", type=str, help="LM parameter file")
    group.add_argument(
        "--model_tag",
        type=str,
        help="Pretrained model tag. If specify this option, *_train_config and "
        "*_file will be overwritten",
    )

    group = parser.add_argument_group("Beam-search related")
    group.add_argument(
        "--batch_size", type=int, default=1, help="The batch size for inference"
    )
    group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    group.add_argument("--beam_size", type=int, default=5, help="Beam size")
    group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
    group.add_argument(
        "--beam_search_config",
        default={},
        help="The keyword arguments for transducer beam search.",
    )

    group = parser.add_argument_group("Text converter related")
    group.add_argument(
        "--token_type",
        type=str_or_none,
        default=None,
        choices=["char", "bpe", None],
        help="The token type for ASR model. If not given, refers from the training args",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model path of sentencepiece. If not given, refers from the training args",
    )

    group = parser.add_argument_group("Dynamic quantization related")
    group.add_argument(
        "--quantize_asr_model",
        type=bool,
        default=False,
        help="Apply dynamic quantization to ASR model.",
    )
    group.add_argument(
        "--quantize_modules",
        nargs="*",
        default=None,
        help="""Module names to apply dynamic quantization on.
        The module names are provided as a list, where each name is separated
        by a comma (e.g.: --quantize-config=[Linear,LSTM,GRU]).
        Each specified name should be an attribute of 'torch.nn', e.g.:
        torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
    )
    group.add_argument(
        "--quantize_dtype",
        type=str,
        default="qint8",
        choices=["float16", "qint8"],
        help="Dtype for dynamic quantization.",
    )

    group = parser.add_argument_group("Streaming related")
    group.add_argument(
        "--streaming",
        type=str2bool,
        default=False,
        help="Whether to perform chunk-by-chunk inference.",
    )
    group.add_argument(
        "--chunk_size",
        type=int,
        default=16,
        help="Number of frames in chunk AFTER subsampling.",
    )
    group.add_argument(
        "--left_context",
        type=int,
        default=32,
        help="Number of frames in left context of the chunk AFTER subsampling.",
    )
    group.add_argument(
        "--right_context",
        type=int,
        default=0,
        help="Number of frames in right context of the chunk AFTER subsampling.",
    )
    group.add_argument(
        "--display_partial_hypotheses",
        type=bool,
        default=False,
        help="Whether to display partial hypotheses during chunk-by-chunk inference.",
    )

    return parser


def main(cmd=None):
    print(get_commandline_args(), file=sys.stderr)

    parser = get_parser()
    args = parser.parse_args(cmd)
    kwargs = vars(args)
    kwargs.pop("config", None)

    inference(**kwargs)


if __name__ == "__main__":
    main()
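
# Example command-line invocation (paths and options are placeholders):
#
#     python -m espnet2.bin.asr_transducer_inference \
#         --output_dir exp/decode_test \
#         --ngpu 0 \
#         --data_path_and_name_and_type dump/raw/test/wav.scp,speech,sound \
#         --asr_train_config exp/asr_train/config.yaml \
#         --asr_model_file exp/asr_train/valid.loss.ave.pth \
#         --beam_size 10 \
#         --nbest 1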