o
    }oi3                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZ d dl
mZ d dlm  m  mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z! G dd deZ"G dd deZ#G dd deZ$G dd de$Z%dS )    N)ABCabstractmethod)List)
DictConfig	OmegaConf	open_dict)Tensor)ASRAdapterModelMixin)StreamingEncoder)asr_module_utils)
Hypothesis)
tokenizers)	app_stateloggingc                   @   s   e Zd ZdZdZdefddZdefddZdefdd	ZddefddZ	dd Z
dd ZdefddZdedefddZdd Zd
S )ASRBPEMixina9  ASR BPE Mixin class that sets up a Tokenizer via a config

    This mixin class adds the method `_setup_tokenizer(...)`, which can be used by ASR models
    which depend on subword tokenization.

    The setup_tokenizer method adds the following parameters to the class -
        -   tokenizer_cfg: The resolved config supplied to the tokenizer (with `dir` and `type` arguments).
        -   tokenizer_dir: The directory path to the tokenizer vocabulary + additional metadata.
        -   tokenizer_type: The type of the tokenizer. Currently supports `bpe` and `wpe`, as well as `agg`.
        -   vocab_path: Resolved path to the vocabulary text file.

    In addition to these variables, the method will also instantiate and preserve a tokenizer
    (subclass of TokenizerSpec) if successful, and assign it to self.tokenizer.

    The mixin also supports aggregate tokenizers, which consist of ordinary, monolingual tokenizers.
    If a conversion between a monolongual and an aggregate tokenizer (or vice versa) is detected,
    all registered artifacts will be cleaned up.
    langstokenizer_cfgc                 C   sH   | d}|d u rtd| dkr| | n| | |   d S )Ntypez`tokenizer.type` cannot be Noneagg)get
ValueErrorlower_setup_aggregate_tokenizer_setup_monolingual_tokenizer_derive_tokenizer_properties)selfr   tokenizer_type r   \/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/mixins/mixins.py_setup_tokenizer;   s   

zASRBPEMixin._setup_tokenizerc           	         sd  dt jvr
dt jd< tj|dd| _| jd| _| jd | _| jdi | _	| 
  t| drfd	| jv rf| j| jj_| j| jj_d|v rft| jj |d| jj_W d    n1 saw   Y  | jd
vrotd| jdkr?d| jv r| jd}nt j| jd}| d|}|| _d| jv r| jd }|d urtdd| jv r| |d d |d| _ntj|d| _d| jv r| jd}nt j| jd}| d|}|| _zd| jv r| jd}nt j| jd}| d|}|| _W n ty   d | _Y nw i  t| jjD ]}| j |g}|d }|d  |< q fdd}t! | jj_|| jj_"| jj#| jj_$ncd| jv rL| jd}nt j| jd}| d|}|| _d| jv ri| jd tj%d | j| j	d!d | j	d"d | j	d#d | j	d$d | j	d%d | j	d&d | j	d'd | j	d(d)d*
| _t&'d+(| jj)j*| jj d S ),NTOKENIZERS_PARALLELISMfalseTresolvedirr   	hf_kwargscfg	tokenizerbpewpezc`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or `wpe` for BERT based tokenizerr)   
model_pathtokenizer.modeltokenizer.model_pathspecial_tokensL`special_tokens` are no longer supported for SentencePiece based tokenizers.custom_tokenizer_target_)r1   r+   r+   
vocab_path	vocab.txttokenizer.vocab_pathspe_tokenizer_vocabtokenizer.vocabtokenizer.spe_tokenizer_vocabr      c                          S Nr   r   
vocabularyr   r   	get_vocab      z;ASRBPEMixin._setup_monolingual_tokenizer.<locals>.get_vocabbert-base-cased
mask_token	bos_token	eos_token	pad_token	sep_token	cls_token	unk_tokenuse_fastF
pretrained_model_name
vocab_filerA   rB   rC   rD   rE   rF   rG   rH   'Tokenizer {} initialized with {} tokens)+osenvironr   to_containerr   poptokenizer_dirr   r   hf_tokenizer_kwargs1_cleanup_aggregate_config_and_artifacts_if_neededhasattrr&   r'   r$   r   r   r   r%   r   pathjoinregister_artifactr+   from_config_dictr   SentencePieceTokenizerr3   spe_vocab_pathFileNotFoundErrorrange
vocab_sizeids_to_tokenslenr>   special_token_to_idall_special_tokensAutoTokenizerr   infoformat	__class____name__)	r   r   r+   r.   r3   rZ   ipiecer>   r   r<   r   r   F   s   











z(ASRBPEMixin._setup_monolingual_tokenizerc           	   	   C   s  dt jvr
dt jd< tj|dd| _d | _| jdd  | jd | _i | _	| jdi  t
d |   t| d	rHd
| jv rH| j| jj_i }| j| j  D ]S\}}| ||\}}}}|||< t| d	rt| jj. | j| j | d | jj| j | d< | j| j | d | jj| j | d< W d    n1 sw   Y  qRd|v r| |d d |d| _d S t|| _d S )Nr    r!   Tr"   r$   r   r%   z1_setup_tokenizer: detected an aggregate tokenizerr&   r'   r0   r1   )r1   r   )rM   rN   r   rO   r   rQ   rP   r   r   rR   r   rc   A_cleanup_monolingual_and_aggregate_config_and_artifacts_if_neededrT   r&   r'   r    AGGREGATE_TOKENIZERS_DICT_PREFIXitems_make_tokenizerr   rX   r   AggregateTokenizer)	r   r   tokenizers_dictlangtokenizer_configr'   r+   r3   rZ   r   r   r   r      sX   





z&ASRBPEMixin._setup_aggregate_tokenizerNc                    s  | d }| d}|dvrtdd }d }d }|dkrd|v r(| d}ntj|d}| d| j d	 | d
 |}d|v rN|d }|d urNtdtj	|d}	d|v r^| d}ntj|d}| d| j d	 | d |}z"d|v r| d}ntj|d}| d| j d	 | d |}W n t
y   d }Y nw i  t|	jD ]}
|	|
g}|d }|
d  |< q fdd}t |	j_||	j_|	j|	j_nbd|v r| d}ntj|d}| d| j d	 | d |}d|v r|d | di }tj| dd|| dd | dd | dd | dd | dd | d d | d!d | d"d#d$
}	td%|	jj|	j |	|||fS )&Nr   r$   r(   zb`tokenizer.type` must be either `bpe` for SentencePiece tokenizer or`wpe` for BERT based tokenizerr)   r+   r,   
tokenizer..z.model_pathr.   r/   r2   r3   r4   z.vocab_pathr6   r7   z.spe_tokenizer_vocabr   r9   c                      r:   r;   r   r   r<   r   r   r>   2  r?   z.ASRBPEMixin._make_tokenizer.<locals>.get_vocabr%   rJ   r@   rA   rB   rC   rD   rE   rF   rG   rH   FrI   rL   )r   r   r   rM   rU   rV   rW   rj   r   rY   r[   r\   r]   r^   r_   r'   r>   r`   ra   rP   rb   r   rc   rd   re   rf   )r   r   ro   r   rQ   r+   r3   rZ   r.   r'   rg   rh   r>   rR   r   r<   r   rl      s   










zASRBPEMixin._make_tokenizerc                 C   s   t | drCt| jj0 | jjdd | jjdd | jjdd | jjdd | jjdd W d   n1 s>w   Y  t | drw| jd	d | jd
d | jdd t| j D ]}|d| j	 d rv| j| qddS dS )a  
        Clean ups any monolingual and some aggregate config items and artifacts.
        We need to do this when we switch from a monolingual tokenizer to an aggregate one
        or go between aggregate tokenizers which could have a different number of languages
        r&   r$   Nr+   r3   r6   r%   	artifactsr-   r5   r8   rq   rr   )
rT   r   r&   r'   rP   rs   listkeys
startswithrj   r   akeyr   r   r   ri   ^  s$   

zMASRBPEMixin._cleanup_monolingual_and_aggregate_config_and_artifacts_if_neededc                 C   s   t | dr$t| jj | jj| jd W d   n1 sw   Y  t | drCt| j D ]}|	d| j d rB| j| q0dS dS )z
        Clean ups any aggregate config items and artifacts.
        We need to do this when we switch from an aggregate tokenizer to a monolingual one
        r&   Nrs   rq   rr   )
rT   r   r&   r'   rP   rj   rt   rs   ru   rv   rw   r   r   r   rS   x  s   

z=ASRBPEMixin._cleanup_aggregate_config_and_artifacts_if_needed	directoryc                 C   sl   t | ds	td| jdkr,| jjD ]}| jjj|}tj	||}| 
|| qdS | 
| jj| dS )z
        Save the model tokenizer(s) to the specified directory.

        Args:
            directory: The directory to save the tokenizer(s) to.
        r&   z}The model has not been initialized with a tokenizer yet. Please call the model's __init__ and _setup_tokenizer methods first.r   N)rT   RuntimeErrorr   r'   r   r&   r   rM   rU   rV   _extract_tokenizer_from_config)r   ry   ro   	subconfignew_dirr   r   r   save_tokenizers  s   

zASRBPEMixin.save_tokenizersr$   c                    s  t j|st j|dd g }| D ]5\}}t|tr3t j|r3t||}t	
d| d|  t|trH|drH|dd  |  qt|dkrt	d	| d
|  t| dsctdt }|| jj}|du rvtdd}	zt||	}
|
  W n tjy   d}	Y nw t||	}|D ]U  fdd| D }|D ]C}||| |jddd }t|dkrd|}n|d }t t j||jt j|| t	
d  dt j||  qqdS dS )a  
        Extracts the tokenizer from the config and write the objects to dir.
        The file may be from a local path (new model init) or from a .nemo file (restored model).
        If its from a newly initialized model, the file is copied to dir.
        If its from a restored model, the file is extracted from the .nemo file and copied to dir.

        Args:
            tokenizer_cfg: The tokenizer config to extract the tokenizer from.
            dir: The directory to write the tokenizer objects to.
        T)exist_okzSaved z at znemo:   Nr   z+Copying the following nemo file objects to z: 
model_guidzqThe model does not have a model_guid attribute. Please ensure that the model has been restored from a .nemo file.zsThe model has not been restored from a .nemo file. Cannot extract the tokenizer as the nemo file cannot be located.zr:zr:gzc                    s   g | ]	} |j v r|qS r   )name).0xnemo_object_namer   r   
<listcomp>  s    z>ASRBPEMixin._extract_tokenizer_from_config.<locals>.<listcomp>_r9   ) rM   rU   existsmakedirsrk   
isinstancestrshutilcopy2r   rc   rv   appendr_   debugrT   r   r   AppStateget_model_metadata_from_guidr   restoration_pathtarfileopenclose	ReadError
getmembersextractr   splitrV   rename)r   r   r$   nemo_file_objectskvlocappstaterestore_path
tar_headertar_testtarmembersmembernew_namer   r   r   r{     sX   

""z*ASRBPEMixin._extract_tokenizer_from_configc                 C   s@   | j j  }dd |D }t|| j _dd |D }|| j _d S )Nc                 S   s&   h | ]}t d d |D r| qS )c                 s   s    | ]}|  V  qd S r;   )isupper)r   charr   r   r   	<genexpr>  s    zEASRBPEMixin._derive_tokenizer_properties.<locals>.<setcomp>.<genexpr>)anystrip)r   tokenr   r   r   	<setcomp>  s   & z;ASRBPEMixin._derive_tokenizer_properties.<locals>.<setcomp>c                 S   s*   h | ]}|D ]}t |d r|qqS )P)unicodedatacategoryrv   )r   r   r   r   r   r   r     s   * )r'   r>   boolsupports_capitalizationsupported_punctuation)r   vocabcapitalized_tokenspunctuationr   r   r   r     s
   z(ASRBPEMixin._derive_tokenizer_propertiesr;   )rf   
__module____qualname____doc__rj   r   r   r   r   rl   ri   rS   r   r~   r{   r   r   r   r   r   r   $   s    u6mFr   c                   @   s   e Zd ZdZd'dedefddZ	d(ded	ee defd
dZ		d'dedefddZ
										d)dededededededee dedededefddZe 				d*dee ded ed!ed"ef
d#d$Z	d+dee defd%d&ZdS ),ASRModuleMixinaQ  
    ASRModuleMixin is a mixin class added to ASR models in order to add methods that are specific
    to a particular instantiation of a module inside of an ASRModel.

    Each method should first check that the module is present within the subclass, and support additional
    functionality if the corresponding module is present.
    Tcontext_windowupdate_configc                 C   s   t j| ||d dS )a  
        Update the context window of the SqueezeExcitation module if the provided model contains an
        `encoder` which is an instance of `ConvASREncoder`.

        Args:
            context_window:  An integer representing the number of input timeframes that will be used
                to compute the context. Each timeframe corresponds to a single window stride of the
                STFT features.

                Say the window_stride = 0.01s, then a context window of 128 represents 128 * 0.01 s
                of context to compute the Squeeze step.
            update_config: Whether to update the config or not with the new context window.
        )r   r   N)r   !change_conv_asr_se_context_window)r   r   r   r   r   r   r     s   
z0ASRModuleMixin.change_conv_asr_se_context_windowNself_attention_modelatt_context_sizec                 C   s   |du r
|du r
dS t | dstd dS t | jds#td dS | j|||| j |rQt| j || jj_|| jj_	W d   dS 1 sJw   Y  dS dS )a  
        Update the self_attention_model if function is available in encoder.

        Args:
            self_attention_model (str): type of the attention layer and positional encoding

                'rel_pos':
                    relative positional embedding and Transformer-XL

                'rel_pos_local_attn':
                    relative positional embedding and Transformer-XL with local attention using
                    overlapping windows. Attention context is determined by att_context_size parameter.

                'abs_pos':
                    absolute positional embedding and Transformer

                If None is provided, the self_attention_model isn't changed. Defauts to None.
            att_context_size (List[int]): List of 2 ints corresponding to left and right attention context sizes,
                or None to keep as it is. Defauts to None.
            update_config (bool): Whether to update the config or not with the new attention model.
                Defaults to True.
        NencoderzCould not change the self_attention_model in encoder since the model provided does not contain an `encoder` module in its config.change_attention_modelz;Model encoder doesn't have a change_attention_model method )
rT   r   rc   r   r   devicer   r&   r   r   )r   r   r   r   r   r   r   r     s"   


"z%ASRModuleMixin.change_attention_model subsampling_conv_chunking_factorc                 C   s   t | dstd dS t | jdstd dS | j| |r>t| j || jj_W d   dS 1 s7w   Y  dS dS )a  
        Update the conv_chunking_factor (int) if function is available in encoder.
        Default is 1 (auto)
        Set it to -1 (disabled) or to a specific value (power of 2) if you OOM in the conv subsampling layers

        Args:
            conv_chunking_factor (int)
        r   zCould not call the change_subsampling_conv_chunking_factor method in encoder since the model provided does not contain an `encoder` module in its config.N'change_subsampling_conv_chunking_factorzLModel encoder doesn't have a change_subsampling_conv_chunking_factor method )rT   r   rc   r   r   r   r&   r   )r   r   r   r   r   r   r   4  s   

"z6ASRModuleMixin.change_subsampling_conv_chunking_factorFprocessed_signalprocessed_signal_lengthcache_last_channelcache_last_timecache_last_channel_lenkeep_all_outputsprevious_hypothesesprevious_pred_outdrop_extra_pre_encodedreturn_transcriptionreturn_log_probsc              	   C   s   t | tjst | tjstdt|  dt | jts tdt | tjr/|
du r/t	d t | tjs>|du r>t	d | jj
|||||||	d\}}}}}t | tjsat | tjr| jd	krt| d
rm| j}| j}n| j}| j}||d}|jddd}g }|
rg }nd}t|D ]T\}}|du r|| }n
||d|| f }|durtj|| |fdd}||  t|| 7  < n|}|| |
r|j|d|||d  dd}||d  qd}n| jj||d|d}dd |D }|}||||||g}|r|| || t|S )a6  
        It simulates a forward step with caching for streaming purposes.
        It supports the ASR models where their encoder supports streaming like Conformer.
        Args:
            processed_signal: the input audio signals
            processed_signal_length: the length of the audios
            cache_last_channel: the cache tensor for last channel layers like MHA
            cache_last_channel_len: lengths for cache_last_channel
            cache_last_time: the cache tensor for last time layers like convolutions
            keep_all_outputs: if set to True, would not drop the extra outputs specified by encoder.streaming_cfg.valid_out_len
            previous_hypotheses: the hypotheses from the previous step for RNNT models
            previous_pred_out: the predicted outputs from the previous step for CTC models
            drop_extra_pre_encoded: number of steps to drop from the beginning of the outputs after the downsampling module. This can be used if extra paddings are added on the left side of the input.
            return_transcription: whether to decode and return the transcriptions. It can not get disabled for Transducer models.
            return_log_probs: whether to return the log probs, only valid for ctc model

        Returns:
            greedy_predictions: the greedy predictions from the decoder
            all_hyp_or_transcribed_texts: the decoder hypotheses for Transducer models and the transcriptions for CTC models
            cache_last_channel_next: the updated tensor cache for last channel layers to be used for next streaming step
            cache_last_time_next: the updated tensor cache for last time layers to be used for next streaming step
            cache_last_channel_next_len: the updated lengths for cache_last_channel
            best_hyp: the best hypotheses for the Transducer models
            log_probs: the logits tensor of current streaming chunk, only returned when return_log_probs=True
            encoded_len: the length of the output log_probs + history chunk log_probs, only returned when return_log_probs=True
        zstream_step does not support !1Encoder of this model does not support streaming!Fzfreturn_transcription can not be False for Transducer models as decoder returns the transcriptions too.Tz1return_log_probs can only be True for CTC models.)r   r   r   r   r   r   r   ctcctc_decoder)encoder_output)dimkeepdimN)r   r   r9   )decoder_outputsdecoder_lengthsreturn_hypotheses)r   encoded_lengthsr   partial_hypothesesc                 S   s   g | ]}|j qS r   )
y_sequence)r   hypr   r   r   r     s    z8ASRModuleMixin.conformer_stream_step.<locals>.<listcomp>)r   
asr_modelsEncDecRNNTModelEncDecCTCModelNotImplementedErrorr   r   r
   r   rc   cache_aware_stream_stepEncDecHybridRNNTCTCModelcur_decoderrT   ctc_decodingr   decodingdecoderargmax	enumeratetorchcatr_   r   ctc_decoder_predictions_tensor	unsqueezernnt_decoder_predictions_tensortuple)r   r   r   r   r   r   r   r   r   r   r   r   encodedencoded_lencache_last_channel_nextcache_last_time_nextcache_last_channel_next_lenr   r   	log_probspredictions_tensorgreedy_predictionsall_hyp_or_transcribed_texts	preds_idxpreds	preds_curgreedy_predictions_concatdecoded_outbest_hypresultr   r   r   conformer_stream_stepP  s   (








z$ASRModuleMixin.conformer_stream_step   paths2audio_files
batch_sizelogprobsr   online_normalizationc                 C   s  |du s
t |dkri S |r|rtdt| tjs$tdt|  dt| jts.td| 	|||}g }g }|D ]}	t
|	}
t |	j}| jj|d\}}}d}d}d}d}g }t|
D ]W\}\}}|dkrm| jjjnd}t : | j||||||	 |||d|p|d	}|s|r|\}}}}}}}}||  n|\}}}}}}W d   n1 sw   Y  q^|s|rtj|d
d}t||D ]\}}||d|  q|du r|dg| 7 }q;||7 }q;|r|S |s|S g }t||D ]\}}|t||ddd q|S )a  
        Args:
            paths2audio_files: (a list) of paths to audio files.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            logprobs: (bool) pass True to get log probabilities instead of transcripts.
            return_hypotheses: (bool) Either return hypotheses or text
                With hypotheses can do some postprocessing like getting timestamp or rescoring
            online_normalization: (bool) Perform normalization on the run per chunk.
        Returns:
            A list of transcriptions (or raw log probabilities if logprobs is True) in the same order as paths2audio_files
        Nr   zuEither `return_hypotheses` or `logprobs` can be True at any given time.Returned hypotheses will contain the logprobs.z$simulate streaming does not support r   r   )r  T)r   r   r   r   r   r   r   r   r   r   r   r9   )axis         )r   textscore	dec_state)r_   r   r   r   r   r   r   r   r
   &_setup_streaming_transcribe_dataloaderiterstreams_lengthget_initial_cache_stater   streaming_cfgr   r   inference_moder  is_buffer_emptyr   cpur   zipr   )r   r  r  r  r   r  data_loadertotal_log_probstotal_textsstreaming_bufferstreaming_buffer_iterr   r   r   r   pred_out_streamr   transcribed_textsbatch_log_probsstep_numchunk_audiochunk_lengthsr   r  cur_chunk_log_probsr   log_prob_lenhypsr  r   r   r   )transcribe_simulate_cache_aware_streaming  s   


	$
z8ASRModuleMixin.transcribe_simulate_cache_aware_streamingc                 c   s    ddl m} || |d}t|D ]?\}}|j|dd\}}	}
td|  |d | dks7|t|d krPtd|t| d  d	| d
 |V  |  qdS )a  
        Setup function for a temporary data loader which wraps the provided audio file.

        Args:
            paths2audio_files: (a list) of paths to audio files.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            online_normalization: whether to do online normalization
        Returns:
            a new batch streaming buffer
        r   )CacheAwareStreamingAudioBuffer)modelr  r   )	stream_idz!Added this sample to the buffer: r9   zStarting to stream samples z to z...N)0nemo.collections.asr.parts.utils.streaming_utilsr&  r   append_audio_filer   rc   r_   reset_buffer)r   r  r  r  r&  r  
sample_idxsampler   r   r(  r   r   r   r  J  s    $z5ASRModuleMixin._setup_streaming_transcribe_dataloader)T)NNT)
NNNNTNNNTF)r  FFF)F)rf   r   r   r   intr   r   r   r   r   r   r   r   r  r   no_gradr%  r  r   r   r   r   r     s    
.
	

 
qr   c                   @   s   e Zd Zedd ZdS )VerificationMixinc                 C   sj   t |ddd$}| D ]}| }|ddddd}|t|d	  q
W d   dS 1 s.w   Y  dS )
a  
        Takes paths to audio files and manifest filepath and creates manifest file with the audios
        Args:
            paths2audio_files: paths to audio fragment to be verified
            manifest_filepath: path to manifest file to bre created
        wzutf-8)encodingr
  N-infer)audio_filepathoffsetdurationr  label
)r   r   writejsondumps)r  manifest_filepathfp
audio_fileentryr   r   r   path2audio_files_to_manifestg  s   "z.VerificationMixin.path2audio_files_to_manifestN)rf   r   r   staticmethodrA  r   r   r   r   r0  f  s    r0  c                	   @   s0   e Zd Zeddee dedee fddZdS )	DiarizationMixinr9   r  r  returnc                 C   s   dS )z
        Takes paths to audio files and returns speaker labels
        Args:
            paths2audio_files: paths to audio fragment to be transcribed

        Returns:
            Speaker labels
        Nr   )r   r  r  r   r   r   diarizew  s   
zDiarizationMixin.diarizeN)r9   )rf   r   r   r   r   r   r.  rE  r   r   r   r   rC  v  s    &rC  )&r;  rM   r   r   r   abcr   r   typingr   r   	omegaconfr   r   r   r   nemo.collections.asr.modelscollectionsasrmodelsr   4nemo.collections.asr.parts.mixins.asr_adapter_mixinsr	   +nemo.collections.asr.parts.mixins.streamingr
    nemo.collections.asr.parts.utilsr   +nemo.collections.asr.parts.utils.rnnt_utilsr   nemo.collections.commonr   
nemo.utilsr   r   r   r   r0  rC  r   r   r   r   <module>   s4      K  |