o
    }oiӤ                    @   s:  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZ d dlmZmZ d dlmZmZ d d	lm Z m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z'm(Z( G dd deZ)G dd de)Z*G dd de)Z+eG dd dZ,eG dd de,Z-dS )    N)abstractmethod)	dataclassfieldis_dataclass)CallableDictListOptionalSetUnion)	OmegaConf)rnnt_beam_decodingrnnt_greedy_decodingtdt_beam_decoding)ConfidenceConfigConfidenceMixin)BlankLMScoreModePruningMode)
HypothesisNBestHypotheses)AggregateTokenizer)TokenizerSpec)logginglogging_modec                       s4  e Zd ZdZd:dedee f fddZ		d;dej	d	ej	d
e
deee  deee eee  f f
ddZdee deeeef  fddZdee dee fddZedeeeeeef f  deeeeeef f  dedee deeeeeef f  f
ddZedee defddZedee dee fddZedee defddZedee dee fd d!Zdee defd"d#Zd$d% Zd<d'ed(efd)d*Zed'ed+ee d,edeeeeeef f  fd-d.Zed'edeeeeeef f  fd/d0Z e	d:deeeeeef f  deeeeeef f  dee deeeeeef f  fd1d2Z!e	d:deeeeeef f  deeeeeef f  dee deeeeeef f  fd3d4Z"e		d=d5eeeeeef f  d6ee dee d7ee deeeeeef f  f
d8d9Z#  Z$S )>AbstractRNNTDecodingu/  
    Used for performing RNN-T auto-regressive decoding of the Decoder+Joint network given the encoder state.

    Args:
        decoding_cfg: A dict-like object which contains the following key-value pairs.
            strategy: str value which represents the type of decoding that can occur.
                Possible values are :
                -   greedy, greedy_batch (for greedy decoding).
                -   beam, tsd, alsd (for beam search decoding).

            compute_hypothesis_token_set: A bool flag, which determines whether to compute a list of decoded
                tokens as well as the decoded string. Default is False in order to avoid double decoding
                unless required.

            preserve_alignments: Bool flag which preserves the history of logprobs generated during
                decoding (sample / batched). When set to true, the Hypothesis will contain
                the non-null value for `alignments` in it. Here, `alignments` is a List of List of
                Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)).

                In order to obtain this hypothesis, please utilize `rnnt_decoder_predictions_tensor` function
                with the `return_hypotheses` flag set to True.

                The length of the list corresponds to the Acoustic Length (T).
                Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary.
                U is the number of target tokens for the current timestep Ti.

            tdt_include_token_duration: Bool flag, which determines whether predicted durations for each token
            need to be included in the Hypothesis object. Defaults to False.

            compute_timestamps: A bool flag, which determines whether to compute the character/subword, or
                word based timestamp mapping the output log-probabilities to discrete intervals of timestamps.
                The timestamps will be available in the returned Hypothesis.timestep as a dictionary.

            rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated.
                Can take the following values - "char" for character/subword time stamps, "word" for word level
                time stamps, "segment" for segment level time stamps and "all" (default), for character, word and
                segment level time stamps.

            word_seperator: Str token representing the seperator between words.

            segment_seperators: List containing tokens representing the seperator(s) between segments.

            segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for forming
            the segments.

            preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
                generated during decoding (sample / batched). When set to true, the Hypothesis will contain
                the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of ints.

            confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence
                scores. In order to obtain hypotheses with confidence scores, please utilize
                `rnnt_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True.

                preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
                    generated during decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of floats.

                    The length of the list corresponds to the Acoustic Length (T).
                    Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores.
                    U is the number of target tokens for the current timestep Ti.
                preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores
                    generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats.

                    The length of the list corresponds to the number of recognized tokens.
                preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores
                    generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats.

                    The length of the list corresponds to the number of recognized words.
                exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded
                    from the `token_confidence`.
                aggregation: Which aggregation type to use for collapsing per-token confidence into per-word
                    confidence. Valid options are `mean`, `min`, `max`, `prod`.
                tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and
                    attached to the regular frame confidence,
                    making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`).
                method_cfg: A dict-like object which contains the method name and settings to compute per-frame
                    confidence scores.

                    name: The method name (str).
                        Supported values:
                            - 'max_prob' for using the maximum token probability as a confidence.
                            - 'entropy' for using a normalized entropy of a log-likelihood vector.

                    entropy_type: Which type of entropy to use (str).
                        Used if confidence_method_cfg.name is set to `entropy`.
                        Supported values:
                            - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided,
                                the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)).
                                Note that for this entropy, the alpha should comply the following inequality:
                                (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1)
                                where V is the model vocabulary size.
                            - 'tsallis' for the Tsallis entropy with the Boltzmann constant one.
                                Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)),
                                where α is a parameter. When α == 1, it works like the Gibbs entropy.
                                More: https://en.wikipedia.org/wiki/Tsallis_entropy
                            - 'renyi' for the Rényi entropy.
                                Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)),
                                where α is a parameter. When α == 1, it works like the Gibbs entropy.
                                More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy

                    alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0.
                        When the alpha equals one, scaling is not applied to 'max_prob',
                        and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i))

                    entropy_norm: A mapping of the entropy value to the interval [0,1].
                        Supported values:
                            - 'lin' for using the linear mapping.
                            - 'exp' for using exponential mapping with linear shift.

            The config may further contain the following sub-dictionaries:
            "greedy":
                max_symbols: int, describing the maximum number of target tokens to decode per
                    timestep during greedy decoding. Setting to larger values allows longer sentences
                    to be decoded, at the cost of increased execution time.
                preserve_frame_confidence: Same as above, overrides above value.
                confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg.

            "beam":
                beam_size: int, defining the beam size for beam search. Must be >= 1.
                    If beam_size == 1, will perform cached greedy search. This might be slightly different
                    results compared to the greedy search above.

                score_norm: optional bool, whether to normalize the returned beam score in the hypotheses.
                    Set to True by default.

                return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the
                    hypotheses after beam search has concluded. This flag is set by default.

                tsd_max_sym_exp: optional int, determines number of symmetric expansions of the target symbols
                    per timestep of the acoustic model. Larger values will allow longer sentences to be decoded,
                    at increased cost to execution time.

                alsd_max_target_len: optional int or float, determines the potential maximum target sequence length.
                    If an integer is provided, it can decode sequences of that particular maximum length.
                    If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len),
                    where seq_len is the length of the acoustic model output (T).

                    NOTE:
                        If a float is provided, it can be greater than 1!
                        By default, a float of 2.0 is used so that a target sequence can be at most twice
                        as long as the acoustic model output length T.

                maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient,
                    and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0.

                maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to keep
                    this as 1 in order to reduce expensive beam search cost later. int >= 0.

                maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size.
                    Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0,
                    and affects the speed of inference since large values will perform large beam search in the next
                    step.

                maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the
                    expansions. The default (2.3) is selected from the paper. It performs a comparison
                    (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and
                    max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of
                    additional tokens which can be potential candidates for expansion apart from the "most likely"
                    candidate. Lower values will reduce the number of expansions (by increasing pruning-by-value,
                    thereby improving speed but hurting accuracy). Higher values will increase the number of expansions
                    (by reducing pruning-by-value, thereby reducing speed but potentially improving accuracy). This is
                    a hyper parameter to be experimentally tuned on a validation set.

                softmax_temperature: Scales the logits of the joint prior to computing log_softmax.

        decoder: The Decoder/Prediction network module.
        joint: The Joint network module.
        blank_id: The id of the RNNT blank token.
        supported_punctuation: Set of punctuation marks in the vocabulary
    Nblank_idsupported_punctuationc                    s  t t|   t|rt|}|| _|| _|| _|j	| _	| j
dd | _| j
dd | _| j
dd| _|
dd| _| j
dd | _| j
dd | _| j
dd | _| j
d	d| _| j
d
d| _| j
dg d| _| j
dd | _| jd uo| jg k| _| jr|dkrtd| jd ur| jg krtd| jjdvrtd| jd ur| jg kr|dkrtd| jjdvrtdg d}| jj|vrtd| | jd u r| jjdv r| jj
dd| _n| jjdv r| jj
dd| _| jd u r| jjdv r| jj
dd| _n| jjdv r| jj
dd| _| jr5ddd | jD }td| d | _| jsI| jdu rI| jdu rItd | | j
d!d  | jrw| j du rg| jdu rgtd"| jpm| j| _| j!| _"| j#| _$| j s| jjdv r| jj
d#drt%d$| jj d%| jjd&kr| jd u s| jg kr| jst&j'||| j| jj
d'd p| jj
d(d | j| j | j(d)| _)n t&j*||| j| j| jj
d'd p| jj
d(d | j| j | j| j+| j(d*
| _)nt&j,||| j| j| jj
d'd p| jj
d(d | j| j | j(d+| _)n| jjd,kr| jd u s-| jg kr| jsot&j-||| j| jj
d'd pG| jj
d(d | j| j | j(| jj
d-d| jj
d.d| jj
d/d | jj
d0dd1| _)n_t&j.||| j| j| jj
d'd p| jj
d(d | j| j | j| j+| j(| jj
d.d| jj
d/d | jj
d0dd2| _)n"t&j/||| j| j| jj
d'd p| jj
d(d | j| j | j(d+| _)n| jjd3kr3| jd u s| jg kr1| jst0j1||| jjj2|j
d4dd5| jj
d6d| jj
d7d8| jd9| _)nt3j4||| j| jjj2|j
d4dd5| jj
d6d| jj
d7d8| jd:	| _)n| jjd;krft0j1||| jjj2|j
d4dd;| jj
d6d| jj
d<d=| jj
d7d8| jd>	| _)nh| jjd?krt0j1||| jjj2|j
d4dd?| jj
d6d| jj
d@dA| jj
d7d8| jdB	| _)n5| jjdCkr| jd u s| jg kr}| js.t0j1d^i dD|dE|dF| jjj2d4|j
d4ddGdCd6| jj
d6ddH| jj
dHdAdI| jj
dIdJdK| jj
dKdLdM| jj
dMdNd7| jj
d7d8d| jd/| jj
d/d d0| jj
d0dOdP| jj
dPddQ| jj
dQdO| _)nt3j4||| j| jjj2|j
d4ddC| jj
d6d| jj
dHdA| jj
dIdJ| jj
dKdL| jj
dMdN| jj
d7d8| j| jj
d/d | jj
d0dRdS| _)nO| jjdTkr0| jd u s| jg kr/| jst0j5||| j| jjj2dT| jj
d'd=| j| jj
d/d | jj
d0dO| jj
dUt6j7| jj
dVt8j9| jj
d6d| jj
dWd| jj
d4ddX| _)nt3j:||| j| j| jjj2dT| jj
d'd=| j| jj
d/d | jj
d0dO| jj
dUt6j7| jj
dVt8j9| jj
d6d| jj
dWd| jj
d4ddY| _)n| jjdZkr| jd u sC| jg kr| jst0j5d^i dD|dE|d[| jdF| jjj2dGdZdH| jj
dHdAdM| jj
dMdAdK| jj
dKdLd| jd/| jj
d/d d0| jj
d0dOdU| jj
dUt6j7dV| jj
dVt8j9d6| jj
d6ddW| jj
dWdd4| jj
d4d| _)ntd\| d]| jj | ;  d S )_Nbig_blank_durations	durationscompute_hypothesis_token_setFcompute_langspreserve_alignmentsfused_batch_sizecompute_timestampstdt_include_token_durationword_seperator segment_seperators).?!segment_gap_thresholdr   z8blank_id must equal len(non_blank_vocabs) for TDT modelsz7duration and big_blank_durations can't both be not None)greedygreedy_batchbeammaesmalsd_batchzXcurrently only greedy, greedy_batch, beam and maes inference is supported for TDT modelsz<blank_id must equal len(vocabs) for multi-blank RNN-T models)r,   r-   zTcurrently only greedy and greedy_batch inference is supported for multi-blank models)r,   r-   r.   tsdalsdr/   r0   
maes_batchz!Decoding strategy must be one of )r.   r1   r2   r/   |c                 S   s   g | ]}t |qS  )reescape.0pr5   r5   g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/rnnt_decoding.py
<listcomp>  s    z1AbstractRNNTDecoding.__init__.<locals>.<listcomp>z(\s)()TzUIf `compute_timesteps` flag is set, then `preserve_alignments` flag must also be set.confidence_cfgz]If `preserve_frame_confidence` flag is set, then `preserve_alignments` flag must also be set.preserve_frame_confidencez6Confidence calculation is not supported for strategy ``r,   max_symbolsmax_symbols_per_step)decoder_modeljoint_modelblank_indexrB   r!   r?   confidence_method_cfg)
rC   rD   rE   r   rB   r!   r?   include_durationinclude_duration_confidencerF   )rC   rD   rE   r   rB   r!   r?   rF   r-   loop_labelsuse_cuda_graph_decoderngram_lm_modelngram_lm_alpha)rC   rD   rE   rB   r!   r?   rF   rI   rJ   rK   rL   )rC   rD   rE   r   rB   r!   r?   rG   rH   rF   rJ   rK   rL   r.   return_best_hypothesisdefault
score_normsoftmax_temperature      ?)rC   rD   	beam_sizerM   search_typerO   rP   r!   )	rC   rD   r   rR   rM   rS   rO   rP   r!   r1   tsd_max_sym_exp
   )	rC   rD   rR   rM   rS   rO   tsd_max_sym_exp_per_steprP   r!   r2   alsd_max_target_len   )	rC   rD   rR   rM   rS   rO   rW   rP   r!   r/   rC   rD   rR   rS   maes_num_stepsmaes_prefix_alpha   maes_expansion_gammagffffff@maes_expansion_betag       @g        hat_subtract_ilmhat_ilm_weightg333333?)rC   rD   r   rR   rM   rS   rO   rY   rZ   r\   r]   rP   r!   rK   rL   r0   blank_lm_score_modepruning_modeallow_cuda_graphs)rC   rD   rE   rR   rS   rB   r!   rK   rL   r`   ra   rO   rb   rM   )rC   rD   rE   r   rR   rS   rB   r!   rK   rL   r`   ra   rO   rb   rM   r3   rE   z5Incorrect decoding strategy supplied. Must be one of z
but was provided r5   )<superr   __init__r   r   
structuredcfgr   r   num_extra_outputsgetr   r   r   r    r!   joint_fused_batch_sizer#   r$   r%   r'   r+   _is_tdt
ValueErrorstrategyr,   r.   joinr6   compilespace_before_punct_pattern_init_confidencer?   _compute_offsets_tdt_compute_offsets_refine_timestamps_tdt_refine_timestampsNotImplementedErrorr   GreedyRNNTInferrF   decodingGreedyTDTInfertdt_include_duration_confidenceGreedyMultiblankRNNTInferGreedyBatchedRNNTInferGreedyBatchedTDTInfer GreedyBatchedMultiblankRNNTInferr   BeamRNNTInferrR   r   BeamTDTInferBeamBatchedRNNTInferr   LM_WEIGHTED_FULLr   LATEBeamBatchedTDTInferupdate_joint_fused_batch_size)selfdecoding_cfgdecoderjointr   r   possible_strategiespunct_pattern	__class__r5   r;   rd      s  

   

	




	
zAbstractRNNTDecoding.__init__Fencoder_outputencoded_lengthsreturn_hypothesespartial_hypothesesreturnc                 C   s^  t   | j|||d}|d }W d   n1 sw   Y  |}t|d trrg }g }|D ]5}	|	j}
| |
}| jdu rX| j	dd}t
t|D ]}| || |||< qK||d  || q/|ri|S dd |D }|S | |}| jdu r| j	dd}t
t|D ]}| || |||< q|r| jr| js| jr| |}|S d	d |D S )
a  
        Decode an encoder output by autoregressive decoding of the Decoder+Joint networks.

        Args:
            encoder_output: torch.Tensor of shape [B, D, T].
            encoded_lengths: torch.Tensor containing lengths of the padded encoder outputs. Shape [B].
            return_hypotheses: bool. If set to True it will return list of Hypothesis or NBestHypotheses

        Returns:
            If `return_all_hypothesis` is set:
                A list[list[Hypothesis]].
                    Look at rnnt_utils.Hypothesis for more information.

            If `return_all_hypothesis` is not set:
                A list[Hypothesis].
                List of best hypotheses
                    Look at rnnt_utils.Hypothesis for more information.
        )r   r   r   r   NTrnnt_timestamp_typeallc                 S   s   g | ]	}d d |D qS )c                 S      g | ]}t |j|j|jqS r5   r   score
y_sequencetextr9   hr5   r5   r;   r<   o      zSAbstractRNNTDecoding.rnnt_decoder_predictions_tensor.<locals>.<listcomp>.<listcomp>r5   )r9   hhr5   r5   r;   r<   o      zHAbstractRNNTDecoding.rnnt_decoder_predictions_tensor.<locals>.<listcomp>c                 S   r   r5   r   r   r5   r5   r;   r<     r   )torchinference_moderw   
isinstancer   n_best_hypothesesdecode_hypothesisr#   rf   rh   rangelencompute_rnnt_timestampsappendr?   preserve_word_confidencepreserve_token_confidencecompute_confidence)r   r   r   r   r   hypotheses_listprediction_list
hypothesesall_hypotheses	nbest_hypn_hypsdecoded_hypstimestamp_typehyp_idxall_hypr5   r5   r;   rnnt_decoder_predictions_tensor7  sJ   






z4AbstractRNNTDecoding.rnnt_decoder_predictions_tensorr   c                    s  t t|D ]~}|| j}t|tkr| }jdur1jg kr1tj  fdd|D }njr>fdd|D }n	fdd|D }jdu rUjrU|ddf}n*jdu rot	
|| j}dgt| }|||f}n|}jr||| _||| _q|S )z
        Decode a list of hypotheses into a list of strings.

        Args:
            hypotheses_list: List of Hypothesis.

        Returns:
            A list of strings.
        Nc                    s   g | ]}|j   k r|qS r5   r   r8   rg   r   r5   r;   r<     r   z:AbstractRNNTDecoding.decode_hypothesis.<locals>.<listcomp>c                    s   g | ]	}| j k r|qS r5   r   r8   r   r5   r;   r<     r   c                       g | ]	}| j kr|qS r5   r   r8   r   r5   r;   r<     r   Tr[   )r   r   r   typelisttolistr   rj   r#   copydeepcopy
alignments+decode_tokens_to_str_with_strip_punctuationr   decode_ids_to_tokenstokensr   )r   r   ind
prediction
hypothesisr   token_repetitionsr5   r   r;   r     s*   




z&AbstractRNNTDecoding.decode_hypothesisc                    s|   j r jr fddndd }|D ]}g }g }t|jddd |jddd D ]\}}t|dkrrttt|D ]5\}}	|	d  j	krS j
sR||||  q;|s_||||  q;| ||| g|  g }q;q)d|d }}	|	d  j	kr j
s||||  q)|s||||  q)| ||| g|  g }q)|ddd }||_qnu j
r|D ]}|j|_qnh|D ]e}t|jtjr|j n|j}
d}g }t|
dkr(t|
|
dd t|jg D ]6\}}||kr| |j| | gdd	 |j|d | D   d}q||j| |  |d7 }q||_qƈ jr<|D ]	} ||_q2|S )
aI  
        Computes high-level (per-token and/or per-word) confidence scores for a list of hypotheses.
        Assumes that `frame_confidence` is present in the hypotheses.

        Args:
            hypotheses_list: List of Hypothesis.

        Returns:
            A list of hypotheses with high-level confidence scores.
        c                    s
     | S N)_aggregate_confidencexr   r5   r;   <lambda>  s   
 z9AbstractRNNTDecoding.compute_confidence.<locals>.<lambda>c                 S   s   | S r   r5   r   r5   r5   r;   r     s    Nr[   r   c                 S   s   g | ]}|d  qS )r   r5   )r9   fcr5   r5   r;   r<     s    z;AbstractRNNTDecoding.compute_confidence.<locals>.<listcomp>)rj   ry   zipframe_confidencer   r   reversedr   	enumerater   exclude_blank_from_confidencer   r   token_confidencenon_blank_frame_confidencer   	timestampr   Tensorr   r   _aggregate_token_confidenceword_confidence)r   r   maybe_pre_aggregatehypr   subsequent_blank_confidencer   faiatimestepoffsettster5   r   r;   r     s|   *$
&

z'AbstractRNNTDecoding.compute_confidencechar_offsetsencoded_char_offsetsword_delimiter_charc                 C      t  )zL
        Implemented by subclass in order to get the words offsets.
        ru   )r   r   r   r   r   r5   r5   r;   get_words_offsets     z&AbstractRNNTDecoding.get_words_offsetsr   c                 C   r   )z
        Implemented by subclass in order to decoder a token id list into a string.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A decoded string.
        r   r   r   r5   r5   r;   decode_tokens_to_str  r   z)AbstractRNNTDecoding.decode_tokens_to_strc                 C   r   &  
        Implemented by subclass in order to decode a token id list into a token list.
        A token list is the string representation of each token id.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A list of decoded tokens.
        r   r   r5   r5   r;   r        z)AbstractRNNTDecoding.decode_ids_to_tokensc                 C   r   )z
        Implemented by subclass in order to
        compute the most likely language ID (LID) string given the tokens.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A decoded LID string.
        r   r   r5   r5   r;   decode_tokens_to_lang-  r   z*AbstractRNNTDecoding.decode_tokens_to_langc                 C   r   )z
        Implemented by subclass in order to
        decode a token id list into language ID (LID) list.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A list of decoded LIDS.
        r   r   r5   r5   r;   decode_ids_to_langs;  r   z(AbstractRNNTDecoding.decode_ids_to_langsc                 C   s"   |  |}| jr| jd|}|S )zn
        Decodes a list of tokens to a string and removes a space before supported punctuation marks.
        z\2)r   r   ro   sub)r   r   r   r5   r5   r;   r   I  s   
z@AbstractRNNTDecoding.decode_tokens_to_str_with_strip_punctuationc                 C   s   | j du rdS t| jjdstd dS t| jjds#td dS | j dkr2| jj| j  dS td | jjd dS )	a   "
        Updates the fused batch size for the joint module if applicable.

        If `joint_fused_batch_size` is set, verifies that the joint module has
        the required `set_fused_batch_size` and `set_fuse_loss_wer` functions.
        If present, updates the batch size; otherwise, logs a warning.

        If `joint_fused_batch_size` is <= 0, disables fused batch processing.
        Nset_fused_batch_sizez{The joint module does not have `set_fused_batch_size(int)` as a setter function.
Ignoring update of joint fused batch size.set_fuse_loss_werzThe joint module does not have `set_fuse_loss_wer(bool, RNNTLoss, RNNTWER)` as a setter function.
Ignoring update of joint fused batch size.r   zTJoint fused batch size <= 0; Will temporarily disable fused batch step in the Joint.F)	ri   hasattrrw   r   r   warningr   infor   r   r5   r5   r;   r   R  s    



z2AbstractRNNTDecoding.update_joint_fused_batch_sizer   r   r   c              
      s"  |dv sJ |j \}}}d }} || j}||_ d}tt|D ]}	|t fdd||	 d D 7 }q#|t|j krRtd| d|j  d	| d
t|j  t|}
t|D ]#\}}g }|d D ]}| jkrw|	 
t|g qe||| d< q[ |
| j\}
}g }|D ]}|d }tdd |D }|	| qd}|dv r j||
 j jd}d}|dv rň j| j j jd}t|jdkr|j}ng }d|i|_|dur|dv r||jd< |dur|dv r||jd< |dur|dv r||jd<  |j |_  jr ||_|S )a  
        Computes character, word, and segment timestamps for an RNN-T hypothesis.

        This function generates timestamps for characters, words, and segments within
        a hypothesis sequence. The type of timestamps computed depends on `timestamp_type`,
        which can be 'char', 'word', 'segment', or 'all'.

        Args:
            hypothesis (Hypothesis): Hypothesis.
            timestamp_type (str): Type of timestamps to compute. Options are 'char', 'word', 'segment', or 'all'.
                                Defaults to 'all'.

        Returns:
            Hypothesis: The updated hypothesis with computed timestamps for characters, words, and/or segments.
        )charwordsegmentr   Nr   c                    r   r5   r   r9   cr   r5   r;   r<     r   z@AbstractRNNTDecoding.compute_rnnt_timestamps.<locals>.<listcomp>r   z`char_offsets`: z and `processed_tokens`: z9 have to be of the same length, but are: `len(offsets)`: z and `len(processed_tokens)`: c                 s   s    | ]}t |V  qd S r   )r   r   r5   r5   r;   	<genexpr>  s    z?AbstractRNNTDecoding.compute_rnnt_timestamps.<locals>.<genexpr>)r   r   r   )r   r   r   r   )r   r   )segment_delimiter_tokensr   r+   r   )r   r   )r   r   r   r   )r   rr   r   r   r   rk   r   r   r   r   r   intrt   r   maxr   r%   _get_segment_offsetsr'   r+   r   r   r   r   r   )r   r   r   decoded_predictionr   r   r   word_offsetsnum_flattened_tokenstr   r   offsetsdecoded_charsr   lensvr   max_lensegment_offsetstimestep_infor5   r   r;   r   u  s|   $





z,AbstractRNNTDecoding.compute_rnnt_timestampsr   
rnnt_tokenc                    s   d}| j dur%t| j dkr%| j d }t|tr|n| }td|d }t| }t	|g|dd f}dd | j
d D }tt|D ]}tt|| D ]}	|| |	 d || |	< qRqHdd t|||D }
tt fdd	|
}
|
S )
a  
        Utility method that calculates the indidual time indices where a token starts and ends.

        Args:
            hypothesis: A Hypothesis object that contains `text` field that holds the character / subword token
                emitted at every time step after rnnt collapse.
            token_repetitions: A list of ints representing the number of repetitions of each emitted token.
            rnnt_token: The integer of the rnnt blank token used during rnnt collapse.

        Returns:

        r   Nr[   r   c                 S   s   g | ]}|qS r5   r5   )r9   al_logits_labelsr5   r5   r;   r<         z9AbstractRNNTDecoding._compute_offsets.<locals>.<listcomp>c                 S   s   g | ]\}}}|||d qS )r   start_offset
end_offsetr5   )r9   r   ser5   r5   r;   r<     s    
c                    s   | d d  kS )Nr   r   r5   )r  r  r5   r;   r     r  z7AbstractRNNTDecoding._compute_offsets.<locals>.<lambda>)r   r   r   r   itemr   npasarraycumsumconcatenater   r   r   r   filter)r   r   r  start_indexfirst_timestepend_indicesstart_indicesalignment_labelsr  ur  r5   r  r;   rr     s"   

z%AbstractRNNTDecoding._compute_offsetsc                 G   s`   t | jtjr| j  | _t | jtjr| j  | _dd t| jd | j| jD }|S )ac  
        Utility method that calculates the indidual time indices where a token starts and ends.

        Args:
            hypothesis: A Hypothesis object that contains `text` field that holds the character / subword token
                emitted at a specific time step considering predicted durations of the previous tokens.

        Returns:

        c                 S   s$   g | ]\}}}|g||| d qS r  r5   )r9   r  r  dr5   r5   r;   r<   *  s    z=AbstractRNNTDecoding._compute_offsets_tdt.<locals>.<listcomp>r   )	r   r   r   r   token_durationcpur   r   r   )r   argsr  r5   r5   r;   rq     s   z)AbstractRNNTDecoding._compute_offsets_tdtc                 C   s   | |fS r   r5   )r   r   r   r5   r5   r;   rt   0  s   	z'AbstractRNNTDecoding._refine_timestampsc                 C   sv   |s| |fS t |D ],\}}|d d |v r6|dkr6||d  d  | | d< |d< |d  | | d< |d< q
| |fS )Nr   r   r[   r  r  )r   )r   r   r   r   r   r5   r5   r;   rs   ;  s    z+AbstractRNNTDecoding._refine_timestamps_tdtr  r   r+   c                 C   sT  |rt ||s|stjd| dtjd g }g }d}t| D ]j\}}|d }	|rZ|rZ|d | |d  d  }
|
|krY|d	|| | d | |d  d d
 |	g}|}q!n,|	r|	d |v sf|	|v r||	 |r|d	|| | d |d d
 g }|d }q!||	 q!|r| | d }|d	||| d d d
 |	  |S )a  
        Utility method which constructs segment time stamps out of word time stamps.

        Args:
            offsets: A list of dictionaries, each containing "word", "start_offset" and "end_offset".
            segments_delimiter_tokens: List containing tokens representing the seperator(s) between segments.
            supported_punctuation: Set containing punctuation marks in the vocabulary.
            segment_gap_threshold: Number of frames between 2 consecutive words necessary to form segments out of plain
            text.
        Returns:
            A list of dictionaries containing the segment offsets. Each item contains "segment", "start_offset" and
            "end_offset".
        z>Specified segment seperators are not in supported punctuation z. If the seperators are not punctuation marks, ignore this warning. Otherwise, specify 'segment_gap_threshold' parameter in decoding config to form segments.)moder   r   r  r[   r  r&   )r   r  r  r   )
setintersectionr   r   r   ONCEr   r   rm   clear)r  r   r   r+   r	  segment_wordsprevious_word_indexr   r   r   gap_between_wordsr  r5   r5   r;   r   Q  sf   




z)AbstractRNNTDecoding._get_segment_offsetsr   )FN)r   )NN)%__name__
__module____qualname____doc__r   r	   r
   rd   r   r   boolr   r   r   r   r   r   r   r   r   strfloatr   r   r   r   r   r   r   r   staticmethodrr   rq   rt   rs   r   __classcell__r5   r5   r   r;   r   #   s     .  l

"N0P	#u,(
r   c                       s   e Zd ZdZ fddZdedee fddZdee	 de
fd	d
Zdee	 dee
 fddZdee	 de
fddZdee	 dee
 fddZe		ddeee
ee
ef f  deee
ee
ef f  de
dee deee
ee
ef f  f
ddZ  ZS )RNNTDecodingu+  
    Used for performing RNN-T auto-regressive decoding of the Decoder+Joint network given the encoder state.

    Args:
        decoding_cfg: A dict-like object which contains the following key-value pairs.

            strategy:
                str value which represents the type of decoding that can occur.
                Possible values are :

                -   greedy, greedy_batch (for greedy decoding).

                -   beam, tsd, alsd (for beam search decoding).

            compute_hypothesis_token_set: A bool flag, which determines whether to compute a list of decoded
                tokens as well as the decoded string. Default is False in order to avoid double decoding
                unless required.

            preserve_alignments: Bool flag which preserves the history of logprobs generated during
                decoding (sample / batched). When set to true, the Hypothesis will contain
                the non-null value for `logprobs` in it. Here, `alignments` is a List of List of
                Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)).

                In order to obtain this hypothesis, please utilize `rnnt_decoder_predictions_tensor` function
                with the `return_hypotheses` flag set to True.

                The length of the list corresponds to the Acoustic Length (T).
                Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary.
                U is the number of target tokens for the current timestep Ti.

            confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence
                scores. In order to obtain hypotheses with confidence scores, please utilize
                `rnnt_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True.

                preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
                    generated during decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of floats.

                    The length of the list corresponds to the Acoustic Length (T).
                    Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores.
                    U is the number of target tokens for the current timestep Ti.
                preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores
                    generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats.

                    The length of the list corresponds to the number of recognized tokens.
                preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores
                    generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats.

                    The length of the list corresponds to the number of recognized words.
                exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded
                    from the `token_confidence`.
                aggregation: Which aggregation type to use for collapsing per-token confidence into per-word
                    confidence.
                    Valid options are `mean`, `min`, `max`, `prod`.
                tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and
                    attached to the regular frame confidence,
                    making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`).
                method_cfg: A dict-like object which contains the method name and settings to compute per-frame
                    confidence scores.

                    name:
                        The method name (str).
                        Supported values:

                            - 'max_prob' for using the maximum token probability as a confidence.

                            - 'entropy' for using a normalized entropy of a log-likelihood vector.

                    entropy_type:
                        Which type of entropy to use (str).
                        Used if confidence_method_cfg.name is set to `entropy`.
                        Supported values:

                            - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided,
                                the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)).
                                Note that for this entropy, the alpha should comply the following inequality:
                                (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1)
                                where V is the model vocabulary size.

                            - 'tsallis' for the Tsallis entropy with the Boltzmann constant one.
                                Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)),
                                where α is a parameter. When α == 1, it works like the Gibbs entropy.
                                More: https://en.wikipedia.org/wiki/Tsallis_entropy

                            - 'renyi' for the Rényi entropy.
                                Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)),
                                where α is a parameter. When α == 1, it works like the Gibbs entropy.
                                More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy

                    alpha:
                        Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0.
                        When the alpha equals one, scaling is not applied to 'max_prob',
                        and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i))

                    entropy_norm:
                        A mapping of the entropy value to the interval [0,1].
                        Supported values:

                            - 'lin' for using the linear mapping.

                            - 'exp' for using exponential mapping with linear shift.

            The config may further contain the following sub-dictionaries:

                "greedy":
                    max_symbols: int, describing the maximum number of target tokens to decode per
                        timestep during greedy decoding. Setting to larger values allows longer sentences
                        to be decoded, at the cost of increased execution time.

                    preserve_frame_confidence: Same as above, overrides above value.

                    confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg.

                "beam":
                    beam_size: int, defining the beam size for beam search. Must be >= 1.
                        If beam_size == 1, will perform cached greedy search. This might be slightly different
                        results compared to the greedy search above.

                    score_norm: optional bool, whether to normalize the returned beam score in the hypotheses.
                        Set to True by default.

                    return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the
                        hypotheses after beam search has concluded. This flag is set by default.

                    tsd_max_sym_exp: optional int, determines number of symmetric expansions of the target symbols
                        per timestep of the acoustic model. Larger values will allow longer sentences to be decoded,
                        at increased cost to execution time.

                    alsd_max_target_len: optional int or float, determines the potential maximum target sequence
                        length. If an integer is provided, it can decode sequences of that particular maximum length.
                        If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len),
                        where seq_len is the length of the acoustic model output (T).

                        NOTE:
                            If a float is provided, it can be greater than 1!
                            By default, a float of 2.0 is used so that a target sequence can be at most twice
                            as long as the acoustic model output length T.

                    maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient,
                        and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0.

                    maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to
                    keep this as 1 in order to reduce expensive beam search cost later. int >= 0.

                    maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size.
                        Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0,
                        and affects the speed of inference since large values will perform large beam search in the
                        next step.

                    maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the
                        expansions. The default (2.3) is selected from the paper. It performs a comparison
                        (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and
                        max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of
                        additional tokens which can be potential candidates for expansion apart from the "most likely"
                        candidate. Lower values will reduce the number of expansions (by increasing pruning-by-value,
                        thereby improving speed but hurting accuracy). Higher values will increase the number of
                        expansions (by reducing pruning-by-value, thereby reducing speed but potentially improving
                        accuracy). This is a hyper parameter to be experimentally tuned on a validation set.

                    softmax_temperature: Scales the logits of the joint prior to computing log_softmax.

        decoder: The Decoder/Prediction network module.
        joint: The Joint network module.
        vocabulary: The vocabulary (excluding the RNNT blank token) which will be used for decoding.
    c                    s   t  |j }dd  D }t|dr|jdkrt  }t fddtt  D | _tt| j	|||||d t
| jtjsGt
| jtjrO| jd d S d S )	Nc                 S   *   h | ]}|D ]}t |d r|qqS Punicodedatacategory
startswithr9   tokenr   r5   r5   r;   	<setcomp>Z      z(RNNTDecoding.__init__.<locals>.<setcomp>
model_typetdtc                    s   g | ]}| | fqS r5   r5   )r9   r   
vocabularyr5   r;   r<   a  s    z)RNNTDecoding.__init__.<locals>.<listcomp>r   r   r   r   r   r   )r   rg   r   rA  dictr   
labels_maprc   r5  rd   r   rw   r   r~   r   r   set_decoding_type)r   r   r   r   rD  r   r   r   rC  r;   rd   Q  s&    
zRNNTDecoding.__init__r   r   c                 C   s   |  |j|jS )z
        Implemented by subclass in order to aggregate token confidence to a word-level confidence.

        Args:
            hypothesis: Hypothesis

        Returns:
            A list of word-level confidence scores.
        )!_aggregate_token_confidence_charswordsr   r   r   r5   r5   r;   r   p  s   
z(RNNTDecoding._aggregate_token_confidencer   c                 C   s   d | |}|S )
        Implemented by subclass in order to decoder a token list into a string.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A decoded string.
         )rm   r   r   r   r   r5   r5   r;   r   |  s   
z!RNNTDecoding.decode_tokens_to_strc                    s    fdd|D }|S )r   c                    s&   g | ]}| j  j k r j| qS r5   )r   rg   rG  r   r   r5   r;   r<     s   & z5RNNTDecoding.decode_ids_to_tokens.<locals>.<listcomp>r5   r   r   
token_listr5   r   r;   r     s   z!RNNTDecoding.decode_ids_to_tokensc                 C      | j |}|S z
        Compute the most likely language ID (LID) string given the tokens.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A decoded LID string.
        	tokenizerids_to_langr   r   langr5   r5   r;   r        
z"RNNTDecoding.decode_tokens_to_langc                 C   rQ  z
        Decode a token id list into language ID (LID) list.

        Args:
            tokens: List of int representing the token ids.

        Returns:
            A list of decoded LIDS.
        rT  ids_to_text_and_langsr   r   	lang_listr5   r5   r;   r     rX  z RNNTDecoding.decode_ids_to_langsr&   Nr   r   r   r   c                 C   sH  g }d}d}d}d}| D ]}	|	d D ]}
|
|krdnd}|o#|
  |v }|rd|dkrd|r?|d dkr8|dd n|}||
7 }n$|d }|	d	 |d	< |d
 d dkr[|d
 dd |d
< |d
  |
7  < q||kru|dkru|	d	 }||
7 }n|dkr|r||||d d}n
|	d }|	d	 }|
}|}qq|dkr||||d |S )a  
        Utility method which constructs word time stamps out of character time stamps.

        References:
            This code is a port of the Hugging Face code for word time stamp construction.

        Args:
            char_offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset",
                        where "char" is decoded with the tokenizer.
            encoded_char_offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset",
                        where "char" is the original id/ids from the hypotheses (not decoded with the tokenizer).
                        As we are working with char-based models here, we are using the `char_offsets` to get the word offsets.
                        `encoded_char_offsets` is passed for keeping the consistency with `AbstractRNNTDecoding`'s abstract method.
            word_delimiter_char: Character token that represents the word delimiter. By default, " ".
            supported_punctuation: Set containing punctuation marks in the vocabulary.

        Returns:
            A list of dictionaries containing the word offsets. Each item contains "word", "start_offset" and
            "end_offset".
        	DELIMITERrM  r   r   WORDr   r&   Nr  r   r   r  r  r  )stripr   )r   r   r   r   r  
last_stater   r  r  r   r   statecurr_punctuationlast_built_wordr5   r5   r;   r     sB   $

*zRNNTDecoding.get_words_offsetsr&   N)r,  r-  r.  r/  rd   r   r   r2  r   r   r1  r   r   r   r   r3  r   r   r	   r
   r   r4  r5   r5   r   r;   r5    sB     )
r5  c                       sf  e Zd ZdZdef fddZedee defddZ	ed	ed
ede
eegef fddZdedee fddZdee defddZdee dee fddZdee defddZdee dee fddZdee deeeef  f fddZ		d#deeeeeef f  deeeeeef f  d
ed ee deeeeeef f  f
d!d"Z  ZS )$RNNTBPEDecodingu0  
    Used for performing RNN-T auto-regressive decoding of the Decoder+Joint network given the encoder state.

    Args:
        decoding_cfg: A dict-like object which contains the following key-value pairs.

            strategy:
                str value which represents the type of decoding that can occur.
                Possible values are :

                -   greedy, greedy_batch (for greedy decoding).

                -   beam, tsd, alsd (for beam search decoding).

            compute_hypothesis_token_set: A bool flag, which determines whether to compute a list of decoded
                tokens as well as the decoded string. Default is False in order to avoid double decoding
                unless required.

            preserve_alignments: Bool flag which preserves the history of logprobs generated during
                decoding (sample / batched). When set to true, the Hypothesis will contain
                the non-null value for `alignments` in it. Here, `alignments` is a List of List of
                Tuple(Tensor (of length V + 1), Tensor(scalar, label after argmax)).

                In order to obtain this hypothesis, please utilize `rnnt_decoder_predictions_tensor` function
                with the `return_hypotheses` flag set to True.

                The length of the list corresponds to the Acoustic Length (T).
                Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more targets from a vocabulary.
                U is the number of target tokens for the current timestep Ti.

            compute_timestamps: A bool flag, which determines whether to compute the character/subword, or
                word based timestamp mapping the output log-probabilities to discrete intervals of timestamps.
                The timestamps will be available in the returned Hypothesis.timestep as a dictionary.

            compute_langs: a bool flag, which allows to compute language id (LID) information per token,
                word, and the entire sample (most likely language id). The LIDS will be available
                in the returned Hypothesis object as a dictionary

            rnnt_timestamp_type: A str value, which represents the types of timestamps that should be calculated.
                Can take the following values - "char" for character/subword time stamps, "word" for word level
                time stamps and "all" (default), for both character level and word level time stamps.

            word_seperator: Str token representing the seperator between words.

            segment_seperators: List containing tokens representing the seperator(s) between segments.

            segment_gap_threshold: The threshold (in frames) that caps the gap between two words necessary for forming
                the segments.

            preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
                generated during decoding (sample / batched). When set to true, the Hypothesis will contain
                the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of ints.

            confidence_cfg: A dict-like object which contains the following key-value pairs related to confidence
                scores. In order to obtain hypotheses with confidence scores, please utilize
                `rnnt_decoder_predictions_tensor` function with the `preserve_frame_confidence` flag set to True.

                preserve_frame_confidence: Bool flag which preserves the history of per-frame confidence scores
                    generated during decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `frame_confidence` in it. Here, `alignments` is a List of List of floats.

                    The length of the list corresponds to the Acoustic Length (T).
                    Each value in the list (Ti) is a torch.Tensor (U), representing 1 or more confidence scores.
                    U is the number of target tokens for the current timestep Ti.
                preserve_token_confidence: Bool flag which preserves the history of per-token confidence scores
                    generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `token_confidence` in it. Here, `token_confidence` is a List of floats.

                    The length of the list corresponds to the number of recognized tokens.
                preserve_word_confidence: Bool flag which preserves the history of per-word confidence scores
                    generated during greedy decoding (sample / batched). When set to true, the Hypothesis will contain
                    the non-null value for `word_confidence` in it. Here, `word_confidence` is a List of floats.

                    The length of the list corresponds to the number of recognized words.
                exclude_blank: Bool flag indicating that blank token confidence scores are to be excluded
                    from the `token_confidence`.
                aggregation: Which aggregation type to use for collapsing per-token confidence into per-word
                    confidence. Valid options are `mean`, `min`, `max`, `prod`.
                tdt_include_duration: Bool flag indicating that the duration confidence scores are to be calculated and
                    attached to the regular frame confidence,
                    making TDT frame confidence element a pair: (`prediction_confidence`, `duration_confidence`).
                method_cfg: A dict-like object which contains the method name and settings to compute per-frame
                    confidence scores.

                    name:
                        The method name (str).
                        Supported values:

                            - 'max_prob' for using the maximum token probability as a confidence.

                            - 'entropy' for using a normalized entropy of a log-likelihood vector.

                    entropy_type: Which type of entropy to use (str).
                        Used if confidence_method_cfg.name is set to `entropy`.
                        Supported values:

                            - 'gibbs' for the (standard) Gibbs entropy. If the alpha (α) is provided,
                                the formula is the following: H_α = -sum_i((p^α_i)*log(p^α_i)).
                                Note that for this entropy, the alpha should comply the following inequality:
                                (log(V)+2-sqrt(log^2(V)+4))/(2*log(V)) <= α <= (1+log(V-1))/log(V-1)
                                where V is the model vocabulary size.

                            - 'tsallis' for the Tsallis entropy with the Boltzmann constant one.
                                Tsallis entropy formula is the following: H_α = 1/(α-1)*(1-sum_i(p^α_i)),
                                where α is a parameter. When α == 1, it works like the Gibbs entropy.
                                More: https://en.wikipedia.org/wiki/Tsallis_entropy

                            - 'renyi' for the Rényi entropy.
                                Rényi entropy formula is the following: H_α = 1/(1-α)*log_2(sum_i(p^α_i)),
                                where α is a parameter. When α == 1, it works like the Gibbs entropy.
                                More: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy

                    alpha: Power scale for logsoftmax (α for entropies). Here we restrict it to be > 0.
                        When the alpha equals one, scaling is not applied to 'max_prob',
                        and any entropy type behaves like the Shannon entropy: H = -sum_i(p_i*log(p_i))

                    entropy_norm: A mapping of the entropy value to the interval [0,1].
                        Supported values:

                            - 'lin' for using the linear mapping.

                            - 'exp' for using exponential mapping with linear shift.

            The config may further contain the following sub-dictionaries:

                "greedy":
                    max_symbols: int, describing the maximum number of target tokens to decode per
                        timestep during greedy decoding. Setting to larger values allows longer sentences
                        to be decoded, at the cost of increased execution time.

                    preserve_frame_confidence: Same as above, overrides above value.

                    confidence_method_cfg: Same as above, overrides confidence_cfg.method_cfg.

                "beam":
                    beam_size: int, defining the beam size for beam search. Must be >= 1.
                        If beam_size == 1, will perform cached greedy search. This might be slightly different
                        results compared to the greedy search above.

                    score_norm: optional bool, whether to normalize the returned beam score in the hypotheses.
                        Set to True by default.

                    return_best_hypothesis: optional bool, whether to return just the best hypothesis or all of the
                        hypotheses after beam search has concluded.

                    tsd_max_sym_exp: optional int, determines number of symmetric expansions of the target symbols
                        per timestep of the acoustic model. Larger values will allow longer sentences to be decoded,
                        at increased cost to execution time.

                    alsd_max_target_len: optional int or float, determines the potential maximum target sequence
                        length.If an integer is provided, it can decode sequences of that particular maximum length.
                        If a float is provided, it can decode sequences of int(alsd_max_target_len * seq_len),
                        where seq_len is the length of the acoustic model output (T).

                        NOTE:
                            If a float is provided, it can be greater than 1!
                            By default, a float of 2.0 is used so that a target sequence can be at most twice
                            as long as the acoustic model output length T.

                    maes_num_steps: Number of adaptive steps to take. From the paper, 2 steps is generally sufficient,
                        and can be reduced to 1 to improve decoding speed while sacrificing some accuracy. int > 0.

                    maes_prefix_alpha: Maximum prefix length in prefix search. Must be an integer, and is advised to
                        keep this as 1 in order to reduce expensive beam search cost later. int >= 0.

                    maes_expansion_beta: Maximum number of prefix expansions allowed, in addition to the beam size.
                        Effectively, the number of hypothesis = beam_size + maes_expansion_beta. Must be an int >= 0,
                        and affects the speed of inference since large values will perform large beam search in the
                        next step.

                    maes_expansion_gamma: Float pruning threshold used in the prune-by-value step when computing the
                        expansions. The default (2.3) is selected from the paper. It performs a comparison
                        (max_log_prob - gamma <= log_prob[v]) where v is all vocabulary indices in the Vocab set and
                        max_log_prob is the "most" likely token to be predicted. Gamma therefore provides a margin of
                        additional tokens which can be potential candidates for expansion apart from the "most likely"
                        candidate. Lower values will reduce the number of expansions (by increasing pruning-by-value,
                        thereby improving speed but hurting accuracy). Higher values will increase the number of
                        expansions (by reducing pruning-by-value, thereby reducing speed but potentially improving
                        accuracy). This is a hyper parameter to be experimentally tuned on a validation set.

                    softmax_temperature: Scales the logits of the joint prior to computing log_softmax.

        decoder: The Decoder/Prediction network module.
        joint: The Joint network module.
        tokenizer: The tokenizer which will be used for decoding.
    rT  c                    s   |j j}t|dr|j}ndd |jD }t|dr&|jdkr&|j j|j }|| _ | |j| _t	t
| j|||||d t| jtjsKt| jtjrS| jd d S d S )Nr   c                 S   r6  r7  r9  r=  r5   r5   r;   r?    r@  z+RNNTBPEDecoding.__init__.<locals>.<setcomp>rA  
multiblankrE  subword)rT  
vocab_sizer   r   vocabrA  rg   define_tokenizer_typetokenizer_typerc   rg  rd   r   rw   r   r~   r   r   rH  )r   r   r   r   rT  r   r   r   r5   r;   rd     s,   

zRNNTBPEDecoding.__init__rD  r   c                 C   s   t dd | D rdS dS )zD
        Define the tokenizer type based on the vocabulary.
        c                 s   s    | ]}| d V  qdS )##Nr<  )r9   r>  r5   r5   r;   r     s    z8RNNTBPEDecoding.define_tokenizer_type.<locals>.<genexpr>wpebpe)anyrC  r5   r5   r;   rl    s   z%RNNTBPEDecoding.define_tokenizer_typerm  r   c                    s,    dkr| dkrdd S dd S  fddS )zk
        Define the word start condition based on the tokenizer type and word delimiter character.
        r&   rp  c                 S   s   |o| d S )Nrn  ro  r>  
token_textr5   r5   r;   r     r  z=RNNTBPEDecoding.define_word_start_condition.<locals>.<lambda>c                 S   s   | |kS r   r5   rs  r5   r5   r;   r         c                    s   | kS r   r5   rs  r   r5   r;   r     ru  r5   )rm  r   r5   rv  r;   define_word_start_condition  s
   z+RNNTBPEDecoding.define_word_start_conditionr   c                 C   s   |  |j|j|jS )a"  
        Implemented by subclass in order to reduce token confidence to a word-level confidence.

        **Note**: Only supports Sentencepiece based tokenizers!

        Args:
            hypothesis: Hypothesis

        Returns:
            A list of word-level confidence scores.
        )2_aggregate_token_confidence_subwords_sentencepiecerJ  r   r   rK  r5   r5   r;   r     s   z+RNNTBPEDecoding._aggregate_token_confidencer   c                 C   rQ  )rL  )rT  ids_to_textrN  r5   r5   r;   r     rX  z$RNNTBPEDecoding.decode_tokens_to_strc                 C   rQ  r   )rT  ids_to_tokensrO  r5   r5   r;   r     s   z$RNNTBPEDecoding.decode_ids_to_tokensc                 C   rQ  rR  rS  rV  r5   r5   r;   r   &  rX  z%RNNTBPEDecoding.decode_tokens_to_langc                 C   rQ  rY  rZ  r\  r5   r5   r;   r   3  rX  z#RNNTBPEDecoding.decode_ids_to_langsr   c                    s   t  |} jrGt jtrBtt|D ]*}|| j}t	|t
kr&| } fdd|D } ||| _ ||| _q|S td |S )z
        Decode a list of hypotheses into a list of strings.
        Overrides the super() method optionally adding lang information

        Args:
            hypotheses_list: List of Hypothesis.

        Returns:
            A list of strings.
        c                    r   r5   r   r8   r   r5   r;   r<   W  r   z5RNNTBPEDecoding.decode_hypothesis.<locals>.<listcomp>zzIgnoring request for lang output in hypotheses since the model does not use an aggregate                         tokenizer)rc   r   r    r   rT  r   r   r   r   r   r   r   r   langsr   langs_charsr   r   )r   r   r   r   r   r   r   r;   r   @  s   
z!RNNTBPEDecoding.decode_hypothesisr&   Nr   r   r   c              	   C   s  |  }g }d}g }| | j|}t|D ]\}	}
|
d D ]}|| jkr%qt|}| |gd }| |g }|o>||v }|||rr|sr|rb| |}|rb|	||| d ||	d  d d |
  ||krq|	| |	}q|r|s|d }|
d |d< |d d d	kr|d d
d |d< |d  |7  < q|s|	}|	| qqt|dkr|r| |}|r|	||d d |d d d |S |d d |d d< |r| |}|r|	||| d |d d d |S )a  
        Utility method which constructs word time stamps out of sub-word time stamps.

        **Note**: Only supports Sentencepiece based tokenizers !

        Args:
            char_offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset",
                        where "char" is decoded with the tokenizer.
            encoded_char_offsets: A list of dictionaries, each containing "char", "start_offset" and "end_offset",
                        where "char" is the original id/ids from the hypotheses (not decoded with the tokenizer).
                        This is needed for subword tokenization models.
            word_delimiter_char: Character token that represents the word delimiter. By default, " ".
            supported_punctuation: Set containing punctuation marks in the vocabulary.

        Returns:
            A list of dictionaries containing the word offsets. Each item contains "word", "start_offset" and
            "end_offset".
        r   r   r  r[   r  r`  r   r   r&   N)r   rw  rm  r   r   r   r   r   ra  r   r(  r   )r   r   r   r   r   r  previous_token_indexbuilt_tokenscondition_for_word_startr   r   r   r>  rt  rd  
built_wordre  r5   r5   r;   r   c  sx   


	
8





z!RNNTBPEDecoding.get_words_offsetsrf  )r,  r-  r.  r/  r   rd   r3  r   r1  rl  r   r0  rw  r   r2  r   r   r   r   r   r   r   r   r   r   r	   r
   r   r4  r5   r5   r   r;   rg    s6     <$&'rg  c                   @   sL  e Zd ZU dZdZeed< dZeed< dZe	ed< dZ
ee	 ed	< dZee	 ed
< edd dZeed< dZee ed< dZee	 ed< dZe	ed< dZeed< edd dZeee  ed< dZee ed< dZeed< eejdZejed< edd dZejed< dZe ed< ee!dZ"eee  ed< ee!dZ#eee  ed< dS ) RNNTDecodingConfigz
    RNNT Decoding config
    rnntrA  r-   rl   Fr   Nr!   r$   c                   C   s   t  S r   )r   r5   r5   r5   r;   r     s    zRNNTDecodingConfig.<lambda>)default_factoryr>   r"   r#   r    r&   r%   c                   C   s   g dS )N)r(   r*   r)   r5   r5   r5   r5   r;   r     ru  r'   r+   r   r   r,   c                   C   s   t jddS )N   )rR   )r   BeamRNNTInferConfigr5   r5   r5   r;   r     s    r.   rQ   temperaturer   r   )$r,  r-  r.  r/  rA  r1  __annotations__rl   r   r0  r!   r	   r$   r   r>   r   r"   r   r#   r    r%   r'   r   r+   r   r   GreedyBatchedRNNTInferConfigr,   r.   r   r  r  r2  r   r   r   r5   r5   r5   r;   r    s0   
 r  c                   @   s   e Zd ZdZdS )RNNTBPEDecodingConfigz"
    RNNT BPE Decoding Config
    N)r,  r-  r.  r/  r5   r5   r5   r;   r    s    r  ).r   r6   r:  abcr   dataclassesr   r   r   typingr   r   r   r	   r
   r   numpyr  r   	omegaconfr   %nemo.collections.asr.parts.submodulesr   r   r   5nemo.collections.asr.parts.utils.asr_confidence_utilsr   r   <nemo.collections.asr.parts.utils.batched_beam_decoding_utilsr   r   +nemo.collections.asr.parts.utils.rnnt_utilsr   r   6nemo.collections.common.tokenizers.aggregate_tokenizerr   1nemo.collections.common.tokenizers.tokenizer_specr   
nemo.utilsr   r   r   r5  rg  r  r  r5   r5   r5   r;   <module>   sJ               f   X<