o
    irN                     @  s  d dl mZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlmZmZmZmZmZmZmZmZ m!Z"m#Z$ d dl%m&Z'm(Z)m*Z+ d dl,m-Z- zd d	l.m/Z0 W n e1yw   zd d	lm/Z0 W n e1yt   dZ0Y nw Y nw g d
Z2edg dZ3dd Z4dd Z5G dd deZ6G dd deZ7G dd deZ8G dd dZ9dddddddd e:dd dddd fdBd:d;Z;dCd>d?Z<dCd@dAZ=dS )D    )annotationsN)abstractmethod)
namedtuple)DictList
NamedTupleOptionalTupleUnion)
CriterionTypeLexiconDecoderLexiconDecoderOptionsLexiconFreeDecoderLexiconFreeDecoderOptionsLMLMStateSmearingModeTrieZeroLM)create_word_dict
Dictionary
load_words)download_asset)KenLM)CTCHypothesis
CTCDecoderCTCDecoderLMCTCDecoderLMStatectc_decoderdownload_pretrained_filesPretrainedFileslexicontokenslmc                   s      }t||}|d}| D ]&\}}	||}
|||
\}}|	D ]} fdd|D }|||
| q%q|tj	 |S )NFc                   s   g | ]}  |qS  )	get_index).0tokentokens_dictr%   Z/home/ubuntu/.local/lib/python3.10/site-packages/torchaudio/models/decoder/_ctc_decoder.py
<listcomp>;   s    z#_construct_trie.<locals>.<listcomp>)

index_size_Triestartitemsr&   scoreinsertsmear_SmearingModeMAX)r*   	word_dictr"   r$   silence
vocab_sizetriestart_stateword	spellingsword_idx_r1   spellingspelling_idxr%   r)   r+   _construct_trie2   s   


rA   c                   sv   d }|d ur
t |}| r|d u rt| }|S | s9|d u r9t|tkr9 fddt  D }|gg||< t|}|S )Nc                   s"   i | ]}  |  |ggqS r%   )	get_entry)r'   ir)   r%   r+   
<dictcomp>I   s   " z"_get_word_dict.<locals>.<dictcomp>)_Dictionary_create_word_dicttypestrranger-   )r"   r$   lm_dictr*   unk_wordr6   dr%   r)   r+   _get_word_dictA   s   rM   c                   @  s8   e Zd ZU dZded< 	 ded< 	 ded< 	 ded	< d
S )r   zORepresents hypothesis generated by CTC beam search decoder :class:`CTCDecoder`.torch.LongTensorr#   z	List[str]wordsfloatr1   torch.IntTensor	timestepsN)__name__
__module____qualname____doc____annotations__r%   r%   r%   r+   r   P   s   
 r   c                      s>   e Zd ZdZed fddZd fdd	ZdddZ  ZS )r   zLanguage model state.returnDict[int, CTCDecoderLMState]c                   s   t  jS )zMap of indices to LM states)superchildrenself	__class__r%   r+   r[   h   s   zCTCDecoderLMState.children	usr_indexintc                   s   t  |S )a!  Returns child corresponding to usr_index, or creates and returns a new state if input index
        is not found.

        Args:
            usr_index (int): index corresponding to child state

        Returns:
            CTCDecoderLMState: child state corresponding to usr_index
        )rZ   child)r]   r`   r^   r%   r+   rb   m   s   
zCTCDecoderLMState.childstatec                 C  s   dS )zCompare two language model states.

        Args:
            state (CTCDecoderLMState): LM state to compare against

        Returns:
            int: 0 if the states are the same, -1 if self is less, +1 if self is greater.
        Nr%   r]   rc   r%   r%   r+   comparey   s   	zCTCDecoderLMState.compare)rX   rY   )r`   ra   rX   r   )rc   r   rX   r   )	rS   rT   rU   rV   propertyr[   rb   re   __classcell__r%   r%   r^   r+   r   e   s    r   c                   @  s:   e Zd ZdZedddZedddZedddZdS )r   zVLanguage model base class for creating custom language models to use with the decoder.start_with_nothingboolrX   r   c                 C     t )zInitialize or reset the language model.

        Args:
            start_with_nothing (bool): whether or not to start sentence with sil token.

        Returns:
            CTCDecoderLMState: starting state
        NotImplementedError)r]   rh   r%   r%   r+   r/      s   
zCTCDecoderLM.startrc   usr_token_idxra   Tuple[CTCDecoderLMState, float]c                 C  rj   )ax  Evaluate the language model based on the current LM state and new word.

        Args:
            state (CTCDecoderLMState): current LM state
            usr_token_idx (int): index of the word

        Returns:
            (CTCDecoderLMState, float)
                CTCDecoderLMState:
                    new LM state
                float:
                    score
        rk   )r]   rc   rm   r%   r%   r+   r1      s   zCTCDecoderLM.scorec                 C  rj   )a8  Evaluate end for language model based on current LM state.

        Args:
            state (CTCDecoderLMState): current LM state

        Returns:
            (CTCDecoderLMState, float)
                CTCDecoderLMState:
                    new LM state
                float:
                    score
        rk   rd   r%   r%   r+   finish   s   zCTCDecoderLM.finishN)rh   ri   rX   r   )rc   r   rm   ra   rX   rn   )rc   r   rX   rn   )rS   rT   rU   rV   r   r/   r1   ro   r%   r%   r%   r+   r      s    r   c                   @  st   e Zd ZdZd2ddZd3ddZd4ddZdd Zdd Zd5d"d#Z	d6d%d&Z
d6d'd(Z	)d7d8d-d.Zd9d0d1Zd)S ):r   zCTC beam search decoder from *Flashlight* :cite:`kahn2022flashlight`.

    .. devices:: CPU

    Note:
        To build the decoder, please use the factory function :func:`ctc_decoder`.
    nbestra   r"   Optional[Dict]r6   rE   r*   r$   r   decoder_options9Union[_LexiconDecoderOptions, _LexiconFreeDecoderOptions]blank_tokenrH   	sil_tokenrK   rX   Nonec
              	   C  s   || _ || _|| _| j|| _| j|}
g }|r7t|||||
}||	}	d}t||||
| j|	||| _n
t|||
| j|| _|| _	dS )a  
        Args:
            nbest (int): number of best decodings to return
            lexicon (Dict or None): lexicon mapping of words to spellings, or None for lexicon-free decoder
            word_dict (_Dictionary): dictionary of words
            tokens_dict (_Dictionary): dictionary of tokens
            lm (CTCDecoderLM): language model. If using a lexicon, only word level LMs are currently supported
            decoder_options (_LexiconDecoderOptions or _LexiconFreeDecoderOptions):
                parameters used for beam search decoding
            blank_token (str): token corresopnding to blank
            sil_token (str): token corresponding to silence
            unk_word (str): word corresponding to unknown
        FN)
rp   r6   r*   r&   blankrA   _LexiconDecoderdecoder_LexiconFreeDecoderr$   )r]   rp   r"   r6   r*   r$   rr   rt   ru   rK   r7   transitionsr9   token_lmr%   r%   r+   __init__   s,   

zCTCDecoder.__init__idxsrQ   rN   c                   s4   dd t |D }t fdd|}tt|S )Nc                 s  s    | ]}|d  V  qdS )r   Nr%   )r'   gr%   r%   r+   	<genexpr>   s    z)CTCDecoder._get_tokens.<locals>.<genexpr>c                   s
   |  j kS N)rw   )xr\   r%   r+   <lambda>   s   
 z(CTCDecoder._get_tokens.<locals>.<lambda>)itgroupbyfiltertorch
LongTensorlistr]   r~   r%   r\   r+   _get_tokens   s   zCTCDecoder._get_tokensc                 C  sN   g }t |D ]\}}|| jkrq|dks|||d  kr!|| qt|S )z8Returns frame numbers corresponding to non-blank tokens.r      )	enumeraterw   appendr   	IntTensor)r]   r~   rR   rC   idxr%   r%   r+   _get_timesteps   s   


zCTCDecoder._get_timestepsc                 C     | j   dS )a  Initialize the internal state of the decoder.

        See :py:meth:`decode_step` for the usage.

        .. note::

           This method is required only when performing online decoding.
           It is not necessary when performing batch decoding with :py:meth:`__call__`.
        N)ry   decode_beginr\   r%   r%   r+   r        
zCTCDecoder.decode_beginc                 C  r   )a  Finalize the internal state of the decoder.

        See :py:meth:`decode_step` for the usage.

        .. note::

           This method is required only when performing online decoding.
           It is not necessary when performing batch decoding with :py:meth:`__call__`.
        N)ry   
decode_endr\   r%   r%   r+   r     r   zCTCDecoder.decode_end	emissionstorch.FloatTensorc                 C  sp   |j tjkr
td|jstd| std|jdkr&td|j |	 \}}| j
| || dS )a  Perform incremental decoding on top of the curent internal state.

        .. note::

           This method is required only when performing online decoding.
           It is not necessary when performing batch decoding with :py:meth:`__call__`.

        Args:
            emissions (torch.FloatTensor): CPU tensor of shape `(frame, num_tokens)` storing sequences of
                probability distribution over labels; output of acoustic model.

        Example:
            >>> decoder = torchaudio.models.decoder.ctc_decoder(...)
            >>> decoder.decode_begin()
            >>> decoder.decode_step(emission1)
            >>> decoder.decode_step(emission2)
            >>> decoder.decode_end()
            >>> result = decoder.get_final_hypothesis()
        emissions must be float32.emissions must be a CPU tensor.emissions must be contiguous.   zemissions must be 2D. Found N)dtyper   float32
ValueErroris_cpuRuntimeErroris_contiguousndimshapesizery   decode_stepdata_ptr)r]   r   TNr%   r%   r+   r      s   
zCTCDecoder.decode_stepList[CTCHypothesis]c                       fdd|D S )Nc              	     s>   g | ]}t  |j fd d|jD |j |jdqS )c                   s    g | ]}|d kr j |qS )r   )r6   rB   )r'   r   r\   r%   r+   r,   G  s     z2CTCDecoder._to_hypo.<locals>.<listcomp>.<listcomp>)r#   rO   r1   rR   )r   r   r#   rO   r1   r   )r'   resultr\   r%   r+   r,   D  s    

z'CTCDecoder._to_hypo.<locals>.<listcomp>r%   r]   resultsr%   r\   r+   _to_hypoC  s   
zCTCDecoder._to_hypoc                 C  s   | j  }| |d| j S )a9  Get the final hypothesis

        Returns:
            List[CTCHypothesis]:
                List of sorted best hypotheses.

        .. note::

           This method is required only when performing online decoding.
           It is not necessary when performing batch decoding with :py:meth:`__call__`.
        N)ry   get_all_final_hypothesisr   rp   r   r%   r%   r+   get_final_hypothesisN  s   
zCTCDecoder.get_final_hypothesisNlengthsOptional[torch.Tensor]List[List[CTCHypothesis]]c                 C  s   |j tjkr
td|jstd| std|jdkr&td|j |dur1|js1td|	 \}}}|du rCt
|f|}d}g }t|D ]&}| || |d	  }	| j|	|| |}
|| |
d| j  qK|S )
a  
        Performs batched offline decoding.

        .. note::

           This method performs offline decoding in one go. To perform incremental decoding,
           please refer to :py:meth:`decode_step`.

        Args:
            emissions (torch.FloatTensor): CPU tensor of shape `(batch, frame, num_tokens)` storing sequences of
                probability distribution over labels; output of acoustic model.
            lengths (Tensor or None, optional): CPU tensor of shape `(batch, )` storing the valid length of
                in time axis of the output Tensor in each batch.

        Returns:
            List[List[CTCHypothesis]]:
                List of sorted best hypotheses for each audio sequence in the batch.
        r   r   r      zemissions must be 3D. Found Nzlengths must be a CPU tensor.   r   )r   r   r   r   r   r   r   r   r   r   fullrI   r   stridery   decoder   r   rp   )r]   r   r   Br   r   float_byteshyposbemissions_ptrr   r%   r%   r+   __call__]  s(   
zCTCDecoder.__call__r   c                   r   )z
        Map raw token IDs into corresponding tokens

        Args:
            idxs (LongTensor): raw token IDs generated from decoder

        Returns:
            List: tokens corresponding to the input IDs
        c                   s   g | ]
} j | qS r%   )r*   rB   item)r'   r   r\   r%   r+   r,     s    z-CTCDecoder.idxs_to_tokens.<locals>.<listcomp>r%   r   r%   r\   r+   idxs_to_tokens  s   
zCTCDecoder.idxs_to_tokens)rp   ra   r"   rq   r6   rE   r*   rE   r$   r   rr   rs   rt   rH   ru   rH   rK   rH   rX   rv   )r~   rQ   rX   rN   )r~   rQ   rX   rQ   )r   r   )rX   r   r   )r   r   r   r   rX   r   )r~   rN   rX   r   )rS   rT   rU   rV   r}   r   r   r   r   r   r   r   r   r   r%   r%   r%   r+   r      s    

9


#
2r   r   2   r   z-infF-|z<unk>r"   Optional[str]r#   Union[str, List[str]]r$   Union[str, CTCDecoderLM]rJ   rp   ra   	beam_sizebeam_size_tokenOptional[int]beam_thresholdrP   	lm_weight
word_score	unk_score	sil_scorelog_addri   rt   rH   ru   rK   rX   c                 C  s   |durt |turtdt|}| r+t| } t||p| |||	|
||tjd	}nt	||p2| ||||tjd}t
| ||||}t |tkrWtdu rQtdt||}n|du r^t }t|| |||||||d	S )aY	  Builds an instance of :class:`CTCDecoder`.

    Args:
        lexicon (str or None): lexicon file containing the possible words and corresponding spellings.
            Each line consists of a word and its space separated spelling. If `None`, uses lexicon-free
            decoding.
        tokens (str or List[str]): file or list containing valid tokens. If using a file, the expected
            format is for tokens mapping to the same index to be on the same line
        lm (str, CTCDecoderLM, or None, optional): either a path containing KenLM language model,
            custom language model of type `CTCDecoderLM`, or `None` if not using a language model
        lm_dict (str or None, optional): file consisting of the dictionary used for the LM, with a word
            per line sorted by LM index. If decoding with a lexicon, entries in lm_dict must also occur
            in the lexicon file. If `None`, dictionary for LM is constructed using the lexicon file.
            (Default: None)
        nbest (int, optional): number of best decodings to return (Default: 1)
        beam_size (int, optional): max number of hypos to hold after each decode step (Default: 50)
        beam_size_token (int, optional): max number of tokens to consider at each decode step.
            If `None`, it is set to the total number of tokens (Default: None)
        beam_threshold (float, optional): threshold for pruning hypothesis (Default: 50)
        lm_weight (float, optional): weight of language model (Default: 2)
        word_score (float, optional): word insertion score (Default: 0)
        unk_score (float, optional): unknown word insertion score (Default: -inf)
        sil_score (float, optional): silence insertion score (Default: 0)
        log_add (bool, optional): whether or not to use logadd when merging hypotheses (Default: False)
        blank_token (str, optional): token corresponding to blank (Default: "-")
        sil_token (str, optional): token corresponding to silence (Default: "|")
        unk_word (str, optional): word corresponding to unknown (Default: "<unk>")

    Returns:
        CTCDecoder: decoder

    Example
        >>> decoder = ctc_decoder(
        >>>     lexicon="lexicon.txt",
        >>>     tokens="tokens.txt",
        >>>     lm="kenlm.bin",
        >>> )
        >>> results = decoder(emissions) # List of shape (B, nbest) of Hypotheses
    Nz!lm_dict must be None or str type.)	r   r   r   r   r   r   r   r   criterion_type)r   r   r   r   r   r   r   zflashlight-text is installed, but KenLM is not installed. Please refer to https://github.com/kpu/kenlm#python-module for how to install it.)	rp   r"   r6   r*   r$   rr   rt   ru   rK   )rG   rH   r   rE   _load_words_LexiconDecoderOptionsr-   _CriterionTypeCTC_LexiconFreeDecoderOptionsrM   _KenLMr   _ZeroLMr   )r"   r#   r$   rJ   rp   r   r   r   r   r   r   r   r   rt   ru   rK   r*   rr   r6   r%   r%   r+   r     sZ   9

r   model_PretrainedFilesc                 C  sP   | dvrt |  dd|  }t| d| d| dkr$| ddS d dS )	N)librispeechzlibrispeech-3-gramzlibrispeech-4-gramzZ not supported. Must be one of ['librispeech-3-gram', 'librispeech-4-gram', 'librispeech']zdecoder-assets/z/lexicon.txtz/tokens.txtr   z/lm.binr!   )r   r   )r   prefixr%   r%   r+   _get_filenames  s   
r   c                 C  sD   t | }t|j}t|j}|jdurt|j}nd}t|||dS )aM  
    Retrieves pretrained data files used for :func:`ctc_decoder`.

    Args:
        model (str): pretrained language model to download.
            Valid values are: ``"librispeech-3-gram"``, ``"librispeech-4-gram"`` and ``"librispeech"``.

    Returns:
        Object with the following attributes

            * ``lm``: path corresponding to downloaded language model,
              or ``None`` if the model is not associated with an lm
            * ``lexicon``: path corresponding to downloaded lexicon file
            * ``tokens``: path corresponding to downloaded tokens file
    Nr!   )r   r   r"   r#   r$   r   )r   fileslexicon_filetokens_filelm_filer%   r%   r+   r     s   


r   )"r"   r   r#   r   r$   r   rJ   r   rp   ra   r   ra   r   r   r   rP   r   rP   r   rP   r   rP   r   rP   r   ri   rt   rH   ru   rH   rK   rH   rX   r   )r   rH   rX   r   )>
__future__r   	itertoolsr   abcr   collectionsr   typingr   r   r   r   r	   r
   r   flashlight.lib.text.decoderr   r   r   rx   r   r   r   rz   r   r   r   _LMr   _LMStater   r4   r   r.   r   r   flashlight.lib.text.dictionaryr   rF   r   rE   r   r   torchaudio.utilsr   !flashlight.lib.text.decoder.kenlmr   r   	Exception__all__r   rA   rM   r   r   r   r   rP   r   r   r   r%   r%   r%   r+   <module>   sZ     0	 1 j
q