o
    ei@                     @   sb  d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZ eeZ	d*dedejfddZe
 							d+de
jde
jdejdededededededejfdd Ze
 	!d,d"ejd#edejfd$d%Z e
 		!d-d"ejd&ejd'eee  d#ede	ejee!ejf f f
d(d)Z"dS ).a  Different decoding graph algorithms for k2, be it HL or HLG (with G LM
and bigger rescoring LM).

This code was adjusted from icefall (https://github.com/k2-fsa/icefall/blob/master/icefall/decode.py).


Authors:
  * Pierre Champion 2023
  * Zeyu Zhao 2023
  * Georgios Karakasidis 2023
    )OrderedDict)Path)DictListOptionalUnionN)arpa_to_fst)run_on_main)
get_logger   )k2)graph_compilerutilscpuhparamsgraphCompilerc                    s   d} ddk}dv rd du rdnd|s|rltd }|d d	d
 }|r:|d d	d
 nd|rTtttd d |d  |ddd |rltttd d |d  ddd d}dv rt|}|rtj|d}j||d}	nj|d}	 ddkrd t	d t
sd gd< dtjdtttjf f fdd}
n& ddv rtd dtjdtttjf fdd}
n
dtjffdd}
|	||
dS ) a1  
    This function reads a config and creates the decoder for k2 graph compiler
    decoding.
    There are the following cases:
        - HLG is compiled and LM rescoring is used. In that case,
          compose_HL_with_G and use_G_rescoring are both True and we will
          create for example G_3_gram.fst.txt and G_4_gram.fst.txt. Note that
          the 3gram and 4gram ARPA lms will need to exist under
          `hparams['lm_dir']`.
        - HLG is compiled but LM rescoring is not used. In that case,
          compose_HL_with_G is True and use_G_rescoring is False and we will
          create for example G_3_gram.fst.txt. Note that the 3gram ARPA lm will
          need to exist under `hparams['lm_dir']`.
        - HLG is not compiled (only use HL graph) and LM rescoring used.
          In that case, compose_HL_with_G is False and use_G_rescoring is True.
          Note that the 4gram ARPA lms will need to exist under
          `hparams['lm_dir']`.
        - HLG is not compiled (only use HL graph) and LM rescoring is not used.
          In that case, compose_HL_with_G is False and use_G_rescoring is False
          and we will not convert LM to FST.

    Arguments
    ---------
    hparams: dict
        The hyperparameters.
    graphCompiler: graph_compiler.GraphCompiler
        The graphCompiler (H)
    device : torch.device
        The device to use.

    Returns
    -------
    Dict:
        decoding_graph: k2.Fsa
            A HL or HLG decoding graph.
            Used with a nnet output and the function `get_lattice` to
            obtain a decoding lattice `k2.Fsa`.
        decoding_method: Callable[[k2.Fsa], k2.Fsa]
            A function to call with a decoding lattice `k2.Fsa` (obtained
            after nnet output intersect with a HL or HLG).
            Returns an FsaVec containing linear FSAs

    Example
    -------
    >>> import torch
    >>> from speechbrain.k2_integration.losses import ctc_k2
    >>> from speechbrain.k2_integration.utils import lattice_paths_to_text
    >>> from speechbrain.k2_integration.graph_compiler import CtcGraphCompiler
    >>> from speechbrain.k2_integration.lexicon import Lexicon
    >>> from speechbrain.k2_integration.prepare_lang import prepare_lang
    >>> from speechbrain.k2_integration.lattice_decoder import get_decoding
    >>> from speechbrain.k2_integration.lattice_decoder import get_lattice

    >>> batch_size = 1

    >>> log_probs = torch.randn(batch_size, 40, 10)
    >>> log_probs.requires_grad = True
    >>> # Assume all utterances have the same length so no padding was needed.
    >>> input_lens = torch.ones(batch_size)
    >>> # Create a small lexicon containing only two words and write it to a file.
    >>> lang_tmpdir = getfixture('tmpdir')
    >>> lexicon_sample = "hello h e l l o\nworld w o r l d\n<UNK> <unk>"
    >>> lexicon_file = lang_tmpdir.join("lexicon.txt")
    >>> lexicon_file.write(lexicon_sample)
    >>> # Create a lang directory with the lexicon and L.pt, L_inv.pt, L_disambig.pt
    >>> prepare_lang(lang_tmpdir)
    >>> # Create a lexicon object
    >>> lexicon = Lexicon(lang_tmpdir)
    >>> # Create a random decoding graph
    >>> graph = CtcGraphCompiler(
    ...     lexicon,
    ...     log_probs.device,
    ... )

    >>> decode = get_decoding(
    ...     {"compose_HL_with_G": False,
    ...      "decoding_method": "onebest",
    ...      "lang_dir": lang_tmpdir},
    ...     graph)
    >>> lattice = get_lattice(log_probs, input_lens, decode["decoding_graph"])
    >>> path = decode["decoding_method"](lattice)['1best']
    >>> text = lattice_paths_to_text(path, lexicon.word_table)
    compose_HL_with_Gdecoding_methodzwhole-lattice-rescoringcachingFTlm_dirG_arpaarpazfst.txtG_rescoring_arpaNlang_dirz	words.txt   )	words_txtin_arpaout_fstngram_ordercache)kwargs   output_folderr   )	cache_dirr   rescoring_lm_scalelatticereturnc                    sX    du r#t d t d  tjd}j| t| t|  d dS )z:Get the best path from a lattice given rescoring_lm_scale.Nz(Decoding method: whole-lattice-rescoringzLoading rescoring LM: r#   r%   )lm_scale_list)loggerinfor   load_Glexicon#remove_G_rescoring_disambig_symbolsprepare_rescoring_Grescore_with_whole_lattice)r&   G_rescoring_ptG_rescoringG_rescoring_pathr   r   r    h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/k2_integration/lattice_decoder.pyr      s   

z%get_decoding.<locals>.decoding_method)1bestonebestz"Decoding method: one-best-decodingc                 S   s   t dt| iS )z!Get the best path from a lattice.r6   )r   one_best_decodingr&   r4   r4   r5   r      s   c                    s   t  d d)z-A dummy decoding method that raises an error.r   z% not implemented as a decoding_method)NotImplementedErrorgetr9   )r   r4   r5   r      s   )decoding_graphr   )r;   r   replacer	   r   r   r+   compile_HLG
compile_HL
isinstancelistr   Fsar   strr)   r*   to)r   r   devicer   use_G_rescoringr   G_pathr"   Gr<   r   r4   r1   r5   get_decoding   sn   
W


,
 rI      ,          ?log_probs_nnet_output
input_lensdecodersearch_beamoutput_beammin_active_statesmax_active_statesac_scalesubsampling_factorr'   c	              
   C   sj   | j }	||	}|j |	krtd ||	}|| jd    }| |9 } tj| |||||||d}
|
S )a  
    Get the decoding lattice from a decoding graph and neural network output.

    Arguments
    ---------
    log_probs_nnet_output: torch.Tensor
        It is the output of a neural model of shape `(batch, seq_len, num_tokens)`.
    input_lens: torch.Tensor
        It is an int tensor of shape (batch,). It contains lengths of
        each sequence in `log_probs_nnet_output`.
    decoder: k2.Fsa
        It is an instance of :class:`k2.Fsa` that represents the decoding graph.
    search_beam: int
        Decoding beam, e.g. 20.  Ger is faster, larger is more exact
        (less pruning). This is the default value; it may be modified by
        `min_active_states` and `max_active_states`.
    output_beam: int
         Beam to prune output, similar to lattice-beam in Kaldi.  Relative
         to best path of output.
    min_active_states: int
        Minimum number of FSA states that are allowed to be active on any given
        frame for any given intersection/composition task. This is advisory,
        in that it will try not to have fewer than this number active.
        Set it to zero if there is no constraint.
    max_active_states: int
        Maximum number of FSA states that are allowed to be active on any given
        frame for any given intersection/composition task. This is advisory,
        in that it will try not to exceed that but may not always succeed.
        You can use a very large number if no constraint is needed.
    ac_scale: float
        acoustic scale applied to `log_probs_nnet_output`
    subsampling_factor: int
        The subsampling factor of the model.

    Returns
    -------
    lattice: k2.Fsa
        An FsaVec containing the decoding result. It has axes [utt][state][arc].
    zmDecoding graph (HL or HLG) not loaded on the same device  as nnet, this will cause decoding speed degradationr   )rQ   rR   rS   rT   rV   )	rE   rD   r)   warnshaperoundintr   get_lattice)rN   rO   rP   rQ   rR   rS   rT   rU   rV   rE   r&   r4   r4   r5   r[      s(   4


r[   Tr&   use_double_scoresc                 C   s   t j| |d}|S )a  
    Get the best path from a lattice.

    Arguments
    ---------
    lattice: k2.Fsa
        The decoding lattice returned by :func:`get_lattice`.
    use_double_scores: bool
        True to use double precision floating point in the computation.
        False to use single precision.

    Returns
    -------
    best_path: k2.Fsa
        An FsaVec containing linear paths.
    r\   )r   shortest_path)r&   r\   	best_pathr4   r4   r5   r8   )  s   r8   G_with_epsilon_loopsr(   c              
   C   s  |j dksJ || j}| j}t| dr| j| j | _| `t|ds%J t| }| j d }tj	||tj
d}g d}|g d7 }d}	d}
|
|	krz |dkrXtj||d	d
}n	tj|||d	d}tt|}W nW ty } zCtd| d |
|	krtd W Y d}~dS td|j   td t|||
 d	}td|j   W Y d}~nd}~ww |
d7 }
|
|	ksJt|}|du r|S t }|j|j }|D ]}|| }||j |_tj||d}d|d}|||< q|S )a  
    Intersect the lattice with an n-gram LM and use shortest path to decode.
    The input lattice is obtained by intersecting `HLG` with
    a DenseFsaVec, where the `G` in `HLG` is in general a 3-gram LM.
    The input `G_with_epsilon_loops` is usually a 4-gram LM. You can consider
    this function as a second pass decoding. In the first pass decoding, we
    use a small G, while we use a larger G in the second pass decoding.

    Arguments
    ---------
    lattice: k2.Fsa
        An FsaVec with axes [utt][state][arc]. Its `aux_labels` are word IDs.
        It must have an attribute `lm_scores`.
    G_with_epsilon_loops: k2.Fsa
        An FsaVec containing only a single FSA. It contains epsilon self-loops.
        It is an acceptor and its labels are word IDs.
    lm_scale_list: Optional[List[float]]
        If none, return the intersection of `lattice` and `G_with_epsilon_loops`.
        If not None, it contains a list of values to scale LM scores.
        For each scale, there is a corresponding decoding result contained in
        the resulting dict.
    use_double_scores: bool
        True to use double precision in the computation.
        False to use single precision.

    Returns
    -------
    If `lm_scale_list` is None, return a new lattice which is the intersection
    result of `lattice` and `G_with_epsilon_loops`.
    Otherwise, return a dict whose key is an entry in `lm_scale_list` and the
    value is the decoding result (i.e., an FsaVec containing linear FSAs).
    )r   NN	lm_scoresr   )rE   dtype)g|=g&.>g:0yE>gHz>gư>)gh㈵>g-C6?gMbP?g{Gz?g?
   r   T)treat_epsilons_specially)sorted_match_azCaught exception:

z2Return None as the resulting lattice is too large.Nznum_arcs before pruning: zThis OOM is not an error. You can ignore it. If your model does not converge well, or the segment length is too large, or the input sound file is difficult to decode, you will meet this exception.znum_arcs after pruning: r   r]   whole_lattice_rescore_lm_scale_z.1f)rX   rD   rE   hasattrscoresra   r   inverttorchzerosint32	intersectintersect_devicetop_sortconnectRuntimeErrorr)   r*   arcsnum_elementsprune_on_arc_postr   r^   )r&   r`   r(   r\   rE   inv_latticenum_seqs
b_to_a_mapprune_th_listmax_loop_count
loop_countrescoring_latticeelatanssaved_am_scoreslm_scale	am_scoresr_   keyr4   r4   r5   r/   B  s   '



-
r/   )r   )rJ   rJ   rK   rL   rM   r   )T)NT)#__doc__collectionsr   pathlibr   typingr   r   r   r   rk   speechbrain.lm.arpar   speechbrain.utils.distributedr	   speechbrain.utils.loggerr
    r   r   r   __name__r)   GraphCompilerrI   no_gradTensorrB   rZ   floatr[   boolr8   rC   r/   r4   r4   r4   r5   <module>   s    
 >	
N
