o
    }oig                     @   s  d dl Z d dlZd dlZd dlmZmZmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d	Zzd dlZe  d
ZW n eee fyd   dZY nw zd dl!Z!d
Z"W n eefyz   dZ"Y nw zd dl#Z#d
Z$W n eefy   dZ$Y nw dZ%dZ&dZ'dd Z(dd Z)dd Z*dd Z+dd Z,dd Z-eG dd dZ.G dd dZ/dnde0d e1d!d"fd#d$Z2	%	&	'dod(d"d)ee0 d*e3d+e3d,e0d!e4fd-d.Z5d'dd/e6d0fd1d2d3ee4e0f d4e0d5e1d6e4d7e0d!e/fd8d9Z7d:e/d!e/fd;d<Z8	
dnd:e/d e1d!d"fd=d>Z9	
dpd?e0d@ee0e4f dAe1d e1d!d"f
dBdCZ:dnd@ee0e4f dAe1d!d"fdDdEZ;dnd@ee0e4f dAe1d!d"fdFdGZ<d@ee0e4f d!d"fdHdIZ=	J			K	Ldqd1d2deee0f dMe0dNeeee0f  dOe1dPee3e3f dQe0d!eedR e4f fdSdTZ>G dUdV dVeZ?G dWdX dXeZ@G dYdZ dZeZAG d[d\ d\eAZBd!e1fd]d^ZCd_eee0f d`e3d!ee0ee4e4f f fdadbZD	cdrddede dfe3d!dgfdhdiZE	dsdjeee0f d3eee4e0f  dkeee4e0f  d!ee0eBf fdldmZFdS )t    N)ABCabstractmethodabstractproperty)defaultdict
namedtuple)	dataclass)Enum)Path)AnyDictList
NamedTupleOptionalTupleUnion)loggingu   ‡TFzkaldifst is not installed or is installed incorrectly.
please run `pip install kaldifst` or `bash scripts/installers/install_riva_decoder.sh` to install.z_graphviz is not installed.
please run `bash scripts/installers/install_graphviz.sh` to install.z{kaldilm is not installed.
please run `pip install kaldilm` or `bash scripts/installers/install_riva_decoder.sh` to install.c                   C      t du rttd S NF)_KALDIFST_AVAILABLEImportErrorKALDIFST_INSTALLATION_MESSAGE r   r   _/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/wfst_utils.py_kaldifst_maybe_raiseK      r   c                   C   
   t   tS zUImport helper function that returns kaldifst package or raises ImportError exception.)r   kaldifstr   r   r   r   kaldifst_importerP      r   c                   C   r   r   )_GRAPHVIZ_AVAILABLEr   GRAPHVIZ_INSTALLATION_MESSAGEr   r   r   r   _graphviz_maybe_raiseV   r   r"   c                   C   r   )zUImport helper function that returns graphviz package or raises ImportError exception.)r"   graphvizr   r   r   r   graphviz_importer[   r   r$   c                   C   r   r   )_KALDILM_AVAILABLEr   KALDILM_INSTALLATION_MESSAGEr   r   r   r   _kaldilm_maybe_raisea   r   r'   c                   C   r   r   )r'   kaldilmr   r   r   r   kaldilm_importerf   r   r)   c                   @   s&   e Zd ZU dZeed< dZeed< dS )LexiconUnitzhA dataclass encapsulating the name of the language unit (e.g. wordpiece) and its mark (e.g. word begin).name markN)__name__
__module____qualname____doc__str__annotations__r-   r   r   r   r   r*   l   s   
 r*   c                   @   s   e Zd Zedfdeeeee  f deeee	f eee
f f deeee	f eee
f f de	fddZdee	ee	 f fd	d
Zdd Zedee fddZdS )Lexicon^#\d+$wordid2tokenidid2wordid2tokendisambig_patternc                    s  t t| d t }|rdd | D n| _dd  j D  _t t| d t }|r<dd | D n| _dd  j D  _| _	t
t} j	 D ]\}}	| j| j   fdd|	D 7  < qV| _| _d	}
d}d
 _d _d _d _ j D ]7\}} j|jr|s|j js j|_||
kr|}
|d7 }|j jks|j jkr|dksJ d _q|
 _| _|r j D ]\}} j|jrވ j|_q|j jkr|j jk qdS dS )a  
        Lexicon class which contains word-to-token-sequence, word-to-id, and token-to-id mappings.

        Args:
          wordid2tokenid:
            Lexicon.
            Mapping from word_id to token1_id token2_id ... tokenN_id.

          id2word:
            Word index.
            Mapping from word_id to word_str.

          id2token:
            Token index.
            Mapping from token_id to token_str.

          disambig_pattern:
            Pattern for disambiguation symbols.
        r   c                 S      i | ]	\}}|t |qS r   r*   .0kvr   r   r   
<dictcomp>       z$Lexicon.__init__.<locals>.<dictcomp>c                 S      i | ]\}}|j |qS r   r+   r<   r   r   r   r@          c                 S   r:   r   r;   r<   r   r   r   r@      rA   c                 S   rB   r   rC   r<   r   r   r   r@      rD   c                    s   g | ]} fd d|D qS )c                    s   g | ]} j | jqS r   )r8   r+   r=   iselfr   r   
<listcomp>   rD   z/Lexicon.__init__.<locals>.<listcomp>.<listcomp>r   )r=   vprG   r   r   rI          z$Lexicon.__init__.<locals>.<listcomp>Fdisambigepsilon<eps>   TN)
isinstancelistvaluesr*   itemsr8   token2idr7   word2idr6   r   r+   word2tokensr9   has_epsilon_default_disambig_mark_default_epsilon_mark_default_epsilon_namematchr-   
startswithmax_disambig_idnum_disambigs)rH   r6   r7   r8   r9   is_id2token_stris_id2word_strrW   r>   r?   r^   r_   rF   sr   rG   r   __init__u   sP   (
zLexicon.__init__returnc                 c   s.    | j  D ]\}}|D ]}||fV  qqd S N)r6   rT   )rH   wordidtokenid_listtokenidsr   r   r   __iter__   s   zLexicon.__iter__c                 C   s
   t | jS re   )r2   rW   rG   r   r   r   __str__   s   
zLexicon.__str__c                 C   sJ   g }| j  D ]\}}|j| js| jr|dkr|| q|  |S )zXReturn a list of token IDs excluding those from
        disambiguation symbols.
        r   )r8   rT   r-   r]   rZ   rX   appendsort)rH   ansrF   rb   r   r   r   	token_ids   s   
zLexicon.token_idsN)r.   r/   r0   recompiler   intr   r   r2   r*   rc   r   ri   rj   propertyrn   r   r   r   r   r4   t   s    
Ar4   lm_pathattach_symbol_tablerd   kaldifst.StdVectorFstc              	   C   s   t   t  t Z}tj|d}tj|d}tj| |d|d t	j
|}|r]t	 }t|dd}|D ]}|  \}	}
|j|	t|
d q7W d   n1 sUw   Y  ||_W d   n1 sgw   Y  t	j|d	d
 |S )a
  
    Compiles an ARPA LM file into a grammar WFST (G.fst).

    Args:
      lm_path:
        Path to the ARPA LM file.

      attach_symbol_table:
        Whether to attach the words for indices of the returned WFST.

    Returns:
      Kaldi-type grammar WFST.
    z
output.fstz	words.txt#0)
input_arpa
output_fstdisambig_symbolwrite_symbol_tableutf-8encodingsymbolkeyNilabel	sort_type)r   r'   tempfileTemporaryDirectoryospathjoinr(   arpa2fstr   StdVectorFstreadSymbolTableopenstripsplit
add_symbolrq   output_symbolsarcsort)rs   rt   tempdirnamerx   	words_txtGosymflinewrF   r   r   r   r      s2   
r          @      @<unk>g_fsttokensword_weighttoken_unigram_weight	token_oovc              	   C   s   t   d}t| |jjd| jdfvsJ | j }d}| j|| | 	 }| j
|tj||||dd | j
|tj||d|dd |d }	|D ]%}
|
|krp| j
|tj|	|	||dd | j|
 t | |	 |	d7 }	qK|S )a  
    Adds special words representing individual tokens (tokenwords).
    In-place operation.

    Args:
      g_fst:
        Kaldi-type grammar WFST.
        Will be augmented with the tokenwords.

      tokens:
        Token vocabulary.

      word_weight:
        The weight of an Out Of Vocabulary (OOV) word emission.

      token_unigram_weight:
        The weight of a tokenword emission.

      token_oov:
        OOV token.

    Returns:
        The id of the tokenword disambiguation token.
    r   rv   #1r   olabelweight	nextstatestatearc        rP   )r   r   ArcIteratorvaluer   r   findavailable_keyr   	add_stateadd_arcStdArcTW_BREAK)r   r   r   r   r   unigram_statetokenword_disambig_idtokenword_disambigtokenword_statelabeltr   r   r   add_tokenwords_   sR   $
			r   rL   r5   	tokenizerTokenizerSpecr7   oovadd_epsilonfirst_tokenword_idr9   c                    s6  dd |  D }d}dd|dk}|r8g g }	}
|  D ]\}} |r(q||k r1|	|n|
| qn fdd| D g }	}
| j|	td	}t|t| |  fd
d  D }| | }t|d||< t	|
 }tdd||d < t|d||d < |rtd||d < |rtdd|d< dd t|  dd dD }|r|	|
7 }	|fdd|
D 7 }tt}t|	|D ]#\}}|dr|dkrq|dkrq|||  fdd|D  qt|||}d|j|j|  _|rd|j|j  _|
D ]}d|j|j|  _q|S )a(  
    Generate a Lexicon using a SentencePiece tokenizer.

    Args:
      tokenizer:
        NeMo SentencePiece tokenizer.

      id2word:
        Word index.
        Mapping from word_id to word_str.

      oov:
        Out Of Vocabulary word in lexicon.

    Returns:
      Lexicon object.
    c                 S   s   i | ]\}}||qS r   r   r<   r   r   r   r@   ]      z2generate_lexicon_sentencepiece.<locals>.<dictcomp>rv   r   u   ▁rL   c                    s   g | ]	}  |s|qS r   )r\   )r=   r?   )r9   r   r   rI   j  rA   z2generate_lexicon_sentencepiece.<locals>.<listcomp>)out_typec                    s.   i | ]\}}|  t ||rd ndqS )beginr,   )r*   r]   r<   )maybe_subtract_oneword_begin_markr   r   r@   s  s    unk<blk>blankrP   disambig_backoff   disambig_tokenword   rO   rN   r   c                 S   s   i | ]\}}||qS r   r   r<   r   r   r   r@     r   c                 S   s   | d S )Nr   r   )itemr   r   r   <lambda>  s    z0generate_lexicon_sentencepiece.<locals>.<lambda>)r   c                    s(   g | ]}| t     gqS r   )rstripr   )r=   tw)maybe_add_oner   vocabr   r   rI     s   ( <c                    s   g | ]}|  qS r   r   )r=   p)r   r   r   rI         	tokenword)rT   r\   rk   rS   encoderq   	get_vocabpiece_to_idr*   maxkeyssortedr   rR   zipr]   r4   r7   rV   r-   )r   r7   r   r   r   r9   rV   backoff_disambigtokenword_modewords
tokenwordsr>   r?   words_piece_idsr8   unk_idmax_token_idr6   word	piece_idslexiconr   r   )r9   r   r   r   r   r   r   generate_lexicon_sentencepieceD  s`   


"r   r   c                 C   s  d| j v }|r| j d d }| jd }n| jd }tt}| D ]\}}|t|  d7  < qtt}| D ]!\}}|r?||kr?q4| }|  |rUd|t|< |  |sIq4tt}	| j }
| j	}|d }|d }tt}| D ]i\}}t|}t
|dksJ || dkr|| dks|r||kr|	| | qp|| }|dkr|}|}nt|
| jdd }||kr|}t|
 d }td| d|
|< |||< |	| ||g  qpt|	| j|
S )am  
    Adds pseudo-token disambiguation symbols #1, #2 and so on
    at the ends of tokens to ensure that all pronunciations are different,
    and that none is a prefix of another.

    See also add_lex_disambig.pl from kaldi.

    Args:
      lexicon:
        Lexicon object.

    Returns:
      Return Lexicon augmented with subseqence disambiguation symbols.
    r   rP   rv   r   #disambig_subsequence)rV   rU   r   rq   tuplecopypoprR   r8   r_   lenrk   r+   lstripr   r   r*   r4   r7   )r   r   r   last_used_disambig_idcount_rn   issubseqword_idr6   r8   first_allowed_disambigfirst_allowed_disambig_idmax_disambiglast_used_disambig_id_of	token_keycur_disambig_idcur_disambigr   r   r   add_disambig_symbols  sV   

	
$r   c              
   C   s(  t   d}d}|| jv }|r| j| d }t }| }||_|j|dd |j|tj| j	| | j| d|dd |rU| }|j|tj| j	| | j| d|dd | D ]W\}	}
|}|re|	|d k rt
|
dd	 D ]\}}| }|j|tj||dkr|	ndd|dd |}qmt|
d }|j|tj|dkr|
d	 nd|dkr|	ndd|dd qW|rVg g }}t|t| jd D ]>}	| j	| j|	 jt |  }| j| }|jd
r|||	f q|jdkr|||	f qtd|j d|j d| }|D ]\}}	|j|tj||	d|dd q| }|D ]!\}}	|j|tj||	d|dd |j|tj||	d|dd q |j|tj| j	| | j| d|dd |rt }| j	 D ]\}}|j||d qb||_t }| j D ]\}}|j||d q{||_tj|dd |S )z
    Compiles a Lexicon into a lexicon WFST (L.fst).

    See also make_lexicon_fst.py from kaldi.

    Args:
      lexicon:
        Lexicon object.

    Returns:
      Kaldi-type lexicon WFST.
    rv   r   rP   r   r   r   r   r   NrL   r   r,   zUnexpected mark `z` for tokenword ``r~   r   r   )r   rV   r   r   r   start	set_finalr   r   rU   	enumerater   ranger   r7   r+   r   r   r8   r-   r]   rk   RuntimeErrorr   rT   r   input_symbolsr   r   )r   rt   r   r   r   r   fststart_statetokenword_state_beginr   rn   	cur_staterF   token_id
next_statetokenword_begintokenword_other
token_unittokenword_state_maintokenword_state_endisymr   r   r   r   r   r   make_lexicon_fst_no_silence  s   
	
		
 


		
	
r  r+   rU   with_self_loopsc                 C   s   t   | dkrt||}n| dkrt||}n| dkr t|}ntd|  |rFt }| D ]\}}|j||d q1||_	|j	
 |_|S )a  Helper function to build a topology WFST (T.fst).

    Args:
      name:
        Topology name. Choices: default, compact, minimal

      token2id:
        Token index.
        Mapping from token_str to token_id.

      with_self_loops:
        Whether to add token-to-epsilon self-loops to the topology.

      attach_symbol_table:
        Whether to attach the token names for indices of the returned WFST.

    Returns:
      Kaldi-type topology WFST.
    defaultcompactminimalzUnknown topo name: r~   )r   build_default_topobuild_compact_topobuild_minimal_topo
ValueErrorr   r   rT   r   r  r   r   )r+   rU   r  rt   r  r  r   rF   r   r   r   
build_topoy  s   
r  c              
   C   s  t   td}| d }t }| }||_|j|dd |j|tj	|dd|dd g }i }| 
 D ]S\}}	|dks@|dkrAq4||rL||	 q4| }
|j|
dd |	||
< |j|tj	|	|	d|
dd |ry|j|
tj	|	dd|
dd |j|
tj	|dd|dd q4t|D ];}|dkrt|D ]}|dkr||kr|| }|j|tj	||d|dd q|D ]}|j|tj	d|d|dd qq|S )z)Build the default (correct) CTC topology.r5   r   r   r   r   r   rO   )r   ro   rp   r   r   r   r   r   r   r   rT   r\   rk   StateIterator)rU   r  r9   blank_idr  r  disambig_idsrn   rb   rF   r   istateostater   disambig_idr   r   r   r    s   


		
	r  c           	   	   C   s  t   td}| d }t }| }||_|j|dd |j|tj	|dd|dd | 
 D ]Q\}}|dks<|dkr=q0||rQ|j|tj	d|d|dd q0| }|j|tj	||d|dd |rs|j|tj	|dd|dd |j|tj	ddd|dd q0|S )zBuild the Compact CTC topology.r5   r   r   r   r   r   rO   r   ro   rp   r   r   r   r   r   r   r   rT   r\   )	rU   r  r9   r  r  r  rb   rF   r   r   r   r   r    sv   



		
r  c              	   C   s   t   td}| d }t }| }||_|j|dd |j|tj	|dd|dd | 
 D ]/\}}|dks<|dkr=q0||rQ|j|tj	d|d|dd q0|j|tj	||d|dd q0|S )zBuild the Minimal CTC topology.r5   r   r   r   r   r   rO   r  )rU   r9   r  r  r  rb   rF   r   r   r   r  >  sN   




r  r  r   r   kalditopology_namewrite_tlg_pathopen_vocabularyopen_vocabulary_weightstarget)ru   zk2.Fsac                    s  t   td t|}|r t|| j  |d |d d}nd}td dd t|j	
 d	D }	t| j|	d
|d}
t|
}t|}tj|dd td t||}t| t| tj|dd td t||j}tj|dd t||}|dkr|rtd| d || ||fS |dkrtd ddl}ddlm} dd |j D d }dd |j D d  |j|j  j }|j| jdksJ d	  fdd|j D }t|j	}d|_!d|_	|"d~ |j#j$|j%d
ddd}||j&|j& k< d|j'|j'j(|k< d|j)d < |*|+|,|}|j&|j&dk d |j&|j&dk< d|j)d < |j-.||_/|j-.||_0|*|}|rltd!| d |1|2 | W d   ||fS W d   ||fS 1 szw   Y  ||fS t3d"| d#)$a  
    Builds a decoding WFST (TLG.fst or TLG.pt).

    See also mkgraph.sh from kaldi.

    Args:
      tokenizer:
        NeMo SentencePiece tokenizer.

      lm_path:
        Path to the ARPA LM file.

      topology_name:
        Topology name. Choices: default, compact, minimal.

      write_tlg_path:
        Where to buffer the TLG.

      open_vocabulary:
        Whether to build a decoding WFST suitable for the open vocabulary decoding.

      open_vocabulary_weights:
        Pair of weights (oov_word_weight, token_unigram_weight).

      target:
        What type to build the WFST for. Choices: kaldi, k2.

    Returns:
      A pair of kaldi- or k2-type decoding WFST and its id of the tokenword disambiguation token.
    zCompiling G.fst ...r   rP   )r   r   r   r   rL   zBuilding L.fst ...c                 S   *   i | ]}t |d d |d d qS 	rP   r   rq   r   r=   r   r   r   r   r@     s   * z"mkgraph_ctc_ov.<locals>.<dictcomp>
T)r   r   r   r   zBuilding LG.fst ...r   zBuilding TLG.fst ...r!  zBuffering TLG.fst into z ...k2zConverting TLG.fst to k2 ...N)r-  c                 S      g | ]\}}|j d kr|qS )r   r-   r=   rF   r   r   r   r   rI     rK   z"mkgraph_ctc_ov.<locals>.<listcomp>c                 S   r.  )r   r/  r0  r   r   r   rI     rK   r   c                    s:   g | ]\}}d |  k r k rn n	| d|d  qS )r    rP   r   r<   first_token_disambig_idr   r   rI     s   : Fshow_weight_one)acceptor_propertieszBuffering TLG.pt into zUnsupported target: `r   )4r   r   infor   r   r   r   r   r2   r   r   r   r   r   r  r   r   composedeterminize_starminimize_encodedr  rU   writetorchnemo.core.utils.k2_guardr-  r8   rT   rV   r+   r7   r-   r   r  inference_modeFsafrom_openfstto_strlabels
aux_labelsrS   __dict__arc_sortconnectremove_epsilonr   from_str
labels_symaux_labels_symsaveas_dictr  )r   rs   r"  r#  r$  r%  r&  r   r   r7   r   lexicon_disambigLLGTTLGr=  r-  r  word_disambig_idr  r   r   r2  r   mkgraph_ctc_ovm  s   '







#



 



rT  c                   @   s@   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdS )KaldiFstMaski      l        l         i   i   @i   i   i  @ i   l        l        l        N)r.   r/   r0   AcceptorError	TopSortedAcyclicIlabelSortedOlabelSortedIlabelDeterministicOlabelDeterministicHasEpsilonsHasIEpsilons
AccessibleCoaccessibleWeightedr   r   r   r   rU    s    rU  c                   @   sf   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< eed< dS )LatticePropertiesrW  ValidNonemptyrY  rZ  	ArcSortedDeterministicEpsilonFreeInputEpsilonFree	Connectedrc  N)r.   r/   r0   boolr3   r   r   r   r   rd    s   
 rd  c                   @   s   e Zd ZdZdefddZedddZe	
ddee	e
ef  dee	e
ef  dede	d fddZedee defddZedd ZedefddZedeeeef  fddZedeee  fddZd	S )AbstractLatticez/A lattice wrapper with high-level capabilities.latticec                 C   s   || _ d | _d S re   )_latticer7  )rH   rn  r   r   r   rc     s   
zAbstractLattice.__init__rd   torch.Tensorc                 C      dS )URepresents the lattice as a tensor.

        Returns:
          torch.Tensor
        Nr   rG   r   r   r   	as_tensor  s   zAbstractLattice.as_tensorN      ?filenametitlezoomzgraphviz.DigraphzIPython.display.HTMLc                 C   rq  )  Render FSA as an image via graphviz, and return the Digraph object; and optionally save to file filename.
        filename must have a suffix that graphviz understands, such as pdf, svg or png.

        Note:
          You need to install graphviz to use this function::

            ./scripts/installers/install_graphviz.sh

        Args:
          filename:
            Filename to (optionally) save to, e.g. ‘foo.png’, ‘foo.svg’, ‘foo.png’.

          title:
            Title to be displayed in image, e.g. ‘A simple lattice example’.

          zoom:
            Zoom-in lattice in IPython notebook (needed for large lattices).

        Returns:
          graphviz.Digraph or IPython.display.HTML
        Nr   )rH   ru  rv  rw  r   r   r   draw  s   zAbstractLattice.drawreference_sequencec                 C   rq  )Get the edit distance from a reference sequence to the lattice.

        Args:
          reference_sequence:
            List of word- or token-ids.

        Returns:
          Number of edits.
        Nr   )rH   r{  r   r   r   edit_distance*  s    zAbstractLattice.edit_distancec                 C   s   d | _ | jS re   )r7  ro  rG   r   r   r   rn  6  s   zAbstractLattice.latticec                 C      d S re   r   rG   r   r   r   
properties;     zAbstractLattice.propertiesc                 C   r~  re   r   rG   r   r   r   symbol_table?  r  zAbstractLattice.symbol_tablec                 C   r~  re   r   rG   r   r   r   auxiliary_tablesC  r  z AbstractLattice.auxiliary_tablesrd   rp  NNrt  )r.   r/   r0   r1   r
   rc   r   rs  r   r   r	   r2   floatrz  r   rq   r}  rr   rn  r   rd  r  r   r  r   r  r   r   r   r   rm    s4    
rm  c                
       s   e Zd ZdZ		ddddeeeef  deeeef  f fddZ	e
d	efd
dZe
d	eeeef  fddZe
d	eee  fddZdddZdee d	efddZ	ddeeeef  deeeef  ded	ed fddZ  ZS ) KaldiWordLatticez5A Kaldi lattice wrapper with high-level capabilities.Nrn  kaldifst.Latticer  r  c                    s  t   t|tjstdt| dt | dd }d | _|d ur(|| _n| j	j
d ur5|| j	j
| _d | _|d urkt| t| }}d|vra| j	jd ura|d ||| j	j td|| | _d S | j	jd urtdd|| j	j| _d S d S )NzWrong lattice type: `r   c                 S   s   dd t |  dD S )Nc                 S   r'  r(  r*  r+  r   r   r   r@   W  s    z?KaldiWordLattice.__init__.<locals>.<lambda>.<locals>.<dictcomp>r,  )r2   r   r   )symbolsr   r   r   r   W  s    z+KaldiWordLattice.__init__.<locals>.<lambda>r  KaldiAuxiliaryTables)r   rQ   r   Latticer  typesuperrc   _symbol_tablero  r   _auxiliary_tablesrR   r   rS   r  rk   r   )rH   rn  r  r  kaldi_symbols2dict
attributesrS   	__class__r   r   rc   K  s,   


zKaldiWordLattice.__init__rd   c                 C   s  | j d u r| jtjjdtjjk}| jtjjdtjjk}| jjdk}| jtjjdtjjk}| jtj	jdtj	jk}| jtj
jdtj
jkoX| jtjjdtjjk}| jtjjdtjjkor| jtjjdtjjk}| jtjjdtjjk}| jtjjdtjjk}	| jtjjdtjjko| jtjjdtjjk}
| jtjjdtjjk}t|||||||||	|
|d| _ | j S )NTr   )rW  re  rf  rY  rZ  rg  rh  ri  rj  rk  rc  )r7  ro  r  rU  rW  r   rX  
num_statesrY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  )rH   r6  validnonempty
top_sortedacyclic
arc_sorteddeterministicepsilon_freeinput_epsilon_free	connectedweightedr   r   r   r  o  sL   
zKaldiWordLattice.propertiesc                 C      | j S re   )r  rG   r   r   r   r       zKaldiWordLattice.symbol_tablec                 C   r  re   )r  rG   r   r   r   r    r  z!KaldiWordLattice.auxiliary_tablesrp  c                 C   s   t d)rr  z+Tensor representation is not supported yet.)NotImplementedErrorrG   r   r   r   rs    s   zKaldiWordLattice.as_tensorr{  c                 C   s   t   | jjstd t|stdtt	|}t| j
}t| t||}tt|\}}}}|s=tdt|jS )r|  zPLattice contains input epsilons. Edit distance calculations may not be accurate.z8reference_sequence contains zeros, which is not allowed.zRSomething went wrong while calculating edit_distance. Please check input manually.)r   r  rj  r   warningallr  levenshtein_graph_kaldir   make_linear_acceptorro  invertr9  get_linear_symbol_sequenceshortest_pathr  roundr   )rH   r{  refhypali_fst	succeededr   total_weightr   r   r   r}    s   




zKaldiWordLattice.edit_distancert  ru  rv  rw  rx  c                 C   sR  t   t  d\}}| jr"t }| j D ]\}}|j||d q| jrEt| jdrE| jj	rEt }| jj	 D ]\}}|j||d q9tj
| jdd||dd}	t|	}
t|
 }|d |d d	d
dddddd}|durx||d< tj|d}| j|7  _|rtj|\}}|dks|d dkrtd| dt }|jd||dd dd}t|| W d   n1 sw   Y  t r'ddl}ddlm} t }|jd|ddd}t ||\}\}}W d   n1 sw   Y  |!  |"d |d| dt#|| d   d!W  d   S 1 s"w   Y  |S )"ry  NNr~   r  FT)r6  portraitisymbolsosymbolsr5  r   rL   LRz8.5,111Portraitz0.4z0.25z0.0)rankdirsizecenterorientationranksepnodesepmarginNr   )
graph_attrr,   .z@Filename needs to have a suffix like .png, .pdf, .svg, or .gv: `r   temprP   )ru  	directoryformatcleanup)HTMLsvgignorez<iframe srcdoc='z' width="100%" height="r   z-px" frameborder="0" allowfullscreen></iframe>)$r   r"   r  r   r   rT   r   r  hasattrr  rz  ro  r#   Sourcer2   
splitlinesr   Digraphbodyr   r   splitextr  r   r   rendershutilmove_is_notebookwarningsIPython.displayr  _svg_srcdoc_resizecatch_warningssimplefilterr  )rH   ru  rv  rw  r  r   rF   r   r   fst_dotsourcesource_linesr  digraphr   	extensiontmp_dirtemp_fnr  r  r  widthheightr   r   r   rz    sr   



	



"zKaldiWordLattice.drawr  r  r  )r.   r/   r0   r1   r   r   rq   r2   r
   rc   rr   rd  r  r  r   r  rs  r   r}  r   r	   r  rz  __classcell__r   r   r  r   r  H  s:    $+
r  c                  C   s.   zt  jj} | dks
W dS  ty   Y dS w )NZMQInteractiveShellShellTTerminalInteractiveShellF)get_ipythonr  r.   	NameError)shellr   r   r   r    s   
r  ru  rw  c                 C   s   t | dddD}| }|ds| }|drtd|\}}t|t|}}dt||  dt||  d|  ||ffW  d    S 1 sNw   Y  d S )	Nrtr{   r|   z<svgz\d+z<svg width="zpt" height="zpt"
)r   readliner]   ro   findallrq   r  r   )ru  rw  r   r   r  r  r   r   r   r  +  s   

0$r  x&1?r  )kaldifst.StdFstr  ins_del_scorer  c              
   C   s&  t   | tjjdtjjkrtd | tjjdtjjkr$tdt	| t
jr1| jdd}nt	| t
jrFt
tdd| jdd}n
tdt|  d	d
}d}t
|D ]2}t
j||||dg}t
||D ]}|t
j||j||jd d|_qk|D ]	}|j||d qqYt
| |S )a  Construct the levenshtein graph from a kaldi-type WFST or a lattice.

    See also levenshtein_graph from k2.

    Args:
      fst:
        Kaldi-type source WFST or lattice.

      ins_del_score:
        Insertion and deletion penalty.
        Should be more than 0.5 for substitutions to be preferred over insertions/deletions, or less otherwise.

    Returns:
      Kaldi-type levenshtein WFST.
    Tz]Levenshtein graph construction is not safe for WFSTs with different input and output symbols.z7Levenshtein graph is not defined for WFSTs with cycles.)safez[-\d.]+,[-\d.]+0r4  z:Levenshtein graph building is not supported for the type `z`.g      ?r   r   r   r   )r   r  rU  rW  r   r   r  rZ  r  rQ   r   StdFstr   r  rp   ro   subrB  r  r  r   r   rk   r   r   r   r   r   )r  r  lfst	sub_scoreepsr   arcs_to_addr   r   r   r   r  5  sJ   	
r  lat_filenamer8   c              	   C   sn  t   i }d}d}g }t| d}| D ]}|  }	t|	}
|
dkr.g }d}d}q|
dkrf|du s8J |dks>J t|dksFJ |	d }t }t||||dd||< |	 }||_
|d7 }q|
dv r|
dkrd	d
 |	dd D \}}}|	d d}dd
 |dd D \}}ntd| d dd
 |	D \}}}dg}d\}}||krt||d D ]}|	  q|d }tj||tj||d|d}|j||d |||d dkrdd
 |d tD ng f q|
dkrt|	d }|	d d}dd
 |dd D \}}|j|tj||dd qtd| dW d   |S 1 s0w   Y  |S )a  Helper function to load riva-decoder recognition lattices.

    Args:
      lat_filename:
        Path to the riva-decoder recognition lattice file.

      id2word:
        Word index.
        Mapping from word_id to word_str.

      id2token:
        Token index.
        Mapping from token_id to token_str.

    Returns:
      Dictionary with lattice names and corresponding lattices in KaldiWordLattice format.
    Nr   r  rP   )token_seq_listr  )rn  r  r  )r   rV  rV  c                 S      g | ]}t |qS r   rq   rE   r   r   r   rI     r   z%load_word_lattice.<locals>.<listcomp>rL   ,c                 S   r  r   r  rE   r   r   r   rI     r   z/An arc without weight is detected for lattice `zY`.
                                    Weights and token sequences will be set trivially.c                 S   r  r   r  rE   r   r   r   rI     r   r,   )r   r   )
graph_costacoustic_costr   r   c                 S   r  r   r  rE   r   r   r   rI     r   r   c                 S   r  r   r  rE   r   r   r   rI     r   r   zBroken line: `r   )r   r   	readlinesr   r   r   r   r  r  r   r   r   r  r   
LatticeArcLatticeWeightr   rk   r   rq   r   r  )r  r7   r8   lattice_dictrn  	max_stater  r   r   
line_itemsline_lenlat_idr   r   r  r   trunkr  r  rF   arkr   r   r   load_word_latticet  s~   



4

88r  )T)r   r   r   )TT)r  NFr   r!  )r  r  )Gr   ro   r   abcr   r   r   collectionsr   r   dataclassesr   enumr   pathlibr	   typingr
   r   r   r   r   r   r   
nemo.utilsr   r   r   r   r   r   ModuleNotFoundErrorAttributeErrorr#   r    r(   r%   r   r!   r&   r   r   r"   r$   r'   r)   r*   r4   r2   rl  r   r  rq   r   rp   r   r   r  r  r  r  r  rT  rU  rd  rm  r  r  r  r  r  r   r   r   r   <module>   s2  $W-
R

ZN
 

 * XC2


tI W.
@

