o
    ei+                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ eeZ	dd Z
dd	 Zd
d Zdd Zdd Zdd Z		ddeeef deeef deeef dededefddZdS )au  
Tools for working with ARPA format N-gram models

Expects the ARPA format to have:
- a \data\ header
- counts of ngrams in the order that they are later listed
- line breaks between \data\ and \n-grams: sections
- \end\
E.G.
    ```
    \data\
    ngram 1=2
    ngram 2=1

    \1-grams:
    -1.0000 Hello -0.23
    -0.6990 world -0.2553

    \2-grams:
    -0.2553 Hello world

    \end\
    ```


Example
-------
>>> # This example loads an ARPA model and queries it with BackoffNgramLM
>>> import io
>>> from speechbrain.lm.ngram import BackoffNgramLM
>>> # First we'll put an ARPA format model in TextIO and load it:
>>> with io.StringIO() as f:
...     print("Anything can be here", file=f)
...     print("", file=f)
...     print("\\data\\", file=f)
...     print("ngram 1=2", file=f)
...     print("ngram 2=3", file=f)
...     print("", file=f)  # Ends data section
...     print("\\1-grams:", file=f)
...     print("-0.6931 a", file=f)
...     print("-0.6931 b 0.", file=f)
...     print("", file=f)  # Ends unigram section
...     print("\\2-grams:", file=f)
...     print("-0.6931 a a", file=f)
...     print("-0.6931 a b", file=f)
...     print("-0.6931 b a", file=f)
...     print("", file=f)  # Ends bigram section
...     print("\\end\\", file=f)  # Ends whole file
...     _ = f.seek(0)
...     num_grams, ngrams, backoffs = read_arpa(f)
>>> # The output of read arpa is already formatted right for the query class:
>>> lm = BackoffNgramLM(ngrams, backoffs)
>>> lm.logprob("a", context = tuple())
-0.6931
>>> # Query that requires a backoff:
>>> lm.logprob("b", context = ("b",))
-0.6931

Authors
 * Aku Rouhe 2020
 * Pierre Champion 2023
    N)Path)Union)
get_loggerc              	   C   s  t |  i }| D ]?}| }|dd dkr.|d\}}t| d }t|}|||< q|s8t| \}} nt|rDd}t|} ntdi }i }	|st	t
}
i }|d }zG| D ]B}| }t| }t|d	 }t||kr|dd
 }|d
 }t|d }||f }|||< n
|dd }|d }||
| |< q\W n4 ttfy   |
||< ||	|< |st| \}}nt|rd}t|}nt|rd}d}ntdY nw |rN| | kstd|||	fS )a  
    Reads an ARPA format N-gram language model from a stream

    Arguments
    ---------
    fstream : TextIO
        Text file stream (as commonly returned by open()) to read the model
        from.

    Returns
    -------
    dict
        Maps N-gram orders to the number ngrams of that order. Essentially the
        \data\ section of an ARPA format file.
    dict
        The log probabilities (first column) in the ARPA file.
        This is a triply nested dict.
        The first layer is indexed by N-gram order (integer).
        The second layer is indexed by the context (tuple of tokens).
        The third layer is indexed by tokens, and maps to the log prob.
        This format is compatible with `speechbrain.lm.ngram.BackoffNGramLM`
        Example:
        In ARPA format, log(P(fox|a quick red)) = -5.3 is expressed:
            `-5.3 a quick red fox`
        And to access that probability, use:
            `ngrams_by_order[4][('a', 'quick', 'red')]['fox']`
    dict
        The log backoff weights (last column) in the ARPA file.
        This is a doubly nested dict.
        The first layer is indexed by N-gram order (integer).
        The second layer is indexed by the backoff history (tuple of tokens)
        i.e. the context on which the probability distribution is conditioned
        on. This maps to the log weights.
        This format is compatible with `speechbrain.lm.ngram.BackoffNGramLM`
        Example:
        If log(P(fox|a quick red)) is not listed, we find
        log(backoff(a quick red)) = -23.4 which in ARPA format is:
            `<logp> a quick red -23.4`
        And to access that here, use:
            `backoffs_by_order[3][('a', 'quick', 'red')]`

    Raises
    ------
    ValueError
        If no LM is found or the file is badly formatted.
    N   ngram=   FzNot a properly formatted line   r   T"Not a properly formatted ARPA file)_find_data_sectionstripsplitint_next_section_or_end_starts_ngrams_section_parse_order
ValueErrorcollectionsdefaultdictdicttuplefloatlen
IndexError
_ends_arpakeys)fstream
num_ngramslinelhsrhsorder	num_gramsendedngrams_by_orderbackoffs_by_orderprobsbackoffsbackoff_line_length	all_partsprobcontexttokenbackoffbackoff_context r1   Q/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lm/arpa.py	read_arpaI   sp   5




#
r3   c                 C   s(   | D ]}|dd dkr dS qt d)zI
    Reads (lines) from the stream until the \data\ header is found.
    N   z\data\r   )r   )r   r    r1   r1   r2   r      s
   r   c                 C   sD   | D ]}|  }t|rt|}d|f  S t|r dS qtd)z
    Arguments
    ---------
    fstream : stream
        Stream from which to read lines

    Returns
    -------
    bool
        Whether end was found.
    int
        The order of section that starts
    F)TNr   )r   r   r   r   r   )r   r    r#   r1   r1   r2   r      s   r   c                 C   s   |   dS )Nz-grams:)r   endswithr    r1   r1   r2   r      s   r   c                 C   s   t | dd  dd }|S )Nr   -r   )r   r   )r    r#   r1   r1   r2   r      s   r   c                 C   s   | dkS )Nz\end\r1   r6   r1   r1   r2   r      s   r   #0T	words_txtin_arpaout_fstngram_orderdisambig_symbolcachec           
      C   s  zddl m} W n ty   tdw |r| rdS | s*t| d| dztd| d |t||t| |d	}W n ty` } ztd
| d| d| d|   |d}~ww td|  t	|ddd}	|	
| W d   dS 1 sw   Y  dS )a  
    Use kaldilm to convert an ARPA LM to FST. For example, you could use
    speechbrain.lm.train_ngram to create an ARPA LM and then use this function
    to convert it to an FST.

    It is worth noting that if the fst already exists in the output_dir,
    then they will not be converted again (so you may need to delete them
    by hand if you, at any point, change your ARPA model).

    Arguments
    ---------
    words_txt: str | Path
        path to the words.txt file created by prepare_lang.
    in_arpa: str | Path
        Path to an ARPA LM to convert to an FST.
    out_fst: str | Path
        Path to where the fst will be saved.
    ngram_order: int
        ARPA (and FST) ngram order.
    disambig_symbol: str
        the disambiguation symbol to use.
    cache: bool
        Whether or not to re-create the fst.txt file if it already exist.

    Raises
    ------
    ImportError: If kaldilm is not installed.

    Returns
    -------
    None

    Example
    -------
    >>> from speechbrain.lm.arpa import arpa_to_fst

    >>> # Create a small arpa model
    >>> arpa_file = getfixture('tmpdir').join("bigram.arpa")
    >>> arpa_file.write(
    ...     "Anything can be here\n"
    ...     + "\n"
    ...     + "\\data\\\n"
    ...     + "ngram 1=3\n"
    ...     + "ngram 2=4\n"
    ...     + "\n"
    ...     + "\\1-grams:\n"
    ...     + "0 <s>\n"
    ...     + "-0.6931 a\n"
    ...     + "-0.6931 b 0.\n"
    ...     + "" # Ends unigram section
    ...     + "\\2-grams:\n"
    ...     + "-0.6931 <s> a\n"
    ...     + "-0.6931 a a\n"
    ...     + "-0.6931 a b\n"
    ...     + "-0.6931 b a\n"
    ...     + "\n"  # Ends bigram section
    ...     + "\\end\\\n")  # Ends whole file
    >>> # Create words vocab
    >>> vocav = getfixture('tmpdir').join("words.txt")
    >>> vocav.write(
    ...     "a 1\n"
    ...     + "b 2\n"
    ...     + "<s> 3\n"
    ...     + "#0 4")  # Ends whole file
    >>> out = getfixture('tmpdir').join("bigram.txt.fst")
    >>> arpa_to_fst(vocav, arpa_file, out, 2)
    r   )arpa2fstz\Optional dependencies must be installed to use kaldilm.
Install using `pip install kaldilm`.Nz& not found while trying to create the z FST.zConverting arpa LM 'z' to FST)
input_arpar=   read_symbol_table	max_orderzFailed to create z-gram FST from input=z, disambig_symbol=z, read_symbol_table=zWriting wzutf-8)encoding)kaldilm.arpa2fstr?   ImportErrorexistsFileNotFoundErrorloggerinfostr	Exceptionopenwrite)
r9   r:   r;   r<   r=   r>   r?   sefr1   r1   r2   arpa_to_fst   sL   K	
"rR   )r8   T)__doc__r   pathlibr   typingr   speechbrain.utils.loggerr   __name__rI   r3   r   r   r   r   r   rK   r   boolrR   r1   r1   r1   r2   <module>   s6    ?x	


