o
    ½e¦i  ã                   @   s8   d Z ddlZedƒZG dd„ dƒZdd„ Zdd	d
„ZdS )zB
N-gram language model query interface

Authors
 * Aku Rouhe 2020
é    Nz-infc                   @   s&   e Zd ZdZdd„ Zeƒ fdd„ZdS )ÚBackoffNgramLMa¯  
    Query interface for backoff N-gram language models

    The ngrams format is best explained by an example query: P( world | <s>,
    hello ), i.e. trigram model, probability of "world" given "<s> hello", is:
    `ngrams[2][("<s>", "hello")]["world"]`

    On the top level, ngrams is a dict of different history lengths, and each
    order is a dict, with contexts (tuples) as keys and (log-)distributions
    (dicts) as values.

    The backoffs format is a little simpler. On the top level, backoffs is a
    list of different context-orders, and each order is a mapping (dict) from
    backoff context to backoff (log-)weight

    Arguments
    ---------
    ngrams : dict
        The N-gram log probabilities.
        This is a triply nested dict.
        The first layer is indexed by N-gram order (integer).
        The second layer is indexed by the context (tuple of tokens).
        The third layer is indexed by tokens, and maps to the log prob.
        Example:
        log(P(fox|a quick red)) = -5.3 is accessed by:
        `ngrams[4][('a', 'quick', 'red')]['fox']`
    backoffs : dict
        The backoff log weights.
        This is a doubly nested dict.
        The first layer is indexed by N-gram order (integer).
        The second layer is indexed by the backoff history (tuple of tokens)
        i.e. the context on which the probability distribution is conditioned
        on. This maps to the log weights.
        Example:
        If log(P(fox|a quick red)) is not listed, we find
        log(backoff(a quick red)) = -23.4, which is accessed:
        `backoffs[3][('a', 'quick', 'red')]`
        This dict needs to have entries for orders up to at least N-1 (even if
        they are empty). It may also have entries for order N, though those
        can never be accessed.

    Example
    -------
    >>> import math
    >>> ngrams = {1: {tuple(): {'a': -0.6931, 'b': -0.6931}},
    ...           2: {('a',): {'a': -0.6931, 'b': -0.6931},
    ...               ('b',): {'a': -0.6931}}}
    >>> backoffs = {1: {('b',): 0.}}
    >>> lm = BackoffNgramLM(ngrams, backoffs)
    >>> round(math.exp(lm.logprob('a', ('b',))), 1)
    0.5
    >>> round(math.exp(lm.logprob('b', ('b',))), 1)
    0.5

    c                 C   sB   t |ƒ}t |ƒ|kst |ƒ|d kstdƒ‚|| _|| _|| _d S )Né   z+Backoffs dict needs to be of order N or N-1)ÚlenÚ
ValueErrorÚngramsÚbackoffsÚ	top_order)Úselfr   r   Úorder© r   úR/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lm/ngram.pyÚ__init__F   s   
zBackoffNgramLM.__init__c                 C   sž   t |ƒd }|| jkr|  ||dd… ¡S || j| v r.|| j| | v r.| j| | | S |dkr4tS |d }| j|  |d¡}|  ||dd… ¡}|| S )z2Computes the backoff log weights and applies them.r   Ng        )r   r   Úlogprobr   ÚNEGINFINITYr   Úget)r	   ÚtokenÚcontextÚquery_orderÚcontext_orderÚbackoff_log_weightÚlpr   r   r   r   Q   s   
zBackoffNgramLM.logprobN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Útupler   r   r   r   r   r      s    8r   c              	   C   s\   g }| D ]'}t  ¡ }|D ]\}}|d  d7  < |d  | ||¡ 7  < q| |¡ q|S )a@  
    Evaluates the N-gram LM on each sentence in data

    Call `ngram_perplexity` with the output of this function to compute the
    perplexity.

    Arguments
    ---------
    data : iterator
        An iterator over sentences, where each sentence should be an iterator
        as returned by `speechbrain.lm.counting.ngrams_for_evaluation`
    LM : BackoffNgramLM
        The language model to evaluate

    Returns
    -------
    list
        List of `collections.Counter`s which have the keys "num_tokens" and
        "neglogprob", giving the number of tokens and logprob of each sentence
        (in the same order as data).

    NOTE
    ----
    The `collections.Counter` cannot add negative numbers. Thus it is important
    to use negative log probabilities (always >=0).

    Example
    -------
    >>> class MockLM:
    ...     def __init__(self):
    ...         self.top_order = 3
    ...     def logprob(self, token, context):
    ...         return -1.0
    >>> LM = MockLM()
    >>> data = [[("S", ("<s>",)),
    ...          ("p", ("<s>", "S")),
    ...          ("e", ("S", "p")),
    ...          ("e", ("p", "e")),
    ...          ("c", ("e", "e")),
    ...          ("h", ("e", "c")),
    ...          ("</s>", ("c", "h"))],
    ...         [("B", ("<s>",)),
    ...          ("r", ("<s>", "B")),
    ...          ("a", ("B", "r")),
    ...          ("i", ("r", "a")),
    ...          ("n", ("a", "i")),
    ...          ("</s>", ("i", "n"))]]
    >>> sum(ngram_evaluation_details(data, LM), collections.Counter())
    Counter({'num_tokens': 13, 'neglogprob': 13.0})

    Ú
num_tokensr   Ú
neglogprob)ÚcollectionsÚCounterr   Úappend)ÚdataÚLMÚdetailsÚsentenceÚcounterr   r   r   r   r   Úngram_evaluation_detailsn   s   4r&   ç      $@c                 C   s*   t | t ¡ ƒ}|d |d  }|| }|S )a?  
    Computes perplexity from a list of individual sentence evaluations.

    Arguments
    ---------
    eval_details : list
        List of individual sentence evaluations. As returned by
        `ngram_evaluation_details`
    logbase : float
        The logarithm base to use.

    Returns
    -------
    float
        The computed perplexity.

    Example
    -------
    >>> eval_details = [
    ...     collections.Counter(neglogprob=5, num_tokens=5),
    ...     collections.Counter(neglogprob=15, num_tokens=15)]
    >>> ngram_perplexity(eval_details)
    10.0

    r   r   )Úsumr   r   )Úeval_detailsÚlogbaser%   ÚexponentÚ
perplexityr   r   r   Úngram_perplexity¬   s   r-   )r'   )r   r   Úfloatr   r   r&   r-   r   r   r   r   Ú<module>   s    a>