o
    %ݫi                     @   s   d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZ eeZzddlZW n ey5   edw dedee fd	d
ZG dd dZdee dddee fddZdddZG dd dZdS )a  Language model wrapper for kenlm n-gram.

This file is based on the implementation of the kenLM wrapper from
PyCTCDecode (see: https://github.com/kensho-technologies/pyctcdecode) and
is used in CTC decoders.

See: speechbrain.decoders.ctc.py

Authors
 * Adel Moumen 2023
    N)
CollectionOptionalSetTuplecast)CharTrie)
get_loggerzwkenlm python bindings are not installed. To install it use: pip install https://github.com/kpu/kenlm/archive/master.zip	arpa_pathreturnc                 C   s   t  }t| dd:}d}|D ]-}| }|dkrd}n|dkr! n|r;t|dkr;|d}t|d	kr;||d
  qW d   n1 sFw   Y  t|dkrUtd|S )zRead unigrams from arpa file.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    arpa_path : str
        Path to arpa file.

    Returns
    -------
    unigrams : set
        Set of unigrams.
    zutf-8)encodingFz	\1-grams:Tz	\2-grams:r   	      NzANo unigrams found in arpa file. Something is wrong with the file.)setopenstriplensplitadd
ValueError)r	   unigramsfstart_1_gramlineparts r   W/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/decoders/language_model.pyload_unigram_set_from_arpa   s(   
r   c                   @   s(   e Zd ZdZd
ddZedddZd	S )
KenlmStateaE  Wrapper for kenlm state.

    This is a wrapper for the kenlm state object. It is used to make sure that the
    state is not modified outside of the language model class.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    state : kenlm.State
        Kenlm state object.
    statekenlm.Statec                 C   s
   || _ d S )N_state)selfr   r   r   r   __init__P   s   
zKenlmState.__init__r
   c                 C   s   | j S )zGet the raw state object.r!   r#   r   r   r   r   S   s   zKenlmState.stateN)r   r    r
   r    )__name__
__module____qualname____doc__r$   propertyr   r   r   r   r   r   B   s
    
r   r   kenlm_modelkenlm.Modelc                    s|   t | dk rtdt |  t| }t fdd|D }t | dkr%dnt |t |  }|dk r<tdt|d	 d
 |S )aM  Filter unigrams down to vocabulary that exists in kenlm_model.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    unigrams : list
        List of unigrams.
    kenlm_model : kenlm.Model
        Kenlm model.

    Returns
    -------
    unigram_set : set
        Set of unigrams.
    i  zHOnly %s unigrams passed as vocabulary. Is this small or artificial data?c                    s   g | ]}| v r|qS r   r   ).0tr,   r   r   
<listcomp>r   s    z(_prepare_unigram_set.<locals>.<listcomp>r         ?g?zOnly %s%% of unigrams in vocabulary found in kenlm model-- this might mean that your vocabulary and language model are incompatible. Is this intentional?d   r   )r   loggerwarningr   round)r   r,   unigram_setretained_fractionr   r0   r   _prepare_unigram_setY   s   r9   r    c                  C   s&   zt  } W | S  ty   tdw )zGet uninitialized kenlm state.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Returns
    -------
    kenlm_state : kenlm.State
        Empty kenlm state.
    z3To use a language model, you need to install kenlm.)kenlmStateImportErrorr   )kenlm_stater   r   r   _get_empty_lm_state   s   

r>   c                   @   s   e Zd ZdZ					d!ddd	eee  d
ededededdfddZ	e
defddZdefddZdddefddZdedefddZ	d"dededeeef fdd ZdS )#LanguageModela  Language model container class to consolidate functionality.

    This class is a wrapper around the kenlm language model. It provides
    functionality to score tokens and to get the initial state.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    kenlm_model : kenlm.Model
        Kenlm model.
    unigrams : list
        List of known word unigrams.
    alpha : float
        Weight for language model during shallow fusion.
    beta : float
        Weight for length score adjustment of during scoring.
    unk_score_offset : float
        Amount of log score offset for unknown tokens.
    score_boundary : bool
        Whether to have kenlm respect boundaries when scoring.
    N      ?      ?      $Tr,   r-   r   alphabetaunk_score_offsetscore_boundaryr
   c           	      C   sb   || _ |d u rtd t }d }nt|| j }t|}|| _|| _|| _	|| _
|| _|| _d S )NzBNo known unigrams provided, decoding results might be a lot worse.)_kenlm_modelr4   r5   r   r9   r   fromkeys_unigram_set
_char_trierC   rD   rE   rF   )	r#   r,   r   rC   rD   rE   rF   r7   	char_trier   r   r   r$      s   	

zLanguageModel.__init__c                 C   s   t t| jjS )z+Get the order of the n-gram language model.)r   intrG   orderr%   r   r   r   rM      s   zLanguageModel.orderc                 C   s4   t  }| jr| j| t|S | j| t|S )zGet initial lm state.)r>   rF   rG   BeginSentenceWriteNullContextWriter   )r#   start_stater   r   r   get_start_state   s   zLanguageModel.get_start_staterP   r    c                 C   s(   | j rt }| j|d|}|S d}|S )zCalculate final lm score.z</s>g        )rF   r>   rG   	BaseScore)r#   rP   	end_statescorer   r   r   _get_raw_end_score   s   z LanguageModel._get_raw_end_scorepartial_tokenc                 C   sN   | j du rd}n
t| j |dk}| j| }t|dkr%|t| d }|S )zGet partial token score.Nr2   r      )rJ   rL   has_noderE   r   )r#   rV   is_oov	unk_scorer   r   r   score_partial_token   s   

z!LanguageModel.score_partial_tokenFwordis_last_wordc                 C   s   t |tstdt| t }| j|j||}t| j	dkr&|| j	vs+|| jvr0|| j
7 }|r9|| | }| j| d ttj | j }|t|fS )z&Score word conditional on start state.z7Wrong input state type found. Expected KenlmState, got r   r2   )
isinstancer   AssertionErrortyper>   rG   rR   r   r   rI   rE   rU   rC   mathlog10erD   )r#   
prev_stater\   r]   rS   lm_scorer   r   r   rT      s    



 zLanguageModel.score)Nr@   rA   rB   T)F)r'   r(   r)   r*   r   r   strfloatboolr$   r+   rL   rM   r   rQ   rU   r[   r   rT   r   r   r   r   r?      sF    

	
r?   r&   )r*   ra   typingr   r   r   r   r   pygtrier   speechbrain.utils.loggerr   r'   r4   r:   r<   rf   r   r   r9   r>   r?   r   r   r   r   <module>   s0    #

&