o
    oi                     @   s   d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZ eeZzddlZW n ey5   edw dedee fd	d
ZG dd dZdee dddee fddZdddZG dd dZdS )a  Language model wrapper for kenlm n-gram.

This file is based on the implementation of the kenLM wrapper from
PyCTCDecode (see: https://github.com/kensho-technologies/pyctcdecode) and
is used in CTC decoders.

See: speechbrain.decoders.ctc.py

Authors
 * Adel Moumen 2023
    N)
CollectionOptionalSetTuplecast)CharTrie)
get_loggerzwkenlm python bindings are not installed. To install it use: pip install https://github.com/kpu/kenlm/archive/master.zip	arpa_pathreturnc                 C   s   t  }t| :}d}|D ]-}| }|dkrd}n|dkr n|r9t|dkr9|d}t|dkr9||d  qW d	   n1 sDw   Y  t|dkrStd
|S )zRead unigrams from arpa file.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    arpa_path : str
        Path to arpa file.

    Returns
    -------
    unigrams : set
        Set of unigrams.
    Fz	\1-grams:Tz	\2-grams:r   	      NzANo unigrams found in arpa file. Something is wrong with the file.)setopenstriplensplitadd
ValueError)r	   unigramsfstart_1_gramlineparts r   a/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/speechbrain/decoders/language_model.pyload_unigram_set_from_arpa   s(   

r   c                   @   s(   e Zd ZdZd
ddZedddZd	S )
KenlmStateaE  Wrapper for kenlm state.

    This is a wrapper for the kenlm state object. It is used to make sure that the
    state is not modified outside of the language model class.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    state : kenlm.State
        Kenlm state object.
    statekenlm.Statec                 C   s
   || _ d S )N_state)selfr   r   r   r   __init__P   s   
zKenlmState.__init__r
   c                 C   s   | j S )zGet the raw state object.r    r"   r   r   r   r   S   s   zKenlmState.stateN)r   r   r
   r   )__name__
__module____qualname____doc__r#   propertyr   r   r   r   r   r   B   s
    
r   r   kenlm_modelkenlm.Modelc                    s|   t | dk rtdt |  t| }t fdd|D }t | dkr%dnt |t |  }|dk r<tdt|d	 d
 |S )aM  Filter unigrams down to vocabulary that exists in kenlm_model.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    unigrams : list
        List of unigrams.
    kenlm_model : kenlm.Model
        Kenlm model.

    Returns
    -------
    unigram_set : set
        Set of unigrams.
    i  zHOnly %s unigrams passed as vocabulary. Is this small or artificial data?c                    s   g | ]}| v r|qS r   r   ).0tr+   r   r   
<listcomp>r   s    z(_prepare_unigram_set.<locals>.<listcomp>r         ?g?zOnly %s%% of unigrams in vocabulary found in kenlm model-- this might mean that your vocabulary and language model are incompatible. Is this intentional?d   r   )r   loggerwarningr   round)r   r+   unigram_setretained_fractionr   r/   r   _prepare_unigram_setY   s   r8   r   c                  C   s&   zt  } W | S  ty   tdw )zGet uninitialized kenlm state.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Returns
    -------
    kenlm_state : kenlm.State
        Empty kenlm state.
    z3To use a language model, you need to install kenlm.)kenlmStateImportErrorr   )kenlm_stater   r   r   _get_empty_lm_state   s   

r=   c                   @   s   e Zd ZdZ					d!ddd	eee  d
ededededdfddZ	e
defddZdefddZdddefddZdedefddZ	d"dededeeef fdd ZdS )#LanguageModela  Language model container class to consolidate functionality.

    This class is a wrapper around the kenlm language model. It provides
    functionality to score tokens and to get the initial state.

    Taken from: https://github.com/kensho-technologies/pyctcdecode

    Arguments
    ---------
    kenlm_model : kenlm.Model
        Kenlm model.
    unigrams : list
        List of known word unigrams.
    alpha : float
        Weight for language model during shallow fusion.
    beta : float
        Weight for length score adjustment of during scoring.
    unk_score_offset : float
        Amount of log score offset for unknown tokens.
    score_boundary : bool
        Whether to have kenlm respect boundaries when scoring.
    N      ?      ?      $Tr+   r,   r   alphabetaunk_score_offsetscore_boundaryr
   c           	      C   sb   || _ |d u rtd t }d }nt|| j }t|}|| _|| _|| _	|| _
|| _|| _d S )NzBNo known unigrams provided, decoding results might be a lot worse.)_kenlm_modelr3   r4   r   r8   r   fromkeys_unigram_set
_char_trierB   rC   rD   rE   )	r"   r+   r   rB   rC   rD   rE   r6   	char_trier   r   r   r#      s   	

zLanguageModel.__init__c                 C   s   t t| jjS )z+Get the order of the n-gram language model.)r   intrF   orderr$   r   r   r   rL      s   zLanguageModel.orderc                 C   s4   t  }| jr| j| t|S | j| t|S )zGet initial lm state.)r=   rE   rF   BeginSentenceWriteNullContextWriter   )r"   start_stater   r   r   get_start_state   s   zLanguageModel.get_start_staterO   r   c                 C   s(   | j rt }| j|d|}|S d}|S )zCalculate final lm score.z</s>g        )rE   r=   rF   	BaseScore)r"   rO   	end_statescorer   r   r   _get_raw_end_score   s   z LanguageModel._get_raw_end_scorepartial_tokenc                 C   sN   | j du rd}n
t| j |dk}| j| }t|dkr%|t| d }|S )zGet partial token score.Nr1   r      )rI   rK   has_noderD   r   )r"   rU   is_oov	unk_scorer   r   r   score_partial_token   s   

z!LanguageModel.score_partial_tokenFwordis_last_wordc                 C   s   t |tstdt| t }| j|j||}t| j	dkr&|| j	vs+|| jvr0|| j
7 }|r9|| | }| j| d ttj | j }|t|fS )z&Score word conditional on start state.z7Wrong input state type found. Expected KenlmState, got r   r1   )
isinstancer   AssertionErrortyper=   rF   rQ   r   r   rH   rD   rT   rB   mathlog10erC   )r"   
prev_stater[   r\   rR   lm_scorer   r   r   rS      s    



 zLanguageModel.score)Nr?   r@   rA   T)F)r&   r'   r(   r)   r   r   strfloatboolr#   r*   rK   rL   r   rP   rT   rZ   r   rS   r   r   r   r   r>      sF    

	
r>   r%   )r)   r`   typingr   r   r   r   r   pygtrier   speechbrain.utils.loggerr   r&   r3   r9   r;   re   r   r   r8   r=   r>   r   r   r   r   <module>   s0    #

&