o
    qia                  
   @   s   d Z ddlmZ ddlmZ ddlZddlZddlm	Z	 de
deeef fdd	Zd
e
dededee	e	f fddZddefddZde
de
fddZde
de
ddfddZdS )zx
keras_tokenizer module
-------------------------------

This module create tokens using a pre-trained sequence model .
    )Path)TupleN)ndarray
vocab_pathreturnc                 C   sn   t | ddd}| }W d   n1 sw   Y  td| }dd t|D }dd t|D }||fS )	z
    Maps characters to integers and vice versa
    Args:
        vocab_path (str): path to the vocab file
    Returns:
        Two dictionaries containing character to integer mapping and integer to character mapping
    rzUTF-8)encodingN_c                 S   s   i | ]\}}||qS  r
   .0idxcharr
   r
   Y/home/ubuntu/.local/lib/python3.10/site-packages/urduhack/tokenization/keras_tokenizer.py
<dictcomp>       z_load_vocab.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r
   r
   r   r
   r
   r   r      r   )openreadlinelist	enumerate)r   
vocab_filevocabchar2idxidx2charr
   r
   r   _load_vocab   s   	
r   sentencer   max_lenc                 C   s   t jt| |ftd}t t| |f}d}| D ](}|dkr'd|d|d f< q||v r@|| |d|f< |d7 }||kr@ ||fS q||fS )aR  
    Makes the input and output arrays for the data explaining where is a character or a space

    Args:
        sentence (str): Sentence to be tokenized
        char2idx (dict): Dict containing character to integer mapping
        max_len (int): integer
    Returns:
        Input and Output arrays representing features and labels
    )dtyper       )npzeroslenint)r   r   r   input_output_
char_indexletterr
   r
   r   _preprocess_sentence!   s    r(         ?c                 C   sx   | dk}| | }|| }d}g }t |jd D ]}	||	 }
|
dkr'|||
 7 }||	 |kr4|| d}q|| |S )a  
    Retrieve the original words from predicted and actual arrays as per char2idx mapping

    Args:
        features (array): Input array
        labels (array): Output array
        idx2char (dict): Dict mapping integer to character
        thresh (float): Confidence to tell whether prediction is a character or space
    Returns:
        list : Containing ``urdu`` word tokens
    r    )rangeshapeappend)featureslabelsr   threshmasklettersspacesfinaltokensr'   r   r
   r
   r   _retrieve_words;   s   

r6   
model_pathc                 C   s$   t jj| }t|\}}|||fS )z
    Loads pre_trained keras model and vocab file

    Args:
        model_path (str): Path to the model file
        vocab_path (str): Path to the vocab file
    Returns:
        tuple: contains object
    )tfkerasmodels
load_modelr   )r7   r   model_	char2idx_	idx2char_r
   r
   r   _load_modelW   s   

r?   c                 C   s,   t |  st | sd}t|dS dS )a   
    Check if the models file exist.

    Args:
        model_path (str): path to the tokenizer model file
        vocab_path (str): Path to the tokenizer vocab file
    Raises:
        FileNotFoundError: If model_path does not exist
    Returns: None
    zWord tokenizer Model not found!Please run 'urduhack download' in terminal.Doc: https://urduhack.readthedocs.io/en/stable/installation.html#downloading-modelsN)r   existsFileNotFoundError)r7   r   _errorr
   r
   r   _is_model_existf   s   rC   )r)   )__doc__pathlibr   typingr   numpyr    
tensorflowr8   r   strdictr   r#   r(   r   r6   r?   rC   r
   r
   r
   r   <module>   s   "