o
    'N i  ã                   @   s€   d Z ddlZddlZddlZddlmZ e dej d d ¡Ze dej d d ¡Z	e d¡Z
d	d
„ Zdd„ Zddd„ZdS )z©
Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
are supported (see `trivial_tokenize`). Major Indian language punctuations are
handled. 
é    N)ÚIndicNlpExceptionz([zB\u0964\u0965\uAAF1\uAAF0\uABEB\uABEC\uABED\uABEE\uABEF\u1C7E\u1C7Fz])z6\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4z([0-9]+ [,.:/] )+[0-9]+c                 C   s¢   t  d|  dd¡¡}t dd|¡ d¡}d}d}t |¡D ]$}| ¡ }| ¡ }||krA||||…  }||||…  dd¡ }|}q|||d…  }|}| 	d¡S )a™  tokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial tokenizer which just tokenizes on the punctuation boundaries. 
    This also includes punctuations for the Indian language scripts (the 
    purna virama and the deergha virama). This is a language independent 
    tokenizer

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens

    ú \1 ú	ú ú[ ]+Ú r   N)
Útriv_tokenizer_indic_patÚsubÚreplaceÚreÚstripÚpat_num_seqÚfinditerÚstartÚendÚsplit)ÚtextÚtok_strÚsÚnew_sÚprevÚmr   r   © r   úT/home/ubuntu/.local/lib/python3.10/site-packages/indicnlp/tokenize/indic_tokenize.pyÚtrivial_tokenize_indic   s   €
r   c                 C   s.   t  d|  dd¡¡}t dd|¡ d¡ d¡S )aƒ  tokenize Urdu string 

    A trivial tokenizer which just tokenizes on the punctuation boundaries. 
    This also includes punctuations for the Urdu script.
    These punctuations characters were identified from the Unicode database 
    for Arabic script by looking for punctuation symbols.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    r   r   r   r   )Útriv_tokenizer_urdu_patr	   r
   r   r   r   )r   r   r   r   r   Útrivial_tokenize_urduA   s   r   Úhic                 C   s   |dkrt | ƒS t| ƒS )a°  trivial tokenizer for Indian languages using Brahmi for Arabic scripts

    A trivial tokenizer which just tokenizes on the punctuation boundaries. 
    Major punctuations specific to Indian langauges are handled. 
    These punctuations characters were identified from the Unicode database. 

    Args:
        text (str): text to tokenize
        lang (str): ISO 639-2 language code

    Returns:
        list: list of tokens
    Úur)r   r   )r   Úlangr   r   r   Útrivial_tokenizeT   s   r    )r   )Ú__doc__Ústringr   ÚsysÚindicnlp.commonr   ÚcompileÚpunctuationr   r   r   r   r   r    r   r   r   r   Ú<module>   s   
$