o
    'Ni}                     @   sX   d Z ddlZddlmZ ddlmZ edZedZedZ	dd	 Z
dddZdS )z
Sentence splitter for Indian languages. Contains a rule-based 
sentence splitter that can understand common non-breaking phrases
in many Indian languages.
    N)unicode_transliterate)langinfoz[\?!\u0964\u0965]zI[\.\?!\u0964\u0965\uAAF1\uAAF0\uABEB\uABEC\uABED\uABEE\uABEF\u1C7E\u1C7F]z[\u0964\u0965]c                 C   s   h d}t j| |d|v S )zIs the text a non-breaking phrase

    Args:
        text (str): text to check for non-breaking phrase
        lang (str): ISO 639-2 language code

    Returns:
        boolean: true if `text` is a non-breaking phrase
    >}      ऄ   अ   आ   इ   ई   उ   ऊ   ऋ   ऌ   ऍ   ऎ   ए   ऐ   ऑ   ऒ   ओ   औ   क   ख   ग   घ   ङ   च   छ   ज   झ   ञ   ट   ठ   ड   ढ   ण   त   थ   द   ध   न   ऩ   प   फ   ब   भ   म   य   र   ऱ   ल   ळ   ऴ   व   श   ष   स   ह   ॠ   ॡ   आइ   आई   आर   ऎच   ऎन   ऎफ   ऎम   ऎल   ऎस   एच   एन   एफ   एम   एल   एस   कु   कॆ   के   चि   जि   जी   जॆ   जे   टि   टी   डि   डी   डॉ   पि   पी   बि   बी   यु   यू   वि   वी   सि   सी   सौ	   आर्	   ऎच्	   ऎन्	   ऎफ्	   ऎम्	   ऎल्	   ऎस्	   एच्	   एन्	   एफ्	   एम्	   एल्	   एस्	   जेड	   वाय	   ज़ेड   ऎक्स   एक्स   क्यु   क्यू   जेड्   वाय्   व्हि   व्ही   श्री   ज़ेड्   ऎक्स्   एक्स्   डब्ल्यु   डब्ल्यूhi)r   UnicodeIndicTransliteratortransliterate)textlang	ack_chars r   W/home/ubuntu/.local/lib/python3.10/site-packages/indicnlp/tokenize/sentence_tokenize.pyis_acronym_abbvr#   s   sr   autoc                 C   s  |dkrddl m} || }|S |dkr(t|r&t| du r#t}nt}nt}g }d}|  } |	| D ]2}|
 }| }	|dkrL| |d   rLq5|d }
| ||
  }t|dkrc|| |d }q5| |d  }t|dkr{|| |ds|S g }d}d	}t|D ]h\}}|d
}t|dkr|d dkrd}|d
 | }q|d dkrt|d dd |rt|dkr|s|| d}|}q|r|d
 | }t|dkr|| d}d	}qt|dkr|| |}d	}qt|dkr|| |S )a	  split the text into sentences

    A rule-based sentence splitter for Indian languages written in 
    Brahmi-derived scripts. The text is split at sentence delimiter 
    boundaries. The delimiters can be configured by passing appropriate
    parameters. 

    The sentence splitter can identify non-breaking phrases like 
    single letter, common abbreviations/honorofics for some Indian 
    languages.

    Args:
        text (str): text to split into sentence
        lang (str): ISO 639-2 language code
        delim_pat (str): regular expression to identify sentence delimiter characters. If set to 'auto', the delimiter pattern is chosen automatically based on the language and text. 


    Returns:
        list: list of sentences identified from the input text 
    urr   )sentence_tokenizerr   N   . F T)urduhack.tokenizationr   r   is_danda_delimCONTAINS_DANDAsearchDELIM_PAT_NO_DANDADELIM_PAT_DANDAstripfinditerstartend	isnumericlenappend	enumeratesplitr   )r   r   	delim_patr   	sentencescand_sentencesbeginmop1p2r   sfinal_sentences
sen_buffer	bad_stateisentencewordsr   r   r   sentence_split   sl   




	
"



r   )r   )__doc__reindicnlp.transliterater   indicnlpr   compiler   r   r   r   r   r   r   r   r   <module>   s   


 