o
    qi%                     @   sv  d Z ddlZddlZddlZddddddd	d
ddddddddZejdejejB dZ	edZ
edZedddd eD ZedZedZejdejejB dZejdejdZdefd d!Zd5defd#d$Zd5defd%d&Zd5defd'd(Zd5defd)d*Zd6defd+d,Zed-d eejD d"Zd6ded.efd/d0Zded.efd1d2Z defd3d4Z!dS )7z
Preprocessing utilities
    NUSDPLNGBPJPYTHBCRCNGNKRWILSVNDEURPHPPYGUAHINR)$u   zł   £   ¥u   ฿u   ₡u   ₦u   ₩u   ₪u   ₫u   €u   ₱u   ₲u   ₴u   ₹z\(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))flagszp(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W))u{   (?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))z({})+|c                 c   s    | ]}t |V  qd S N)reescape.0c r   O/home/ubuntu/.local/lib/python3.10/site-packages/urduhack/preprocessing/util.py	<genexpr>   s    r   z((\r\n)|[\n\v])+z	(?!\n)\s+a  (?:^|(?<![\w/.]))(?:(?:https?://|ftp://|www\d{0,3}\.))(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?(?:$|(?![\w?!+&/]))zn(?:^|(?<![\w/.]))(?:(?:https?://)?)(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}/[^\s.,?!'\"|+]{2,12}(?:$|(?![\w?!+&/]))textc                 C   s   t dtd|  S )u  
    Given ``text`` str, replace one or more spacings with a single space, and one
    or more linebreaks with a single newline. Also strip leading/trailing whitespace.

    Args:
        text (str): ``Urdu`` text
    Returns:
        str: Returns a ``str`` object containing normalized text.
    Examples:
        >>> from urduhack.preprocessing import normalize_whitespace
        >>> text = "عراق اور شام     اعلان کیا ہے دونوں         جلد اپنے     گے؟"
        >>> normalized_text = normalize_whitespace(text)
        >>> normalized_text
        عراق اور شام اعلان کیا ہے دونوں جلد اپنے گے؟
     z\n)_NONBREAKING_SPACE_REsub_LINEBREAK_REstripr    r   r   r   normalize_whitespaceG   s   r'    c                 C   s   t |t|| S )u  
    Replace all URLs in ``text`` str with ``replace_with`` str.

    Args:
        text (str): ``Urdu`` text
        replace_with (str): Replace string
    Returns:
        str: Returns a ``str`` object replace url with ``replace_with`` text.
    Examples:
        >>> from urduhack.preprocessing import replace_urls
        >>> text = "20 www.gmail.com  فیصد"
        >>> replace_urls(text)
        '20  فیصد'
    )_URL_REr#   _SHORT_URL_REr    replace_withr   r   r   replace_urlsZ      r-   c                 C      t || S )u  
    Replace all emails in ``text`` str with ``replace_with`` str.

    Args:
        text (str): ``Urdu`` text
        replace_with (str): Replace string
    Returns:
        str: Returns a ``str`` object replace emails with ``replace_with`` text.
    Examples:
        >>> text = "20 gunner@gmail.com  فیصد"
        >>> from urduhack.preprocessing import replace_emails
        >>> replace_emails(text)
    )	_EMAIL_REr#   r+   r   r   r   replace_emailsl   s   r1   c                 C   r/   )u  
    Replace all phone numbers in ``text`` str with ``replace_with`` str.

    Args:
        text (str): ``Urdu`` text
        replace_with (str): Replace string
    Returns:
        str: Returns a ``str`` object replace number_no with ``replace_with`` text.
    Examples:
        >>> from urduhack.preprocessing import replace_numbers
        >>> text = "20  فیصد"
        >>> replace_numbers(text)
        ' فیصد'
    )	_PHONE_REr#   r+   r   r   r   replace_phone_numbers}      r3   c                 C   r/   )ug  
    Replace all numbers in ``text`` str with ``replace_with`` str.

    Args:
        text (str): ``Urdu`` text
        replace_with (str): Replace string
    Returns:
        str: Returns a ``str`` object replace number with ``replace_with`` text.
    Examples:
        >>> from urduhack.preprocessing import replace_phone_numbers
        >>> text = "یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ 555-123-4567 میں ہوا تھا"
        >>> replace_phone_numbers(text)
        'یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ میں ہوا تھا'
    )_NUMBERS_REr#   r+   r   r   r   replace_numbers   r4   r6   c                 C   s6   |du rt  D ]
\}}| ||} q| S t|| S )uZ  
    Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str.

    Args:
        text (str): Raw text
        replace_with (str): if None (default), replace symbols with
            their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP');
            otherwise, pass in a string with which to replace all symbols
            (e.g. "*CURRENCY*")
    Returns:
        str: Returns a ``str`` object containing normalized text.
    Examples:
        >>> from urduhack.preprocessing import replace_currency_symbols
        >>> text = "یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ 2003 میں ہوا 33$ تھا۔"
        >>> replace_currency_symbols(text)
    'یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ 2003 میں ہوا 33USD تھا۔'
    N)
CURRENCIESitemsreplace_CURRENCY_REr#   )r    r,   keyvaluer   r   r   replace_currency_symbols   s
   r=   c                 c   s(    | ]}t t|d r|V  qdS )PN)unicodedatacategorychr
startswith)r   ir   r   r   r      s    returnc                 C   s.   |rt jdt |d| t jdS | tS )u,  
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے

    z[{}]+r(   r   )r   r#   formatr   UNICODE	translatePUNCTUATION_TRANSLATE_UNICODE)r    marksr   r   r   remove_punctuation   s    
rJ   c                 C   s   d dd | D S )u  
    Remove accents from any accented unicode characters in ``text`` str, either by
    transforming them into ascii equivalents or removing them entirely.

    Args:
        text (str): Urdu text
    Returns:
        str
    Examples:
        >>> from urduhack.preprocessing import remove_accents
        >>>text = "دالتِ عظمیٰ درخواست"
        >>> remove_accents(text)
    'دالت عظمی درخواست'
    r(   c                 s   s    | ]
}t |s|V  qd S r   )r?   	combiningr   r   r   r   r      s    z!remove_accents.<locals>.<genexpr>)joinr&   r   r   r   remove_accents   r.   rM   c                 C   s"   d}t dd |D }| |S )z
    Removes ``English`` words and digits from a ``text``

    Args:
         text (str): Urdu text
    Returns:
        str: ``str`` object with english alphabets removed
    >ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890c                 S   s   i | ]}|d qS r   r   )r   r;   r   r   r   
<dictcomp>   s    z,remove_english_alphabets.<locals>.<dictcomp>)str	maketransrG   )r    
characterstabler   r   r   remove_english_alphabets   s   	
rT   )r(   r   )"__doc__sysr?   regexr   r7   compile
IGNORECASErF   r0   r2   r5   rE   rL   r:   r$   r"   r)   r*   rP   r'   r-   r1   r3   r6   r=   dictfromkeysrange
maxunicoderH   rJ   rM   rT   r   r   r   r   <module>   sF   





 


!"	