o
    iL                     @   s   d Z ddlZddlZddlmZ g dZdZdZdZdd	gZ	d
Z
dZg dZg Zee ee eeg eeg ee	 eeg eeg ee
g dde d Zejdk rhedZeeejejB ZdddZdS )u   This code inspired from underthesea package, edited by enamoria.

What changed from the original version: PSG. consider to be abbreviation, but not with the dot.
Just add a boundary for word at r"[A-ZĐ]+\s"
    N)Text)z==>z->z\.\.\.z>>z=\)\)z\d+([\.,_]\d+)+z.[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+u               # Capture 1: entire matched URL
  (?:
  (ftp|http)s?:               # URL protocol and colon
    (?:
      /{1,3}            # 1-3 slashes
      |                 #   or
      [a-z0-9%]         # Single letter or digit or '%'
                        # (Trying not to match e.g. "URI::Escape")
    )
    |                   #   or
                        # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                                  # One or more:
    [^\s()<>{}\[\]]+                   # Run of non-space, non-()<>{}[]
    |                                  #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                        # balanced parens, non-recursive: (...)
  )+
  (?:                                  # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                        # balanced parens, non-recursive: (...)
    |                                  #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]     # not a space or one of these punct chars
  )
  |                        # OR, the following to match naked domains:
  (?:
    (?<!@)                 # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                  # not succeeded by a @,
                           # avoid matching "foo.na" in "foo.na@example.com"
  )
z\d{1,2}\/\d{1,2}(\/\d+)?z\d{1,2}-\d{1,2}(-\d+)?z\w+z[^\w\s])u
   [A-ZĐ]+\szTp\.zMr\.zMrs\.zMs\.zDr\.zThS\.(|))   r   zutf-8c                 C   sH   t |  } | dd} tt| }dd |D }|dkr"d|S |S )zeTokenize text for word segmentation.

    :param text: raw text input
    :return: tokenize text
    	 c                 S   s   g | ]}|d  qS )r    ).0tokenr	   r	   U/home/ubuntu/.local/lib/python3.10/site-packages/vietnamese_cleaner/regex_tokenize.py
<listcomp>c   s    ztokenize.<locals>.<listcomp>text)r   lowerreplacerefindallpatternsjoin)r   formattokensr	   r	   r   tokenizeZ   s   
r   )N)__doc__r   sys$underthesea.feature_engineering.textr   specialsdigitemailurlsdatetimewordnon_wordabbreviationsr   extendr   version_infodecodecompileVERBOSEUNICODEr   r	   r	   r	   r   <module>   s8   +




