o
    i                     @   s   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZddl	m
Z
 ddlmZmZ ddlmZ ddl
mZmZmZ dd	lmZ d
dlmZ d
dlmZ dZddefddZG dd deZG dd deZG dd deZdgZdS )    N)Path)AnyDictUnion   )util)BaseDefaultsLanguage)Doc)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)
STOP_WORDSzU
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true
Tuse_pyvic                    s    fdd}|S )Nc                    s   t | j dS )Nr   )VietnameseTokenizervocab)nlpr    J/home/ubuntu/.local/lib/python3.10/site-packages/spacy/lang/vi/__init__.pyvietnamese_tokenizer_factory   s   zAcreate_vietnamese_tokenizer.<locals>.vietnamese_tokenizer_factoryr   )r   r   r   r   r   create_vietnamese_tokenizer   s   r   c                   @   s   e Zd Zd dedefddZdd Zded	efd
dZ	dd Z
dd Zd	eeef fddZi fdeeef d	dfddZd	efddZded	d fddZdeeef d	dfddZdeeef d	d fddZdS )!r   Fr   r   c                 C   sL   || _ || _| jr$zddlm} || _W d S  ty#   d}t|d w d S )Nr   )ViTokenizerz`Pyvi not installed. Either set use_pyvi = False, or install it https://pypi.python.org/pypi/pyvi)r   r   pyvir   ImportError)selfr   r   r   msgr   r   r   __init__!   s   
zVietnameseTokenizer.__init__c                 C   s   t | j| jffS N)r   r   r   r   r   r   r   
__reduce__0   s   zVietnameseTokenizer.__reduce__textreturnc                 C   sT   | j r| |}t||\}}t| j||dS t| |\}}t| j||dS )N)wordsspaces)r   pyvi_tokenizer   get_words_and_spacesr
   r   split)r   r$   r&   r'   r   r   r   __call__3   s   
zVietnameseTokenizer.__call__c                 C   s   g d}d}d}d}d}d}g d}g }	|	 | |	 | |	 ||g |	 |||g dd	|	 d
 }	t|	|tj}
dd |
D S )zQModified from pyvi to preserve whitespace and skip unicode
        normalization.)z==>z->z\.\.\.z>>z\d+([\.,_]\d+)+z2([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)z\w+://[^\s]+z\w+z[^\w\s])u
   [A-ZĐ]+\.zTp\.zMr\.zMrs\.zMs\.zDr\.zThS\.z(\s+||)c                 S   s   g | ]}|d  qS )r   r   ).0tokenr   r   r   
<listcomp>[   s    z?VietnameseTokenizer.pyvi_sylabelize_with_ws.<locals>.<listcomp>)extendjoinrefindallUNICODE)r   r$   specialsdigitemailwebwordnon_wordabbreviationspatternstokensr   r   r   pyvi_sylabelize_with_ws?   s   


z+VietnameseTokenizer.pyvi_sylabelize_with_wsc           	      C   sr  t |dkrg S | r|gS | |}g }g }t|D ]$\}}| s@|| ||dks7||d   s9dn||d   q| jjj| jj|dg}|d }g }t	dt |d D ]R}|d | dkr|| t
jvr||d  t
jvr|| d  s||d  d  s|| d  r||d  d  r|||  ||  }q_|| || }q_|| |S )z3Modified from pyvi to preserve text and whitespace.r   r    FI_W)lenisspacer?   	enumerateappendr   modelpredictsent2featuresrangestringpunctuationisdigitistitle)	r   r$   segsr&   preceding_wsir/   labelsr>   r   r   r   r(   ]   sH   

&



z!VietnameseTokenizer.pyvi_tokenizec                 C   s
   d| j iS )Nr   r   r"   r   r   r   _get_config   s   
zVietnameseTokenizer._get_configconfigNc                 C   s   | dd| _d S )Nr   F)getr   )r   rS   r   r   r   _set_config   s   zVietnameseTokenizer._set_configc                    s   d fddi}t |g S )Ncfgc                      s   t   S r!   )srsly
json_dumpsrR   r   r"   r   r   <lambda>   s    z.VietnameseTokenizer.to_bytes.<locals>.<lambda>)r   to_bytes)r   kwargsserializersr   r"   r   rZ      s   zVietnameseTokenizer.to_bytesdatac                    s"   d fddi}t ||g   S )NrV   c                         t| S r!   )rU   rW   
json_loads)br"   r   r   rY          z0VietnameseTokenizer.from_bytes.<locals>.<lambda>)r   
from_bytes)r   r]   r[   deserializersr   r"   r   rb      s   zVietnameseTokenizer.from_bytespathc                    s,   t |}d fddi}t ||g  d S )NrV   c                    s   t |   S r!   )rW   
write_jsonrR   pr"   r   r   rY      ra   z-VietnameseTokenizer.to_disk.<locals>.<lambda>)r   ensure_pathto_diskr   rd   r[   r\   r   r"   r   ri      s   
zVietnameseTokenizer.to_diskc                    s,   t |}d fddi}t ||g   S )NrV   c                    r^   r!   )rU   rW   	read_jsonrf   r"   r   r   rY      ra   z/VietnameseTokenizer.from_disk.<locals>.<lambda>)r   rh   	from_diskrj   r   r"   r   rl      s   
zVietnameseTokenizer.from_disk)F)__name__
__module____qualname__r   boolr    r#   strr
   r+   r?   r(   r   r   rR   rU   bytesrZ   rb   r   r   ri   rl   r   r   r   r   r       s    $r   c                   @   s   e Zd ZeeZeZeZ	dS )VietnameseDefaultsN)
rm   rn   ro   r   DEFAULT_CONFIGrS   r   lex_attr_gettersr   
stop_wordsr   r   r   r   rs      s    rs   c                   @   s   e Zd ZdZeZdS )
VietnameseviN)rm   rn   ro   langrs   Defaultsr   r   r   r   rw      s    rw   )T) r3   rJ   pathlibr   typingr   r   r   rW   r@   r   languager   r	   r>   r
   r   r   r   r   r   	lex_attrsr   rv   r   rt   rp   r   r   rs   rw   __all__r   r   r   r   <module>   s$    	|
