o
    i7                     @   s  d dl Z d dlZd dlmZ ddlmZ ddlmZ e dj	Z
edd	Zd
edefddZd
edefddZd
edefddZd
edefddZd
edefddZd
edefddZd
edefddZd
edefddZd
edefddZd
edefddZd
edefd d!Zd"edefd#d$Zd"edefd%d&Zd"edefd'd(Zd"edefd)d*Zd"edefd+d,Zd"edefd-d.Z d"edefd/d0Z!d"edefd1d2Z"d"edefd3d4Z#e fd"ed5ee defd6d7Z$d<d
ed9edefd:d;Z%i ej&eej'eej(eej)eej*eej+eej,e ej-e!ej.e"ej/e#ej0e$ej1eej2eej3eej4eej5eej6eej7eej8eej9eej:eej;eiZ<dS )=    N)Set   )attrs   )	URL_MATCHz0([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)aY  com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xyz|icu|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|textreturnc                 C   s$   | D ]}t |ds dS qdS )NPFT)unicodedatacategory
startswithr   char r   H/home/ubuntu/.local/lib/python3.10/site-packages/spacy/lang/lex_attrs.pyis_punct   s
   r   c                 C   s    | D ]}t |dkr dS qdS )N   FT)ordr   r   r   r   is_ascii    s
   r   c                 C   sj   |  dr| dd  } | dddd} |  rdS | ddkr3| d\}}| r3| r3dS dS )	N)+-   ±~r   , .T/F)r   replaceisdigitcountsplit)r   numdenomr   r   r   like_num'   s   
r$   c                 C      d}| |v S )N)()[]{}<>r   )r   bracketsr   r   r   
is_bracket5      r/   c                 C   r%   )N)"'`   «   »   ‘   ’   ‚   ‛   “   ”   „   ‟   ‹   ›   ❮   ❯''``r   )r   quotesr   r   r   is_quote:      rE   c                 C   r%   )N)r&   r(   r*   r,   r1   r2   r4   r6   r8   r9   r:   r<   r=   r>   r@   rC   r   )r   
left_punctr   r   r   is_left_punctA   rF   rH   c                 C   r%   )N)r'   r)   r+   r-   r1   r2   r5   r7   r;   r?   rA   rB   r   )r   right_punctr   r   r   is_right_punctH   r0   rJ   c                 C   s"   | D ]}t |dkr dS qdS )NScFT)r   r   r   r   r   r   is_currencyM   s
   rL   c                 C   s   t t| S N)bool_like_email)r   r   r   r   
like_emailU      rP   c                 C   s   |  ds
|  drdS |  drt| dkrdS | d dks%| d dkr'd	S d
| v r-d	S tt| D ]
}| | dkr= nq3d	S | ddd ddd }|drUdS | r_|tv r_dS t| redS d	S )Nzhttp://zhttps://Tzwww.   r   r   F@r   :r   )	r   lenrangersplitr!   endswithisalpha_tldsr   )r   itldr   r   r   like_urlY   s*   
r^   c                 C   s   t | dkrdS g }d}d}d}| D ]/}| r"| rd}nd}n	| r)d}n|}||kr4|d7 }nd}|}|d	k rA|| qd|S )
Nd   LONGr   r   Xxdr      )rV   rZ   isupperr   appendjoin)r   shapelast
shape_charseqr   r   r   r   
word_shapeu   s,   


rl   stringc                 C      |   S rM   lowerrm   r   r   r   rp         rp   c                 C   s   | d S )Nr   r   rq   r   r   r   prefix   rr   rs   c                 C   s   | dd  S )Nr   rq   r   r   r   suffix   rQ   ru   c                 C   rn   rM   )rZ   rq   r   r   r   is_alpha   rr   rv   c                 C   rn   rM   )r   rq   r   r   r   is_digit   rr   rw   c                 C   rn   rM   )islowerrq   r   r   r   is_lower   rr   ry   c                 C   rn   rM   )isspacerq   r   r   r   is_space   rr   r{   c                 C   rn   rM   )istitlerq   r   r   r   is_title   rr   r}   c                 C   rn   rM   )re   rq   r   r   r   is_upper   rr   r~   stopsc                 C   s   |   |v S rM   ro   )rm   r   r   r   r   is_stop   rQ   r   r   langc                 C   s   |S rM   r   )r   r   r   r   r   get_lang   s   r   )r   )=rer   typingr   r   r   tokenizer_exceptionsr   compilematchrO   setr!   r[   strrN   r   r   r$   r/   rE   rH   rJ   rL   rP   r^   rl   rp   rs   ru   rv   rw   ry   r{   r}   r~   r   r   LOWERNORMPREFIXSUFFIXIS_ALPHAIS_DIGITIS_LOWERIS_SPACEIS_TITLEIS_UPPERIS_STOP
LIKE_EMAILLIKE_NUMIS_PUNCTIS_ASCIISHAPE
IS_BRACKETIS_QUOTEIS_LEFT_PUNCTIS_RIGHT_PUNCTIS_CURRENCYLIKE_URL	LEX_ATTRSr   r   r   r   <module>   s     	

