o
    i                     @   sn   d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 ej
dd Zdd Zd	d
 Zdd Zdd ZdS )    N)English)	Tokenizer)compile_infix_regexcompile_prefix_regexcompile_suffix_regexc                 C   sR   t tjj}ttjj}g d}t|}td}t	| tjj
|j|j|j|jdS )N)z\.\.\.+z(?<=[0-9])-(?=[0-9])z[0-9]+(,[0-9]+)+u   [\[\]!&:,()\*—–\/-]a-b)token_match)r   r   Defaultsprefixesr   suffixesr   recompiler   tokenizer_exceptionssearchfinditermatch)en_vocab	prefix_re	suffix_recustom_infixesinfix_retoken_match_re r   a/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/lang/en/test_customized_tokenizer.pycustom_en_tokenizer
   s   
r   c                 C   sP   d}dd | |D }|g dksJ d}dd | |D }|g dks&J d S )Nz\The 8 and 10-county definitions are not used for the greater Southern California Megaregion.c                 S      g | ]}|j qS r   text.0wordr   r   r   
<listcomp>"       z@test_en_customized_tokenizer_handles_infixes.<locals>.<listcomp>)The8and10-countydefinitionsarenotusedforthegreaterSouthern
California
Megaregion.z]The 8- and 10-county definitions are not used for the greater Southern California Megaregion.c                 S   r   r   r   r   r   r   r   r!   8   r"   )r#   r$   r'   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r   r   sentencecontextr   r   r   ,test_en_customized_tokenizer_handles_infixes    s   r7   c                 C   *   d}dd | |D }|g dksJ d S )Nz\The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion.c                 S   r   r   r   r   r   r   r   r!   Q   r"   zDtest_en_customized_tokenizer_handles_token_match.<locals>.<listcomp>)r#   r$   r%   r&   r'   r(   r)   r   r+   r,   r-   r.   r/   r0   r1   r2   r3   r   r4   r   r   r   0test_en_customized_tokenizer_handles_token_matchO      r9   c                 C   r8   )N_The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)c                 S   r   r   r   r   r   r   r   r!   i   r"   z>test_en_customized_tokenizer_handles_rules.<locals>.<listcomp>)r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   :)r   r4   r   r   r   *test_en_customized_tokenizer_handles_rulesg   r:   r=   c                 C   s<   d}| j }|d= || _ dd | |D }|g dksJ d S )Nr;   r<   c                 S   r   r   r   r   r   r   r   r!      r"   zGtest_en_customized_tokenizer_handles_rules_property.<locals>.<listcomp>)r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   :))rules)r   r5   r@   r6   r   r   r   3test_en_customized_tokenizer_handles_rules_property   s   rA   )r   pytestspacy.lang.enr   spacy.tokenizerr   
spacy.utilr   r   r   fixturer   r7   r9   r=   rA   r   r   r   r   <module>   s    
/