o
    i2                     @   sv  d dl Z e jddd Ze jddd Ze jde jd	d
dgdd Ze jde jdddgdd Ze jde jdddgdd Ze jde jdg ddd Z	e jde jdg dd d! Z
e jd"e jdd#d#gd$d% Ze jd"e jdd&d'gd(d) Ze jd*e jdd+d,gd-d. Ze jd/e jdd0gd1d2 Ze jd3e jdd4gd5d6 Ze jd7e jjd8d9 Ze jdd:d;ge jd<d=d> Ze jd?d@dA Ze jdBdCdD Ze jdEdFdG Ze jdHg dIe jdJdKdL Ze jdMe jddNdOgdPdQ ZdS )R    Ni_  c                 C   sD   | d}|d j dksJ t|d dksJ |d j dks J d S )Nz   This is a cat.r         )idxlenen_tokenizerdoc r	   V/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/lang/en/test_tokenizer.pytest_issue351   s   r   ih  c                 C   s   | d}t |dksJ dS )z!Test tokenization of big ellipsisz$45...............Asking   Nr   r   tokensr	   r	   r
   test_issue360      r   i  ztext,number)7am7)z11p.m.11c                 C   s.   | |}t |dksJ |d j|ksJ dS )z`Test that times like "7am" are tokenized correctly and that numbers are
    converted to string.r   r   Nr   text)r   r   numberr   r	   r	   r
   test_issue736      r   i  r   z3/4/2012z
01/12/1900c                 C      | |}t |dksJ dS )zTest that dates are not split and kept as one token. This behaviour is
    currently inconsistent, since dates separated by hyphens are still split.
    This will be hard to prevent without causing clashes with numeric ranges.r   Nr   r   r   r   r	   r	   r
   test_issue740   s   r   i  zWe were scaredzWe Were Scaredc                 C   s2   | |}t |dksJ |d j dksJ dS )zqTest that 'were' and 'Were' are excluded from the contractions
    generated by the English tokenizer exceptions.r   r   wereN)r   r   lowerr   r	   r	   r
   test_issue744'   s   r   i  ztext,is_num))oneT)tenT)	tenelevenFc                 C   s   | |}|d j |ksJ d S )Nr   )like_num)r   r   is_numr   r	   r	   r
   test_issue7591   s   r%   i  )ShellshellShedshedc                 C   s.   | |}t |dksJ |d j|ksJ dS )zsTest that 'Shell' and 'shell' are excluded from the contractions
    generated by the English tokenizer exceptions.r   r   Nr   r   r	   r	   r
   test_issue775:   r   r*   i  zThis is a string c                 C   (   | |}d dd |D |ksJ dS )zGTest for Issue #792: Trailing whitespace is removed after tokenization. c                 S      g | ]}|j qS r	   text_with_ws.0tokenr	   r	   r
   
<listcomp>I       z!test_issue792.<locals>.<listcomp>Njoinr   r   r   r	   r	   r
   test_issue792D       r8   zThis is a stringzThis is a string
c                 C   r+   )z6Test base case for Issue #792: Non-trailing whitespacer,   c                 S   r-   r	   r.   r0   r	   r	   r
   r3   Q   r4   z)test_control_issue792.<locals>.<listcomp>Nr5   r7   r	   r	   r
   test_control_issue792L   r9   r:   i[  zaaabbb@ccc.com
Thank you!zaaabbb@ccc.com 
Thank you!c                 C   s   | |}|j |ksJ dS )z5Test that no extra space is added in doc.text method.N)r   r7   r	   r	   r
   test_issue859T   s   r;   iv  zDatum:2014-06-02
Dokument:76467c                 C   sF   | |}|D ]}t |jt |jksJ ||j |jd ks J qdS )zLTest that token.idx matches the original text index for texts with newlines.r   N)r   r   r/   r   )r   r   r   r2   r	   r	   r
   test_issue886^   s
   r<   i{  z	want/needc                 C   s.   | |}t |dksJ |d jdksJ dS )z(Test that / infixes are split correctly.r   r   /Nr   r   r	   r	   r
   test_issue891h   s   r>   i  c                 C   sL   t d dD ]}d}tddD ]
}||t| 7 }q| |}|s#J qdS )zTest that spaCy doesn't hang on many punctuation characters.
    If this test hangs, check (new) regular expressions for conflicting greedy operators
    pytest_timeout)	.,'":?!;-0r   d   N)pytestimportorskiprangestr)r   punctstringir   r	   r	   r
   test_issue957q   s   

rR   ztest@example.comzjohn.doe@example.co.uki  c                 C   s*   | |}t |dksJ |d jrJ dS )z6Test that doc doesn't identify email-addresses as URLsr   r   N)r   like_urlr7   r	   r	   r
   test_issue1698   s   rT   i  c                 C   s   | d}t |dksJ dS )zDTest that "would've" is handled by the English tokenizer exceptions.zwould'ver   Nr   r   r	   r	   r
   test_issue1758   r   rU   i  c                 C   s0   | d}|d j dkr|d jdksJ dS dS )zyTest that spaces don't receive a POS but no TAG. This is the root cause
    of the serialization issue reported in #1773.
r   SPACEr,   N)pos_tag_r   r	   r	   r
   test_issue1773   s
   rZ   i  c                 C   sR   | d}t |dksJ |d jdksJ |d jdksJ |d jdks'J dS )	z2Test that hyphens are split correctly as prefixes.uA   —Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.   r   u   —   u   –	   Nr   )es_tokenizerr   r	   r	   r
   test_issue3277   s
   r_   word)zdon'tu   don’tzI'du   I’di  c                 C   s   | |d }|j sJ d S )Nr   )is_stop)r   r`   tokr	   r	   r
   test_issue3521   s   rc   i)  thesesthisrec                 C   r   )zuTest that 'theses' and 'thisre' are excluded from the contractions
    generated by the English tokenizer exceptions.r   Nr   r   r	   r	   r
   test_issue10699   s   rf   )rK   markissuer   r   parametrizer   r   r   r%   r*   r8   r:   r;   r<   r>   slowrR   rT   rU   rZ   r_   rc   rf   r	   r	   r	   r
   <module>   sr    





















	

