o
    i>                     @   s  d dl Z d dlmZ g dZeg d Zg ddddd	d
dddddddddddddddddddddd d!d"e jd#e j d$e jd%e j d$d&d'd(d)d*d+d,d-d.d/d0d1d2Zg d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTe jdUe j d$dVdWZ	g dXZ
g dYZe jdZed[d\ Ze jdZe	d]d^ Ze jdZed_d` Ze jdZedadb Ze jje jdce
e jdZeddde Ze jje jdfee jdZedgdh Ze jje jdce
e jdfee jdZedidj Ze jje jdke
e jdle
e jdZedmdn Ze jje jdoee jdpee jdZedqdr ZdS )s    N)BASE_EXCEPTIONS)zhttp://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region&region=top-news&WT.nav=top-news&_r=0zwww.red-stars.comzmailto:foo.bar@baz.com)zmailto:foo-bar@baz-co.comz$mailto:foo-bar@baz-co.com?subject=hizwww.google.com?q=google&http://foo.com/blah_(wikipedia)#cite-1zhttp://foo.com/blah_blahzhttp://BlahBlah.com/Blah_Blahzhttp://foo.com/blah_blah/z%http://www.example.com/wpstyle/?p=364z1https://www.example.com/foo/?bar=baz&inga=42&quuxz'http://userid:password@example.com:8080z(http://userid:password@example.com:8080/zhttp://userid@example.comzhttp://userid@example.com/zhttp://userid@example.com:8080zhttp://userid@example.com:8080/z"http://userid:password@example.comz#http://userid:password@example.com/zhttp://142.42.1.1/zhttp://142.42.1.1:8080/r   z+http://foo.com/blah_(wikipedia)_blah#cite-1u&   http://foo.com/unicode_(✪)_in_parensz'http://foo.com/(something)?after=parensz/http://code.google.com/events/#&product=browserzhttp://j.mpzftp://foo.bar/bazz,http://foo.bar/?q=Test%20URL-encoded%20stuffz2http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.comzhttp://1337.netzhttp://a.b-c.dezhttp://223.255.255.254zhttp://a.b--c.de/z+ssh://login@server.com:12345/repository.gitz&svn+ssh://user@ssh.yourdomain.com/pathz8chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai)marksz3chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjaiz$http://foo.com/blah_blah_(wikipedia)z,http://foo.com/blah_blah_(wikipedia)_(again)zhttp://www.foo.co.ukzhttp://www.foo.co.uk/zhttp://www.foo.co.uk/blah/blahu   http://⌘.wsu   http://⌘.ws/u   http://☺.damowmow.com/u   http://✪df.ws/123u   http://➡.ws/䨹u   http://مثال.إختبارu   http://例子.测试u/   http://उदाहरण.परीक्षाzhttp://zhttp://.z	http://..z
http://../zhttp://?z	http://??z
http://??/zhttp://#z	http://##z
http://##/z)http://foo.bar?q=Spaces should be encodedz//z//az///az///z	http:///azrdar://1234zh://testzhttp:// shouldfail.comz:// should failzhttp://foo.bar/foo(bar)baz quuxzhttp://-error-.invalid/zhttp://a.b-.cozhttp://0.0.0.0zhttp://10.1.1.0zhttp://10.1.1.255zhttp://224.1.1.1zhttp://123.123.123zhttp://3628126748zhttp://.www.foo.bar/zhttp://.www.foo.bar./zhttp://10.1.1.1zNASDAQ:GOOGzhttp://-a.b.cozfoo.comzhttp://1.1.1.1.1zhttp://www.foo.bar./)(">)r   :r   urlc                 C   s   |  |d us	J d S N	url_matchen_tokenizerr	    r   S/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/tokenizer/test_urls.pytest_should_matchy      r   c                 C   s   |  |d u s	J d S r
   r   r   r   r   r   test_should_not_match~   r   r   c                 C   s.   | |}t |dksJ |d j|ksJ d S )N   r   lentext	tokenizerr	   tokensr   r   r   !test_tokenizer_handles_simple_url   s   r   c                 C   sZ   | d| d }t |dksJ |d jdksJ |d j|ks"J |d jdks+J d S )Nr   )   r   r      r   r   r   r   r   *test_tokenizer_handles_simple_surround_url   s
   r   prefixc                 C   D   | || }t |dksJ |d j|ksJ |d j|ks J d S Nr   r   r   r   )r   r    r	   r   r   r   r   #test_tokenizer_handles_prefixed_url      r#   suffixc                 C   r!   r"   r   )r   r	   r%   r   r   r   r   #test_tokenizer_handles_suffixed_url   r$   r&   c                 C   sZ   | || | }t |dksJ |d j|ksJ |d j|ks"J |d j|ks+J d S Nr   r   r   r   r   )r   r    r%   r	   r   r   r   r   #test_tokenizer_handles_surround_url   
   r(   prefix1prefix2c                 C   sZ   | || | }t |dksJ |d j|ksJ |d j|ks"J |d j|ks+J d S r'   r   )r   r*   r+   r	   r   r   r   r   %test_tokenizer_handles_two_prefix_url   r)   r,   suffix1suffix2c                 C   s   | || | }|| t v r,t|dksJ |d j|ksJ |d j|| ks*J d S t|dks4J |d j|ks=J |d j|ksFJ |d j|ksOJ d S )Nr   r   r   r   )r   r   r   )r   r-   r.   r	   r   r   r   r   %test_tokenizer_handles_two_suffix_url   s   r/   )pytestspacy.lang.tokenizer_exceptionsr   
URLS_BASIC	URLS_FULLparammarkxfailURLS_SHOULD_MATCHURLS_SHOULD_NOT_MATCHPREFIXESSUFFIXESparametrizer   r   r   r   slowr#   r&   r(   r,   r/   r   r   r   r   <module>   s   	
#
&'()*+,-./0125	
 !"#$%*



