o
    i                     @   s   d dl Z d dlmZ d dlmZmZ dZed  g dfgZed  g dfgZe j	
dedd	 Ze j	
d
edd Ze j	
d
edd Zdd Zdd Zdd Zdd ZdS )    N)ConfigValidationError)Chinese_get_pkuseg_trie_data)ul   作为语言而言，为世界使用人数最多的语言，目前世界有五分之一人口做为母语。)   作为   语言   而言   ，   为   世界   使用u   人u	   数最多   的r   r      目前r
      有   五分之一   人口u   做r	      母语   。)r   r   r   r   r	   r
   r   u   人数u   最多r   r   r   r   r
   r   r   r   u   做为r   r   textc                 C   s&   dd | |D }|t |ksJ d S )Nc                 S      g | ]}|j qS  r   .0tokenr   r   V/home/ubuntu/.local/lib/python3.10/site-packages/spacy/tests/lang/zh/test_tokenizer.py
<listcomp>       z*test_zh_tokenizer_char.<locals>.<listcomp>)list)zh_tokenizer_charr   tokensr   r   r   test_zh_tokenizer_char   s   r    ztext,expected_tokensc                 C   "   dd | |D }||ksJ d S )Nc                 S   r   r   r   r   r   r   r   r      r   z+test_zh_tokenizer_jieba.<locals>.<listcomp>r   )zh_tokenizer_jiebar   expected_tokensr   r   r   r   test_zh_tokenizer_jieba      r$   c                 C   r!   )Nc                 S   r   r   r   r   r   r   r   r   %   r   z,test_zh_tokenizer_pkuseg.<locals>.<listcomp>r   )zh_tokenizer_pkusegr   r#   r   r   r   r   test_zh_tokenizer_pkuseg#   r%   r'   c                 C   s   t | jjj}| dg t | jjj}t|t|d ks J | jg dd t | jjj}t|dks6J tt |dg W d    d S 1 sMw   Y  d S )Nnonsense_asdf   T)resetr   )	r   
pkuseg_segpreprocessertriepkuseg_update_user_dictlenpytestwarnsUserWarning)r&   r   	user_dictupdated_user_dictreset_user_dictr   r   r   "test_zh_tokenizer_pkuseg_user_dict)   s   "r6   c                 C   s   | d}|d j dksJ d S )NzI   like cheese.r)   z  )orth_)r   r   r   r   r   test_zh_extra_spaces=   s   r8   c                  C   sJ   ddddiii} t t t|  W d    d S 1 sw   Y  d S )Nnlp	tokenizer	segmenterunk)r0   raisesr   r   from_config)configr   r   r   test_zh_unsupported_segmenterC   s   "r@   c                  C   sZ   ddddiii} t | }d|j_tt |d W d    d S 1 s&w   Y  d S )Nr9   r:   r;   charpkusegtest)r   r>   r:   r;   r0   r=   
ValueError)r?   r9   r   r   r   test_zh_uninitialized_pkusegI   s   

"rE   )r0   	thinc.apir   spacy.lang.zhr   r   TEXTSJIEBA_TOKENIZER_TESTSPKUSEG_TOKENIZER_TESTSmarkparametrizer    r$   r'   r6   r8   r@   rE   r   r   r   r   <module>   s,    	


