o
    i1                     @   sp  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% dZ&dZ'G dd de(eZ)e)j*fde)fddZ+G dd deZ,G dd deZ-G dd deZ.dd Z/d ee( d!ee( fd"d#Z0d'd%d&Z1dgZ2dS )(    N)Enum)Path)AnyCallableDictIterableListOptional   )util)ErrorsWarnings)BaseDefaultsLanguage)Scorer)Doc)Examplevalidate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)
STOP_WORDSzinstall spacy-pkuseg with `pip install "spacy-pkuseg>=0.0.27,<0.1.0"` or `conda install -c conda-forge "spacy-pkuseg>=0.0.27,<0.1.0"`z
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char"

[initialize]

[initialize.tokenizer]
pkuseg_model = null
pkuseg_user_dict = "default"
c                   @   s$   e Zd ZdZdZdZedd ZdS )	Segmentercharjiebapkusegc                 C   s   t | j S N)list__members__keys)cls r$   J/home/ubuntu/.local/lib/python3.10/site-packages/spacy/lang/zh/__init__.pyvalues,   s   zSegmenter.valuesN)__name__
__module____qualname__r   r   r   classmethodr&   r$   r$   r$   r%   r   '   s    r   	segmenterc                    s    fdd}|S )Nc                    s   t | j dS )Nr+   )ChineseTokenizervocab)nlpr,   r$   r%   chinese_tokenizer_factory2   s   z;create_chinese_tokenizer.<locals>.chinese_tokenizer_factoryr$   )r+   r0   r$   r,   r%   create_chinese_tokenizer1   s   r1   c                   @   s   e Zd ZejfdedefddZ	d&dddddeeg e	e
 f  d	ee d
ee dee fddZdedefddZd'dee defddZdd Zdeeef fddZi fdeeef ddfddZdd Zd d! Zd"d# Zd$d% ZdS )(r-   r.   r+   c                 C   s   || _ t|tr|jn|| _d | _d | _| jt vr4tj	j
d| jdt dd}t| tj| _| jtjkr@t | _d S d S )NChinese, 'char' (character segmentation)langr+   	supporteddefault)r.   
isinstancer   valuer+   
pkuseg_seg	jieba_segr&   r   W103formatjoinwarningswarnr   r   try_jieba_import)selfr.   r+   warn_msgr$   r$   r%   __init__9   s"   
zChineseTokenizer.__init__Nr8   )r/   pkuseg_modelpkuseg_user_dictget_examplesr/   rF   rG   c                C   s.   | j tjkr|d u r|}t||d| _d S d S )N)rF   rG   )r+   r   r   try_pkuseg_importr;   )rC   rH   r/   rF   rG   r$   r$   r%   
initializeL   s   zChineseTokenizer.initializetextreturnc                 C   s   | j tjkr%tdd | jj|ddD }t||\}}t| j	||dS | j tj
krK| jd u r5ttj| j|}t||\}}t| j	||dS | j tjkrftjjd| j dt dd	}t| t|}t||\}}t| j	||dS )
Nc                 S   s   g | ]}|r|qS r$   r$   ).0xr$   r$   r%   
<listcomp>]   s    z-ChineseTokenizer.__call__.<locals>.<listcomp>Fcut_all)wordsspacesr2   r3   r4   r5   )r+   r   r   r    r<   cutr   get_words_and_spacesr   r.   r   r;   
ValueErrorr   E1000r   r   r=   r>   r?   r&   r@   rA   )rC   rK   rR   rS   rD   r$   r$   r%   __call__[   s*   


zChineseTokenizer.__call__FrR   resetc                 C   s   | j tjkr7|r&zdd l}|d | j_W n ty%   dt }t|d w |D ]}| jj	|
 d q(d S tjjd| j d}t| d S )Nr   zEspacy_pkuseg not installed: unable to reset pkuseg user dict. Please  r   )targetcurrent)r+   r   r   spacy_pkusegPreprocesserr;   preprocesserImportError_PKUSEG_INSTALL_MSGinsertstripr   W104r>   r@   rA   )rC   rR   rY   r]   msgwordrD   r$   r$   r%   pkuseg_update_user_dictv   s"   
z(ChineseTokenizer.pkuseg_update_user_dictc                 C   s   t |d t|S )NChineseTokenizer.score)r   r   score_tokenization)rC   examplesr$   r$   r%   score   s   

rh   c                 C   s
   d| j iS Nr+   r,   rC   r$   r$   r%   _get_config   s   zChineseTokenizer._get_configconfigc                 C   s   | dtj| _d S rl   )getr   r   r+   )rC   ro   r$   r$   r%   _set_config   s   zChineseTokenizer._set_configc              	      s<  d dd j rt P}j j| j j| t|}t|d d}|  W d    n1 s6w   Y  t|d d}| W d    n1 sQw   Y  W d    n1 s`w   Y  t	j j
jj jjttj jjttj jjffdd fddfddfd	dd
}t|g S )N    features.msgpackrbweights.npzc                      s   t   S r   )srsly
json_dumpsrn   r$   rm   r$   r%   <lambda>   s    z+ChineseTokenizer.to_bytes.<locals>.<lambda>c                          S r   r$   r$   )pkuseg_features_br$   r%   rx          c                      ry   r   r$   r$   )pkuseg_weights_br$   r%   rx      r{   c                      s
   t  S r   )rv   msgpack_dumpsr$   )pkuseg_processors_datar$   r%   rx      s   
 cfgpkuseg_featurespkuseg_weightspkuseg_processors)r;   tempfileTemporaryDirectoryfeature_extractorsavemodelr   openread_get_pkuseg_trie_datar_   triepostprocesser
do_processsortedr    common_wordsother_wordsr   to_bytes)rC   kwargstempdirfilehserializersr$   )rz   r~   r|   rC   r%   r      s6   


	



zChineseTokenizer.to_bytesc              	      s  ddd d  fdd} fdd} fdd}fd	d
|||d}t ||g   d rˈ d rt e}t|}t|d d}| d  W d    n1 sTw   Y  t|d d}| d  W d    n1 srw   Y  zdd l}	W n ty   tdt	 d w |	
t|_W d    n1 sw   Y   d rˈ d }
|
\}}}}|	|j_|jj_t|jj_t|jj_S )Nrr   )
features_b	weights_bprocessors_datac                       |  d< d S )Nr   r$   bpkuseg_datar$   r%   deserialize_pkuseg_features      z@ChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_featuresc                    r   )Nr   r$   r   r   r$   r%   deserialize_pkuseg_weights   r   z?ChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_weightsc                    s   t |  d< d S )Nr   )rv   msgpack_loadsr   r   r$   r%   deserialize_pkuseg_processors   s   zBChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_processorsc                         t| S r   )rq   rv   
json_loadsr   rm   r$   r%   rx          z-ChineseTokenizer.from_bytes.<locals>.<lambda>r   r   r   rs   wbru   r   /spacy-pkuseg not installed. To use this model, r   )r   
from_bytesr   r   r   r   writer]   r`   ra   r   strr;   r^   r_   r   r   setr   r   )rC   datar   r   r   r   deserializersr   r   r]   r   	user_dictr   r   r   r$   )r   rC   r%   r      sP   


zChineseTokenizer.from_bytesc                    sT   t |}fdd fddfdd fddfddd	}t ||g S )
Nc                    s>    j r|  s| jdd  j j|   j j|  d S d S )NT)parents)r;   existsmkdirr   r   r   )pathrm   r$   r%   save_pkuseg_model   s   z3ChineseTokenizer.to_disk.<locals>.save_pkuseg_modelc                    sR    j r't j jj j jjtt j jjtt j jj	f}t
| | d S d S r   )r;   r   r_   r   r   r   r   r    r   r   rv   write_msgpack)r   r   rm   r$   r%   save_pkuseg_processors   s   z8ChineseTokenizer.to_disk.<locals>.save_pkuseg_processorsc                    s   t |   S r   )rv   
write_jsonrn   prm   r$   r%   rx      r   z*ChineseTokenizer.to_disk.<locals>.<lambda>c                        | S r   r$   r   )r   r$   r%   rx          c                    r   r   r$   r   )r   r$   r%   rx      r   r   rF   r   )r   ensure_pathto_diskrC   r   r   r   r$   )r   r   rC   r%   r      s   



zChineseTokenizer.to_diskc                    sX   t |}fdd fddfdd fddfddd	}t ||g  d S )
Nc                    sV   zdd l }W n ty    jtjkrtdt d Y nw |  r)||  _d S d S )Nr   r   )r]   r`   r+   r   r   ra   r   r;   )r   r]   rm   r$   r%   load_pkuseg_model   s    z5ChineseTokenizer.from_disk.<locals>.load_pkuseg_modelc                    s   zdd l }W n ty    jtjkrt jd Y nw  jtjkrIt| }|\}}}}|| j	_
| j	j_t| j	j_t| j	j_d S d S )Nr   )r]   r`   r+   r   r   _pkuseg_install_msgrv   read_msgpackr^   r;   r_   r   r   r   r   r   )r   r]   r   r   r   r   r   rm   r$   r%   load_pkuseg_processors  s   

z:ChineseTokenizer.from_disk.<locals>.load_pkuseg_processorsc                    r   r   )rq   rv   	read_jsonr   rm   r$   r%   rx     r   z,ChineseTokenizer.from_disk.<locals>.<lambda>c                    r   r   r$   r   )r   r$   r%   rx     r   c                    r   r   r$   r   )r   r$   r%   rx     r   r   )r   r   	from_diskr   r$   )r   r   rC   r%   r      s   



zChineseTokenizer.from_diskr   )F)r'   r(   r)   r   r   r   rE   r	   r   r   r   r   r   rJ   r   rX   r   boolrg   rk   r   r   rn   rq   r   r   r   r   r$   r$   r$   r%   r-   8   s2    
-r-   c                   @   s(   e Zd ZeeZeZeZ	ddddZ
dS )ChineseDefaultsltrF)	directionhas_casehas_lettersN)r'   r(   r)   r   DEFAULT_CONFIGro   r   lex_attr_gettersr   
stop_wordswriting_systemr$   r$   r$   r%   r     s
    r   c                   @   s   e Zd ZdZeZdS )r2   zhN)r'   r(   r)   r6   r   Defaultsr$   r$   r$   r%   r2   $  s    r2   c                  C   s>   zdd l } t| jddd | W S  ty   d}t|d w )Nr   u   作为FrP   znJieba not installed. To use jieba, install it with `pip  install jieba` or from https://github.com/fxsjy/jieba)r   r    rT   r`   )r   re   r$   r$   r%   rB   )  s   
rB   rF   rG   c                 C   sh   zdd l }W n ty   dt }t|d w z|j| |dW S  ty3   dt| p+d }t|d w )Nr   z+spacy-pkuseg not installed. To use pkuseg, )r   z"Unable to load pkuseg model from: rZ   )r]   r`   ra   r   FileNotFoundErrorr   )rF   rG   r]   re   r$   r$   r%   rI   9  s   

rI   rZ   c                 C   sJ   g }t | j D ]\}}|t|||  q	| jr#||| jf |S r   )r   childrenitemsextendr   iswordappendusertag)noder   r   c
child_noder$   r$   r%   r   G  s   r   )rZ   )3r   r@   enumr   pathlibr   typingr   r   r   r   r   r	   rv   rZ   r   errorsr   r   languager   r   scorerr   tokensr   trainingr   r   r   r   r   r.   r   	lex_attrsr   r   r   ra   r   r   r   r   r1   r-   r   r2   rB   rI   r   __all__r$   r$   r$   r%   <module>   s8     
 f

	