o
    i                     @   s   d dl mZmZmZ ddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ dZdd ZG dd deZ G dd deZ!G dd deZ"dddZ#dd Z$dgZ%dS )    )AnyDictIterator   )BaseDefaultsLanguage)Scorer)POSX)Doc)validate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)TOKENIZER_INFIXES)
STOP_WORDS)TAG_MAPzA
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
c                  C   s   dd } | S )Nc                 S   s
   t | jS NKoreanTokenizervocab)nlp r   J/home/ubuntu/.local/lib/python3.10/site-packages/spacy/lang/ko/__init__.pykorean_tokenizer_factory   s   
z2create_tokenizer.<locals>.korean_tokenizer_factoryr   )r   r   r   r   create_tokenizer   s   r   c                   @   sf   e Zd ZdefddZedd Zdd Zded	e	fd
dZ
ded	eeeef  fddZdd ZdS )r   r   c                 C   s   || _ t | _d | _d S r   )r   try_mecab_import_mecab_mecab_tokenizer)selfr   r   r   r   __init__   s   
zKoreanTokenizer.__init__c                 C   s   | j d u r| d| _ | j S )Nz-F%f[0],%f[7])r!   r    r"   r   r   r   mecab_tokenizer$   s   
zKoreanTokenizer.mecab_tokenizerc                 C   s   t | jffS r   r   r$   r   r   r   
__reduce__0   s   zKoreanTokenizer.__reduce__textreturnc           
      C   s   t | |}dd |D }t| j|t t||d}t||D ]'\}}|d d\}}}	||_|jtv r?t|j t	 |_
nt|_
|d |_q dd |D |jd< |S )	Nc                 S      g | ]}|d  qS )surfacer   .0dtr   r   r   
<listcomp>5       z,KoreanTokenizer.__call__.<locals>.<listcomp>)wordsspacestag+lemmac                 S   r)   )r2   r   r+   r   r   r   r.   ?   r/   	full_tags)listdetailed_tokensr   r   check_spaceszip	partitiontag_r   r	   posr
   lemma_	user_data)
r"   r'   dtokenssurfacesdoctokendtoken	first_tagsep	eomi_tagsr   r   r   __call__3   s   
zKoreanTokenizer.__call__c           
      c   sp    | j j|ddD ],}| r d S |j}|j}|d\}}}|d\}}}	|dkr.|}|||dV  q	d S )NT)as_nodes,/*)r*   r4   r2   )r%   parseis_eosr*   featurer:   )
r"   r'   noder*   rN   r2   _exprr4   	remainderr   r   r   r7   B   s   zKoreanTokenizer.detailed_tokensc                 C   s   t |d t|S )NKoreanTokenizer.score)r   r   score_tokenization)r"   examplesr   r   r   scoreP   s   

rS   N)__name__
__module____qualname__r   r#   propertyr%   r&   strr   rG   r   r   r   r7   rV   r   r   r   r   r      s    
r   c                   @   s,   e Zd ZeeZeZeZ	ddddZ
eZdS )KoreanDefaultsltrF)	directionhas_casehas_lettersN)rW   rX   rY   r   DEFAULT_CONFIGconfigr   lex_attr_gettersr   
stop_wordswriting_systemr   infixesr   r   r   r   r\   U   s    r\   c                   @   s   e Zd ZdZeZdS )KoreankoN)rW   rX   rY   langr\   Defaultsr   r   r   r   rg   ]   s    rg   r(   Nc                  C   s,   z	ddl m}  | W S  ty   tdd w )Nr   MeCabzThe Korean tokenizer ("spacy.ko.KoreanTokenizer") requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), and [natto-py](https://github.com/buruzaemon/natto-py))nattorl   ImportErrorrk   r   r   r   r   b   s   r   c                 c   sX    d}d}|D ]}|  ||}|dkr||kV  |t| }|}q|dkr*dV  d S d S )Nr   F)findlen)r'   tokensprev_endstartrB   idxr   r   r   r8   p   s   

r8   )r(   N)&typingr   r   r   languager   r   scorerr   symbolsr	   r
   rr   r   trainingr   utilr   r   r   r   r   	lex_attrsr   punctuationr   rd   r   tag_mapr   ra   r   r   r\   rg   r   r8   __all__r   r   r   r   <module>   s(    7

