o
    i$1                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z! ddlm"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 dZ1d>de	e2 fddZ3G dd de"Z4G dd deZ5G d d! d!eZ6e6j7d"d#d$ged%d%d&d'id(d)d)dd*d+d,ed-ed.e2d/e8d0e8d1e	e fd2d3Z9ed4g d5Z:d?d7d8Z;d9d: Z<d@d<d=Z=d!gZ>dS )A    N)
namedtuple)Path)AnyCallableDictOptionalUnion)Model   )util)Errors)BaseDefaultsLanguage)Morphologizer)DEFAULT_MORPH_MODEL)Scorer)POS)DocMorphAnalysis)validate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )
STOP_WORDS)SYNTAX_ITERATORS)TAG_BIGRAM_MAP)TAG_MAP)TAG_ORTH_MAPzU
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null

split_modec                    s    fdd}|S )Nc                    s   t | j dS )Nr    )JapaneseTokenizervocab)nlpr!    J/home/ubuntu/.local/lib/python3.10/site-packages/spacy/lang/ja/__init__.pyjapanese_tokenizer_factory$   s   z4create_tokenizer.<locals>.japanese_tokenizer_factoryr%   )r    r'   r%   r!   r&   create_tokenizer#   s   r(   c                   @   s   e Zd Zd#dedee ddfddZdd Zd	edefd
dZ	d$de
fddZdd Zdd Zdeeef fddZi fdeeef ddfddZdefddZdedd fddZdeeef ddfdd Zdeeef dd fd!d"ZdS )%r"   Nr#   r    returnc                 C   s0   || _ || _t| j| _|d u p|dk | _d S )NA)r#   r    try_sudachi_import	tokenizerneed_subtokens)selfr#   r    r%   r%   r&   __init__+   s   zJapaneseTokenizer.__init__c                 C   s   t | j| jffS N)r"   r#   r    r.   r%   r%   r&   
__reduce__2   s   zJapaneseTokenizer.__reduce__textc                 C   s6  | j |}| |}t||\}}|rt| ng gd \}}}}}	}
}t|}t| j||d}d }tt||D ]W\}\}}|j	|_
|rK||_d }nt|j|j	|d t|k r^||d  nd \|_}|jrj|jn|j|_i }|jrx|j|d< |j|_|jrtdd|j|d< t| j||_q9| jr||jd< |S )	N   )wordsspacesr   
Inflectionz[=|]_Reading
sub_tokens)r,   tokenize_get_dtokensget_dtokens_and_spacesziplistr   r#   	enumeratetagtag_posresolve_posorth_lenlemmasurfacelemma_infnormnorm_readingresubr   morphr-   	user_data)r.   r3   sudachipy_tokensdtokensr6   r5   tagsinflectionslemmasnormsreadingssub_tokens_listdocnext_posidxtokendtokenrP   r%   r%   r&   __call__5   s:   



zJapaneseTokenizer.__call__Tneed_sub_tokensc                    s>   |r|  |nd fddt|D   fddt D S )Nc                    s   g | ]@\}}t | d krt| ddd | dd D ddd | dd D | | |  r? | ndqS )r   -c                 S      g | ]}|d kr|qS *r%   .0xxr%   r%   r&   
<listcomp>d       z=JapaneseTokenizer._get_dtokens.<locals>.<listcomp>.<listcomp>N   ;c                 S   rb   rc   r%   re   r%   r%   r&   rh   e   ri   )rF   rH   DetailedTokenjoinpart_of_speechdictionary_formnormalized_formreading_form)rf   r\   r]   )rY   r%   r&   rh   a   s    z2JapaneseTokenizer._get_dtokens.<locals>.<listcomp>c                    sT   g | ]&\}}|d ks&|j  r&|jdks& |d  j  r& |d  jdkr|qS )r      空白r   )rH   isspacerA   )rf   r\   t)rS   r%   r&   rh   s   s    
)_get_sub_tokensr@   )r.   rR   r`   r%   )rS   rY   r&   r<   ]   s   

zJapaneseTokenizer._get_dtokensc                 C   s   | j sd S g }|D ]S}|| jjj}t|dkr|d  q	| jdkr/|| |dg q	|| jjj	}t|t|krM| |d}|||g q	|| |d| |dg q	|S )Nr   BF)
r-   splitr,   	SplitModer*   rF   appendr    r<   rv   )r.   rR   rY   r]   sub_asub_brS   r%   r%   r&   ru   }   s&   


z!JapaneseTokenizer._get_sub_tokensc                 C   s   t |d t|S )NJapaneseTokenizer.score)r   r   score_tokenization)r.   examplesr%   r%   r&   score   s   

r|   c                 C   s
   d| j iS Nr    r!   r1   r%   r%   r&   _get_config   s   
zJapaneseTokenizer._get_configconfigc                 C   s   | dd | _d S r   )getr    )r.   r   r%   r%   r&   _set_config   s   zJapaneseTokenizer._set_configc                    s   d fddi}t |g S )Ncfgc                      s   t   S r0   )srsly
json_dumpsr   r%   r1   r%   r&   <lambda>   s    z,JapaneseTokenizer.to_bytes.<locals>.<lambda>)r   to_bytes)r.   kwargsserializersr%   r1   r&   r      s   zJapaneseTokenizer.to_bytesdatac                    s.   d fddi}t ||g  t j _ S )Nr   c                         t| S r0   )r   r   
json_loads)br1   r%   r&   r          z.JapaneseTokenizer.from_bytes.<locals>.<lambda>)r   
from_bytesr+   r    r,   )r.   r   r   deserializersr%   r1   r&   r      s   zJapaneseTokenizer.from_bytespathc                    s,   t |}d fddi}t ||g  d S )Nr   c                    s   t |   S r0   )r   
write_jsonr   pr1   r%   r&   r      r   z+JapaneseTokenizer.to_disk.<locals>.<lambda>)r   ensure_pathto_diskr.   r   r   r   r%   r1   r&   r      s   
zJapaneseTokenizer.to_diskc                    s8   t |}d fddi}t ||g  t j _ S )Nr   c                    r   r0   )r   r   	read_jsonr   r1   r%   r&   r      r   z-JapaneseTokenizer.from_disk.<locals>.<lambda>)r   r   	from_diskr+   r    r,   r   r%   r1   r&   r      s
   
zJapaneseTokenizer.from_diskr0   )T)__name__
__module____qualname__r   r   strr/   r2   r   r_   boolr<   ru   r   r   r   r   r   bytesr   r   r   r   r   r   r%   r%   r%   r&   r"   *   s    ( r"   c                   @   s(   e Zd ZeeZeZeZ	ddddZ
dS )JapaneseDefaultsltrF)	directionhas_casehas_lettersN)r   r   r   r   DEFAULT_CONFIGr   r   
stop_wordsr   syntax_iteratorswriting_systemr%   r%   r%   r&   r      s
    r   c                   @   s   e Zd ZdZeZdS )JapanesejaN)r   r   r   langr   Defaultsr%   r%   r%   r&   r      s    r   morphologizerztoken.morphz	token.posTz@scorerszspacy.morphologizer_scorer.v1)model	overwriteextendscorerg      ?)pos_accmorph_micro_fmorph_per_feat)assignsdefault_configdefault_score_weightsr$   r   namer   r   r   c                 C   s   t | j|||||dS )N)r   r   r   )r   r#   )r$   r   r   r   r   r   r%   r%   r&   make_morphologizer   s   r   rl   )rH   rA   rJ   rG   rK   rM   r:   r*   c                 C   sj   z(ddl m}m} |jjj|jjj|jjj|jjjd|  } | j	| d}|W S  t
y4   t
ddw )zSudachiPy is required for Japanese support, so check for it.
    It it's not available blow up and explain how to fix it.
    split_mode should be one of these values: "A", "B", "C", None->"A".r   )
dictionaryr,   )Nr*   rv   C)modezJapanese support requires SudachiPy and SudachiDict-core (https://github.com/WorksApplications/SudachiPy). Install with `pip install sudachipy sudachidict_core` or install spaCy with `pip install spacy[ja]`.N)	sudachipyr   r,   	Tokenizerrx   r*   rv   r   
DictionarycreateImportError)r    r   r,   tokr%   r%   r&   r+      s$   r+   c                 C   st   |t v rt | }| |v r||  dfS |r2||f}|tv r2t| \}}|du r.t| t |fS ||fS t| t dfS )a2  If necessary, add a field to the POS tag for UD mapping.
    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function returns resolved POSs for both token
    and next_token by tuple.
    N)r   r   r   r   )orthrA   next_tagorth_map
tag_bigramcurrent_posr[   r%   r%   r&   rD      s   
rD   rr   c                 C   s  dd | D }d d | d | kr"ttjj||dg }g }d}t|dkr2||fS tdd |D dkrU| sCJ t||d||d d g}dg}||fS t	t
|| D ]t\}\}}	| rgq\z||d  |}
W n ty   ttjj||dd w |
dkr||||
  }|t||d||d d  |d ||
7 }||	 |d |t|7 }|d t| k r| |d  jd	krd
|d< |d7 }q\|t|k r||d  }|t||d||d d  |d ||fS )Nc                 S   s   g | ]}|j qS r%   )rH   )rf   xr%   r%   r&   rh   "  s    z*get_dtokens_and_spaces.<locals>.<listcomp> )r3   r5   r   c                 S   s   g | ]}|  s|qS r%   )rs   )rf   wordr%   r%   r&   rh   ,  ri   Fr    T)rm   rw   
ValueErrorr   E194formatrF   rs   rl   r@   r>   indexry   rH   )rS   r3   gap_tagr5   text_dtokenstext_spacestext_posir   r^   
word_startwr%   r%   r&   r=      sL   "


"
r=   r0   )r*   )rr   )?rN   collectionsr   pathlibr   typingr   r   r   r   r   r   	thinc.apir	   r   r   errorsr   languager   r   pipeliner   pipeline.morphologizerr   r   r   symbolsr   tokensr   r   trainingr   r   r   r   r#   r   r   r   r   r   tag_bigram_mapr   tag_mapr   tag_orth_mapr   r   r   r(   r"   r   r   factoryr   r   rl   r+   rD   r=   __all__r%   r%   r%   r&   <module>   sz    	 

"
5