o
    Si%                     @   sB  d dl mZmZ d dlZd dlZd dlZddlmZ ddlT dZdZ	d	Z
d
ZedZedZedZedZedZedZedejZdd Zejdr]e \ZZZZnddlmZ ddlmZ ddl mZ ddl!mZ G dd de"Z#G dd de"Z$e$ej%a%t%j&Z&dd Z'dd Z(d$d d!Z)d$d"d#Z*dS )%    )absolute_importunicode_literalsN   )viterbi   )*zprob_start.pzprob_trans.pzprob_emit.pzchar_state_tab.pu   ([一-鿕]+)z([\.0-9]+|[a-zA-Z0-9]+)u   ([一-鿕a-zA-Z0-9+#&\._]+)z(
|\s)z[a-zA-Z0-9]+z[\.0-9]+z^[a-zA-Z0-9]$c                  C   sL   t tdt} t tdt}t tdt}t tdt}|| ||fS )Nposseg)pickleloadget_module_resPROB_START_PPROB_TRANS_PPROB_EMIT_PCHAR_STATE_TAB_P)start_ptrans_pemit_pstate r   I/home/ubuntu/.local/lib/python3.10/site-packages/jieba/posseg/__init__.py
load_model   s
   r   java)Pc                   @   sT   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )pairc                 C   s   || _ || _d S Nwordflag)selfr   r   r   r   r   __init__.   s   
zpair.__init__c                 C      d| j | jf S )Nz%s/%sr   r   r   r   r   __unicode__2      zpair.__unicode__c                 C   r    )Nzpair(%r, %r)r   r!   r   r   r   __repr__5   r#   zpair.__repr__c                 C   s   t r	|  tS |  S r   )PY2r"   encodedefault_encodingr!   r   r   r   __str__8   s   zpair.__str__c                 C   s   t | j| jfS r   )iterr   r   r!   r   r   r   __iter__>   r#   zpair.__iter__c                 C   s   | j |j k S r   )r   r   otherr   r   r   __lt__A   s   zpair.__lt__c                 C   s"   t |to| j|jko| j|jkS r   )
isinstancer   r   r   r+   r   r   r   __eq__D   s   "zpair.__eq__c                 C   s
   t | jS r   )hashr   r!   r   r   r   __hash__G      
zpair.__hash__c                 C   s   |   |S r   )r"   r&   )r   argr   r   r   r&   J      zpair.encodeN)__name__
__module____qualname__r   r"   r$   r(   r*   r-   r/   r1   r&   r   r   r   r   r   ,   s    r   c                   @   s   e Zd Zd!ddZdd Zdd Zd!dd	Zd
d Zdd Zdd Z	dd Z
dd Zdd Zd"ddZdd Zdd Zd"ddZdd  ZdS )#POSTokenizerNc                 C   s"   |pt  | _| | j  d S r   )jieba	Tokenizer	tokenizerload_word_tagget_dict_file)r   r;   r   r   r   r   P   s   zPOSTokenizer.__init__c                 C   s
   d| j  S )Nz<POSTokenizer tokenizer=%r>)r;   r!   r   r   r   r$   T   r2   zPOSTokenizer.__repr__c                 C   s   |dv rt t| j|S )N)cut_for_searchlcut_for_searchtokenize)NotImplementedErrorgetattrr;   )r   namer   r   r   __getattr__W   s   zPOSTokenizer.__getattr__c                 C   s    | j | | | j   d S r   )r;   
initializer<   r=   )r   
dictionaryr   r   r   rE   ]   s   zPOSTokenizer.initializec              	   C   s   i | _ t|}t|dD ].\}}z| d}|sW q|d\}}}|| j |< W q ty:   td|||f w |  d S )Nr   zutf-8 z1invalid POS dictionary entry in %s at Line %s: %s)	word_tag_tabresolve_filename	enumeratestripdecodesplit	Exception
ValueErrorclose)r   ff_namelinenoliner   _tagr   r   r   r<   a   s   zPOSTokenizer.load_word_tagc                 C   s(   | j jr| j| j j i | j _d S d S r   )r;   user_word_tag_tabrH   updater!   r   r   r   makesure_userdict_loadedp   s   z%POSTokenizer.makesure_userdict_loadedc           	      c   s    t |tttt\}}d\}}t|D ]<\}}|| d }|dkr$|}q|dkr=t|||d  || d V  |d }q|dkrOt||| d V  |d }q|t|k rft||d  || d V  d S d S )N)r   r   r   BEr   S)r   char_state_tab_Pstart_Ptrans_Pemit_PrJ   r   len)	r   sentenceprobpos_listbeginnextiicharposr   r   r   __cutu   s&   
 
 zPOSTokenizer.__cutc                 c   s    t |}|D ]=}t |r| |D ]}|V  qqt|}|D ]"}|rDt|r2t|dV  q"t|r>t|dV  q"t|dV  q"qd S )Nmengx)re_han_detailrM   match_POSTokenizer__cutre_skip_detailre_numr   re_eng)r   rb   blocksblkr   tmprm   r   r   r   __cut_detail   s$   




zPOSTokenizer.__cut_detailc           	      c   s    | j |}i }| j ||| d}t|}d}||k rR|| d d }||| }t|r7||7 }|}n|rAt|dV  d}t|| j|dV  |}||k s|r^t|dV  d}d S d S )Nr    r   rl   rm   )	r;   get_DAGcalcra   re_eng1ro   r   rH   get)	r   rb   DAGrouterm   Nbufyl_wordr   r   r   __cut_DAG_NO_HMM   s.   
zPOSTokenizer.__cut_DAG_NO_HMMc                 c   s   | j |}i }| j ||| d}d}t|}||k r|| d d }||| }|| dkr6||7 }nF|rqt|dkrJt|| j|dV  n%| j j|s_| |}	|	D ]}
|
V  qXn|D ]}t|| j|dV  qad}t|| j|dV  |}||k s|rt|dkrt|| j|dV  d S | j j|s| |}	|	D ]}
|
V  qd S |D ]}t|| j|dV  qd S d S )Nr   rx   r   rm   )	r;   ry   rz   ra   r   rH   r|   FREQ_POSTokenizer__cut_detail)r   rb   r}   r~   rm   r   r   r   r   
recognizedtelemr   r   r   	__cut_DAG   sL   


zPOSTokenizer.__cut_DAGTc           
      c   s    |    t|}t|}|r| j}n| j}|D ]K}t|r+||D ]}|V  q$qt|}|D ]1}t|r@t|dV  q2|D ] }	t	|	rPt|	dV  qBt
|r\t|	dV  qBt|	dV  qBq2qd S )Nrm   rk   rl   )rY   	strdecodere_han_internalrM   _POSTokenizer__cut_DAG_POSTokenizer__cut_DAG_NO_HMMro   re_skip_internalr   rr   rs   )
r   rb   HMMrt   cut_blkru   r   rv   rm   xxr   r   r   __cut_internal   s2   





zPOSTokenizer.__cut_internalc                 C   s   t | |S r   list_POSTokenizer__cut_internalr   rb   r   r   r   _lcut_internal   r4   zPOSTokenizer._lcut_internalc                 C   s   t | |dS )NFr   r   r   r   r   _lcut_internal_no_hmm   r#   z"POSTokenizer._lcut_internal_no_hmmc                 c   s     | j ||dD ]}|V  qd S )Nr   )r   )r   rb   r   wr   r   r   cut   s   zPOSTokenizer.cutc                 O   s   t | j|i |S r   r   r   )r   argskwargsr   r   r   lcut   s   zPOSTokenizer.lcutr   )T)r5   r6   r7   r   r$   rD   rE   r<   rY   rp   r   r   r   r   r   r   r   r   r   r   r   r   r8   N   s     


(
r8   c                 C   
   t | S r   )dtr   sr   r   r   r   	  r2   r   c                 C   r   r   )r   r   r   r   r   r   r     r2   r   TFc                 c   s   t d }|rG|rG| du s| dks| dkrdS ddlm  m} |t| \}}t|D ]\}}|du s;|| du r<q-t||| V  q-dS tj	du r[t
j| |dD ]}	|	V  qSdS t| d}
|rltj	t|
}ntj	t|
}|D ]
}|D ]}	|	V  qyqudS )z
    Global `cut` function that supports parallel processing.

    Note that this only works using dt, custom POSTokenizer
    instances are not supported.
    is_paddle_installedNrx   r   r   T)check_paddle_installjieba.lac_small.predict	lac_smallpredict
get_resultr   rJ   r   r9   poolr   r   
splitlinesmapr   r   )rb   r   
use_paddler   r   sentstagsrg   sentr   partsresultrr   r   r   r     s2   
r   c                 C   s"   |r
t t| ddS t t| |S )NT)r   r   )rb   r   r   r   r   r   r   3  s   r   )TF)+
__future__r   r   r	   rer9   r   _compatr   r   r   r   compilern   rq   r   r   rs   rr   Ur{   r   sysplatform
startswithr]   r^   r_   r`   char_state_tabr   
prob_start
prob_trans	prob_emitobjectr   r8   r   rE   r   r   r   r   r   r   r   r   <module>   sB    





	" 
5
"