o
    SiaM                     @   s  d dl mZmZ dZdZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZ ddlmZ dd	lT ejd
krBd dlmZ nejZdd ZdZdZeejZee Z!e!"ej# e!$e i Z%da&e'dej(Z)e'dej(Z*e'dej(Z+e'dej(Z,dd Z-G dd de.Z/e/ a0d-ddZ1t0j2Z2t0j3Z3t0j4a4t0j5Z5t0j6a6t0j7Z7t0j8Z8t0j9Z9t0j:Z:t0j;Z;t0j<Z<t0j=Z=t0j>Z>t0j?Z?t0j@Z@dd ZAdd ZBdd ZCdd ZAdd  ZDd!d" ZEd.d%d&ZFd/d'd(ZGd-d)d*ZHd+d, ZIdS )0    )absolute_importunicode_literalsz0.42.1MITN)md5)log   )finalseg)*nt)movec                 C   s   t jt jt  | S N)ospathnormpathjoingetcwd)r    r   B/home/ubuntu/.local/lib/python3.10/site-packages/jieba/__init__.py<lambda>   s    r   zdict.txtz^(.+?)( [0-9]+)?( [a-z]+)?$z[a-zA-Z0-9]u   ([一-鿕a-zA-Z0-9+#&\._%\-]+)z(
|\s)c                 C   s   t |  d S r   )default_loggersetLevel)	log_levelr   r   r   setLogLevel1      r   c                   @   s   e Zd ZefddZdd Zedd Zd5dd	Zd
d Z	dd Z
dd Zdd Zdd Zdd Zd6ddZd7ddZdd Zdd ZeZeZd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd8d*d+Zd,d- Zd9d.d/Zd:d1d2Zd3d4 ZdS );	Tokenizerc                 C   sL   t  | _|tkr|| _nt|| _i | _d| _i | _d| _	d | _
d | _d S )Nr   F)	threadingRLocklockDEFAULT_DICT
dictionary_get_abs_pathFREQtotaluser_word_tag_tabinitializedtmp_dir
cache_file)selfr   r   r   r   __init__7   s   


zTokenizer.__init__c                 C   s
   d| j  S )Nz<Tokenizer dictionary=%r>)r   r'   r   r   r   __repr__D      
zTokenizer.__repr__c           
   	   C   s   i }d}t | }t| dD ]M\}}z9| d}|dd d \}}t|}|||< ||7 }tt|D ]}|d |d  }	|	|vrHd||	< q6W q tyZ   td|||f w | 	  ||fS )Nr   r   utf-8    z-invalid dictionary entry in %s at Line %s: %s)
resolve_filename	enumeratestripdecodesplitintxrangelen
ValueErrorclose)
flfreqltotalf_namelinenolinewordfreqchwfragr   r   r   
gen_pfdictG   s.   zTokenizer.gen_pfdictNc              
   C   s  |rt |}| j|kr| jrd S || _d| _n| j}| jR zt|  W d    n1 s/w   Y  W n	 ty>   Y nw | jrK	 W d    d S td|pQd  t }| j	r_| j	}n|t
krfd}ndt|dd  }tj| jp{t |}tj|}d}tj|r|t
kstj|tj|krtd	|  z"t|d
}t|\| _| _W d    n1 sw   Y  d}W n ty   d}Y nw |rUt|t }|t|< |Z |  | ! \| _| _td|  z0tj"|d\}	}
t#|	d}t$| j| jf| W d    n	1 sw   Y  t%|
| W n ty4   t&d Y nw W d    n	1 s@w   Y  zt|= W n
 tyT   Y nw d| _tdt |   td W d    d S 1 stw   Y  d S )NFz Building prefix dict from %s ...zthe default dictionaryzjieba.cachezjieba.u%s.cacher,   replaceTzLoading model from cache %srbzDumping model to file cache %s)dirwbzDump cache file failed.z Loading model cost %.3f seconds.z(Prefix dict has been built successfully.)'r    r   r$   r   DICT_WRITINGKeyErrorr   debugtimer&   r   r   encode	hexdigestr   r   r   r%   tempfile
gettempdirdirnameisfilegetmtimeopenmarshalloadr!   r"   	Exceptiongetr   r   rC   get_dict_filemkstempfdopendump_replace_file	exception)r'   r   abs_patht1r&   tmpdirload_from_cache_failcfwlockfdfpathtemp_cache_filer   r   r   
initialize]   s   

	

$zTokenizer.initializec                 C   s   | j s	|   d S d S r   )r$   rg   r)   r   r   r   check_initialized   s   zTokenizer.check_initializedc                    sZ   t }d|< tjt|d ddD ] t fdd|  D  < qd S )N)r   r   r   c                 3   sH    | ]}t j |d   pd  |d   d  |fV  qdS )r   r   N)r   r!   rW   ).0xidxlogtotalrouter'   sentencer   r   	<genexpr>   s    z!Tokenizer.calc.<locals>.<genexpr>)r6   r   r"   r5   max)r'   rp   DAGro   Nr   rl   r   calc   s   
zTokenizer.calcc                 C   s   |    i }t|}t|D ]=}g }|}|| }||k r@|| jv r@| j| r+|| |d7 }|||d  }||k r@|| jv s!|sG|| |||< q|S )Nr   )rh   r6   r5   r!   append)r'   rp   rs   rt   ktmplistifragr   r   r   get_DAG   s"   



zTokenizer.get_DAGc           
      c   s    |  |}d}d}d}t|D ]Z\}}|dkr$t|| s$d}|V  t|dkrV||krV|||d d  }t|rJ|dkrFd}|}n||7 }|dkrQ|V  |d }q|D ]}	|	|kri|||	d  V  |	}qXq|dkrt|V  d S d S )Nri   r    r   )r{   	iteritemsre_engmatchr6   )
r'   rp   dagold_jeng_scaneng_bufrw   Lr?   jr   r   r   	__cut_all   s8   



zTokenizer.__cut_allc           	      c   s    |  |}i }| ||| d}t|}d}||k rK|| d d }||| }t|r;t|dkr;||7 }|}n|rB|V  d}|V  |}||k s|rT|V  d}d S d S Nr   r|   r   )r{   ru   r6   r~   r   )	r'   rp   rs   ro   rk   rt   bufyl_wordr   r   r   __cut_DAG_NO_HMM   s.   
zTokenizer.__cut_DAG_NO_HMMc                 c   s.   |  |}i }| ||| d}d}t|}||k ri|| d d }||| }|| dkr4||7 }n/|r`t|dkrB|V  d}n| j|sVt|}	|	D ]}
|
V  qOn|D ]}|V  qXd}|V  |}||k s|rt|dkrv|V  d S | j|st|}	|	D ]}
|
V  qd S |D ]}|V  qd S d S r   )r{   ru   r6   r!   rW   r   cut)r'   rp   rs   ro   rk   r   rt   r   r   
recognizedtelemr   r   r   	__cut_DAG   sN   




zTokenizer.__cut_DAGFTc                 c   s   t d }t|}|r6|r6|du st|dkrdS ddlm  m} ||}|D ]
}|du r0q)|V  q)dS t}	t}
|r@| j	}n	|rF| j
}n| j}|	|}|D ]6}|sUqP|	|re||D ]}|V  q^qP|
|}|D ]}|
|rw|V  ql|s|D ]}|V  q{ql|V  qlqPdS )ab  
        The main function that segments an entire sentence that contains
        Chinese characters into separated words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
        is_paddle_installedNr   )check_paddle_install	strdecoder6   jieba.lac_small.predict	lac_smallpredictget_sentre_han_defaultre_skip_default_Tokenizer__cut_all_Tokenizer__cut_DAG_Tokenizer__cut_DAG_NO_HMMr3   r   )r'   rp   cut_allHMM
use_paddler   r   resultssentre_hanre_skip	cut_blockblocksblkr?   tmprk   xxr   r   r   r   !  sN   





zTokenizer.cutc                 c   s    | j ||d}|D ]I}t|dkr.tt|d D ]}|||d  }| j|r-|V  qt|dkrPtt|d D ]}|||d  }| j|rO|V  q<|V  q
dS )z8
        Finer segmentation for search engines.
        r   r.   r      N)r   r6   r5   r!   rW   )r'   rp   r   wordswry   gram2gram3r   r   r   cut_for_searchR  s"   zTokenizer.cut_for_searchc                 O      t | j|i |S r   )listr   r'   argskwargsr   r   r   lcutd     zTokenizer.lcutc                 O   r   r   )r   r   r   r   r   r   lcut_for_searchg  r   zTokenizer.lcut_for_searchc                 C   s   |  |ddS NFr   r'   rp   r   r   r   _lcut_no_hmmm  r   zTokenizer._lcut_no_hmmc                 C      |  |dS NTr   r   r   r   r   	_lcut_allp     zTokenizer._lcut_allc                 C   r   r   )r   r   r   r   r   _lcut_for_search_no_hmms  r   z!Tokenizer._lcut_for_search_no_hmmc                 C   s   | j tkr	ttS t| j dS )NrE   )r   r   get_module_resDEFAULT_DICT_NAMErS   r)   r   r   r   rX   v  s   
zTokenizer.get_dict_filec           	   	   C   s   |    t|tr|}t|d}nt|}t|dD ]I\}}| }t|ts?z
|d	d}W n t
y>   td| w |sBqt| \}}}|durT| }|dur\| }| ||| qdS )a  
        Load personalized dict to improve detect rate.

        Parameter:
            - f : A plain text file contains words and their ocurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be ignored
        rE   r   r,   u   ﻿z dictionary file %s must be utf-8N)rh   
isinstancestring_typesrS   r/   r0   r1   	text_typer2   lstripUnicodeDecodeErrorr7   re_userdictr   groupsadd_word)	r'   r9   r<   r=   lnr>   r?   r@   tagr   r   r   load_userdict|  s,   

zTokenizer.load_userdictc                 C   s   |    t|}|durt|n| |d}|| j|< |  j|7  _|r)|| j|< tt|D ]}|d|d  }|| jvrCd| j|< q/|dkrOt	
| dS dS )z
        Add a word to dictionary.

        freq and tag can be omitted, freq defaults to be a calculated value
        that ensures the word can be cut out.
        NFr   r   )rh   r   r4   suggest_freqr!   r"   r#   r5   r6   r   add_force_split)r'   r?   r@   r   rA   rB   r   r   r   r     s   



zTokenizer.add_wordc                 C   s   |  |d dS )z:
        Convenient function for deleting a word.
        r   N)r   )r'   r?   r   r   r   del_word  s   zTokenizer.del_wordc                 C   s   |    t| j}d}t|tr9|}| j|ddD ]}|| j|d| 9 }qtt	|| j d | j|d}n+t
tt|}d|}|D ]}|| j|d| 9 }qGtt	|| j | j|d}|rl| || |S )a  
        Suggest word frequency to force the characters in a word to be
        joined or splitted.

        Parameter:
            - segment : The segments that the word is expected to be cut into,
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
        r   Fr   r|   r   )rh   floatr"   r   r   r   r!   rW   rr   r4   tuplemapr   r   minr   )r'   segmenttuneftotalr@   r?   segr   r   r   r     s    

$
zTokenizer.suggest_freqdefaultc           
      c   s<   t |ts
tdd}|dkr,| j||dD ]}t|}|||| fV  ||7 }qdS | j||dD ]h}t|}t|dkrdtt|d D ]}|||d  }| j|rc||| || d fV  qGt|dkrtt|d D ]}|||d  }	| j|	r|	|| || d fV  qr|||| fV  ||7 }q3dS )	a%  
        Tokenize a sentence and yields tuples of (word, start, end)

        Parameter:
            - sentence: the str(unicode) to be segmented.
            - mode: "default" or "search", "search" is for finer segmentation.
            - HMM: whether to use the Hidden Markov Model.
        z-jieba: the input parameter should be unicode.r   r   r   r.   r   r   N)r   r   r7   r   r6   r5   r!   rW   )
r'   unicode_sentencemoder   startr   widthry   r   r   r   r   r   tokenize  s6   
	

zTokenizer.tokenizec                 C   sX   | j  t|}tj|std| || _d| _W d    d S 1 s%w   Y  d S )Nzjieba: file does not exist: F)r   r    r   r   rQ   rV   r   r$   )r'   dictionary_pathr^   r   r   r   set_dictionary  s   "zTokenizer.set_dictionaryr   )FTFT)NN)F)r   T)__name__
__module____qualname__r   r(   r*   staticmethodrC   rg   rh   ru   r{   r   r   r   r   r   r   r   _lcut_lcut_for_searchr   r   r   rX   r   r   r   r   r   r   r   r   r   r   r   5   s8    

K
(
1
&

!r   c                 C   s   t j| |S r   )dtr!   rW   )rw   dr   r   r   r     s    c                 C   
   t | S r   r   r   sr   r   r   r     r+   r   c                 C   r   r   )r   r   r   r   r   r   r   "  r+   r   c                 C   r   r   )r   r   r   r   r   r   r   &  r+   r   c                 C   r   r   r   r   r   r   r   r   *  r+   c                 C   r   r   )r   r   r   r   r   r   r   .  r+   r   c                 C   r   r   )r   r   r   r   r   r   r   2  r+   r   FTc                 c   s^    t | d}|rtt|}n|rtt|}ntt|}|D ]
}|D ]}|V  q&q"d S r   )r   
splitlinespoolr   r   r   r   )rp   r   r   partsresultrr   r   r   r   _pcut6  s   r   c                 c   sL    t | d}|rtt|}ntt|}|D ]
}|D ]}|V  qqd S r   )r   r   r   r   r   r   )rp   r   r   r   r   r   r   r   r   _pcut_for_searchC  s   r   c                 C   sT   ddl m} tjdkrtdddl m} t  | du r | } || at	a
tadS )z
    Change the module's `cut` and `cut_for_search` functions to the
    parallel version.

    Note that this only works using dt, custom Tokenizer
    instances are not supported.
    r   )	cpu_countr
   z/jieba: parallel mode only supports posix system)PoolN)multiprocessingr   r   nameNotImplementedErrorr   r   rh   r   r   r   r   r   )
processnumr   r   r   r   r   enable_parallelN  s   	
r   c                   C   s    t rt   d a tjatjad S r   )r   r8   r   r   r   r   r   r   r   disable_parallele  s
   
r   r   )FTr   )J
__future__r   r   __version____license__rT   rerN   r   rK   hashlibr   mathr   r|   r   _compatr   r   shutilr   r\   renamer    r   r   loggingStreamHandlersysstderrlog_console	getLoggerr   r   r   DEBUG
addHandlerrH   r   compileUr   r~   r   r   r   objectr   r   get_FREQr   ru   r   r   r   r   r   r{   rX   rg   r   r   r   r   r#   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sx    


   V



