o
    Qi6                     @   sh   d dl mZ d dl mZ d dlZd dlZd dlZd dlZd dlm	Z	 eddZ
dd ZG d	d
 d
ZdS )    )defaultdict)
namedtupleN)get_process_memoryScoreszcohesion_forward cohesion_backward left_branching_entropy right_branching_entropy left_accessor_variety right_accessor_variety leftside_frequency rightside_frequencyc                 C   sL   | sdS t |  }d}|  D ]}t|| }||t| 7 }qd| S )N        r   )sumvaluesfloatmathlog)dicsum_entropyfreqprob r   E/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/word/_word.py_entropy   s   r   c                   @   s   e Zd Z								
d(ddZd)ddZd*ddZdd Zdd Zdd Zdd Z	e
fddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' ZdS )+WordExtractorN
         順 皙?r   \(\?r   Fc                 C   sv   || _ || _|| _i | _i | _i | _i | _|| _|| _|| _	|	| _
|
| _|| _|| _|| _|| _|r9| | d S d S N)max_left_lengthmax_right_lengthmin_frequencyLR_aL_aRverbosemin_cohesion_forwardmin_cohesion_backwardmax_droprate_leftside_frequencymin_left_branching_entropymin_right_branching_entropymin_left_accessor_varietymin_right_accessor_varietyremove_subwordstrain)selfsentsr   r   r   verbose_pointsr%   r&   max_droprate_cohesionr'   r(   r)   r*   r+   r,   r   r   r   __init__   s&   zWordExtractor.__init__Tc              	      s   fdd} fdd}|r+t t j _t t j _t t j _t t j _nt t _t t _t t _t t _t|D ]\}}tjj	dkrXt
t|  }n| }|D ]H}	|	rht|	dkriq^t|	}
tdt jd |
d D ]} j|	d |   d7  < qztdt jd |
D ]} j|	| d    d7  < qq^t|dkrqCt|d g|d d  ||dd  |d g D ]k\}}	} jd	|	|d f   d7  <  jd	|d |	f   d7  < t|	}
tdt jd |
D ]} jd	|	| d  |d f   d7  < qtdt jd |
D ]} jd	|d |	d | f   d7  < qq|dkrA|| dkrA|   jdkr]| j dkr]tjd
|t|t f  qC|  |   jdkrqtdt   t j _t j _t j _t j _d S )Nc                      sL   t dd  fdd j D  _t dd  fdd j D  _d S )Nc                   S      dS Nr   r   r   r   r   r   <lambda>9       zAWordExtractor.train.<locals>.prune_extreme_case.<locals>.<lambda>c                        i | ]\}}| j kr||qS r   r   .0wfr.   r   r   
<dictcomp>9        zCWordExtractor.train.<locals>.prune_extreme_case.<locals>.<dictcomp>c                   S   r3   r4   r   r   r   r   r   r5   :   r6   c                    r7   r   r8   r9   r=   r   r   r>   :   r?   )r   r    itemsr!   r   r=   r   r   prune_extreme_case8   s   $(z/WordExtractor.train.<locals>.prune_extreme_casec                      sD   t dd dd  j D  _t dd dd  j D  _d S )Nc                   S   r3   r4   r   r   r   r   r   r5   <   r6   zCWordExtractor.train.<locals>.prune_extreme_case_a.<locals>.<lambda>c                 S      i | ]\}}|d kr||qS    r   r9   r   r   r   r>   <       zEWordExtractor.train.<locals>.prune_extreme_case_a.<locals>.<dictcomp>c                   S   r3   r4   r   r   r   r   r   r5   =   r6   c                 S   rB   rC   r   r9   r   r   r   r>   =   rE   )r   r"   r@   r#   r   r=   r   r   prune_extreme_case_a;   s    $z1WordExtractor.train.<locals>.prune_extreme_case_a   rD   r   r   z%s %sz1training ... (%d in %d sents) use memory %.3f Gbz'training was done. used memory %.3f Gb)r   intr    r!   r"   r#   	enumeratesysversion_infomajormapunicodestripsplitlenrangeminr   r   zipr$   stdoutwriter   printdict)r.   r/   num_for_pruningcumulaterA   rF   num_sentsentwordswordword_leni	left_word
right_wordr   r=   r   r-   7   s`   



:**zWordExtractor.trainc                 C   s   |s|   }i }t| dd dD ]c\}}|j| jk s8|j| jk s8|j| jk s8|j	| j
k s8t|j|j| jk r9qt|dkrL|j| jk sK|j| jk rLq|||< | jsTq|d d }|| jvradn|j| j|  }|| jkru||v ru||= q|S )Nc                 S   s   t | d S r4   rQ   )xr   r   r   r5   w   s    z'WordExtractor.extract.<locals>.<lambda>)keyrG   r   r   )word_scoressortedr@   left_branching_entropyr(   right_branching_entropyr)   left_accessor_varietyr*   right_accessor_varietyr+   maxleftside_frequencyrightside_frequencyr   rQ   cohesion_forwardr%   cohesion_backwardr&   r,   r    r'   )r.   scoresscores_r^   scoresubworddroprate_leftside_frequencyr   r   r   extracts   s.   zWordExtractor.extractc           	      C   s   |   }|  }|  }i }|  D ]7}||d}||d}||d}t|d |d |d |d |d |d | j|d| j|d||< q|S )Nr   r   r   rD   )all_cohesion_scoresall_branching_entropyall_accessor_varietyr]   getr   r    r!   )	r.   cpsbesavsrq   r^   cpbeavr   r   r   rf      s   HzWordExtractor.word_scoresc                 C   s   i }|   }t|D ]4\}}| jdkr(|| j dkr(tjd|d t|f  | |}|d dkr:|d dkr:q
|||< q
| jdkrLtdt|  |S )Nr   z' cohesion probabilities ... (%d in %d)rD   z6all cohesion probabilities was computed. # words = %d)	r]   rI   r$   rJ   rU   rV   rQ   cohesion_scorerW   )r.   r|   r]   r`   r^   r   r   r   r   rx      s   


z!WordExtractor.all_cohesion_scoresc                 C   s   t |}|r
|dkrdS tt| |\}}|dkrdnt|| j|d   d|d  }|dkr3dnt|| j|d   d|d  }||fS )NrD   rw   r   r   )rQ   rM   r
   	frequencynppowerr    r!   )r.   r^   r_   l_freqr_freq
l_cohesion
r_cohesionr   r   r   r      s   ..zWordExtractor.cohesion_scorec                 C   s   | j |d| j|dfS r4   )r    r{   r!   )r.   r^   r   r   r   r      s   zWordExtractor.frequencyc                    s   dd }dd }dd } fdd}d	d
 }|||| j || j| jd | j | j}|||| j|| j| jd | j| j}|||}	| jdkrZ tkrNdnd}
td|
t	|	f  |	S )Nc                 S   s   | d d S )Nr   r   	extensionr   r   r   
parse_left      z7WordExtractor.all_branching_entropy.<locals>.parse_leftc                 S   s   | dd  S )NrD   r   r   r   r   r   parse_right   r   z8WordExtractor.all_branching_entropy.<locals>.parse_rightc                 S   s0   t dd }|  D ]}|t| | q
|S )Nc                   S      g S r   r   r   r   r   r   r5      r6   zMWordExtractor.all_branching_entropy.<locals>.sort_by_length.<locals>.<lambda>)r   keysrQ   append)countersorted_by_lengthr;   r   r   r   sort_by_length   s   z;WordExtractor.all_branching_entropy.<locals>.sort_by_lengthc                    s   t dd | D }i }td|D ]O}||g }	tdd }
|	D ]}|
| | | q"||d g }|D ]}|
| |dd | q8|
 D ]\}} fd	d
|D }|||< qLq|S )Nc                 s   s    | ]	\}}t |V  qd S r   rc   )r:   lengthr]   r   r   r   	<genexpr>   s    zQWordExtractor.all_branching_entropy.<locals>.get_entropy_table.<locals>.<genexpr>rG   c                   S   r   r   r   r   r   r   r   r5      r6   zPWordExtractor.all_branching_entropy.<locals>.get_entropy_table.<locals>.<lambda>rD     c                    s*   i | ]}|d |v r |n  |qS )r   r{   )r:   extr   	counter_ar   r   r>      s   * zRWordExtractor.all_branching_entropy.<locals>.get_entropy_table.<locals>.<dictcomp>)r   r@   rR   r{   r   r   replace)parser   sorted_by_length_a
max_lengthr   r   num_sumr   r_   r]   
extensionsr^   words_	root_wordextension_wordsextension_frequency	get_scorer   r   get_entropy_table   s   z>WordExtractor.all_branching_entropy.<locals>.get_entropy_tablec                    sB    fdd|   D }   D ]\}}|| v rqd|f||< q|S )Nc                    s"   i | ]\}}||  |d fqS )r   r   )r:   r^   vbe_rr   r   r>      s   " zFWordExtractor.all_branching_entropy.<locals>.merge.<locals>.<dictcomp>r   )r@   )be_lr   r   r^   r   r   r   r   merge   s
   
z2WordExtractor.all_branching_entropy.<locals>.mergerD   r   zbranching entropieszaccessor varietyz!all %s was computed # words = %d)
r!   r#   r   r    r"   r   r$   r   rW   rQ   )r.   r   r   r   r   r   r   r   r   r   
print_headr   r   r   ry      s   ((

z#WordExtractor.all_branching_entropyc                    s   t   fdd| j D }| fdd| j D   fdd| j D }| fdd| j D  |sBdnt|}|sJdnt|}||fS )Nc                    6   i | ]\}}t |d  kr|d d  kr||qS rD   Nrc   r9   r^   r_   r   r   r>         6 z3WordExtractor.branching_entropy.<locals>.<dictcomp>c                    r   rG   Nrc   r9   r   r   r   r>      r   c                    6   i | ]\}}t |d  kr|dd  kr||qS rD   Nr   rc   r9   r   r   r   r>      r   c                    r   rG   Nrc   r9   r   r   r   r>      r   r   )rQ   r!   r@   updater#   r    r"   r   )r.   r^   lsbrsbr   r   r   r   r   branching_entropy   s     zWordExtractor.branching_entropyc                 C   s   | j tdS )Nr   )ry   rQ   r=   r   r   r   rz      r   z"WordExtractor.all_accessor_varietyc                    s   t   fdd| j D }| fdd| j D   fdd| j D }| fdd| j D  |dkrDdnt |}|dkrNdnt |}||fS )Nc                    r   r   rc   r9   r   r   r   r>      r   z2WordExtractor.accessor_variety.<locals>.<dictcomp>c                    r   r   rc   r9   r   r   r   r>      r   c                    r   r   rc   r9   r   r   r   r>      r   c                    r   r   rc   r9   r   r   r   r>      r   Fr   )rQ   r!   r@   r   r#   r    r"   )r.   r^   r   r   av_lav_rr   r   r   accessor_variety   s     zWordExtractor.accessor_varietyc                    s:    fdd j  D }| fdd j D  |S )Nc                       h | ]}t | jkr|qS r   )rQ   r   r:   r^   r=   r   r   	<setcomp>   rE   z&WordExtractor.words.<locals>.<setcomp>c                    r   r   )rQ   r   r   r=   r   r   r      rE   )r    r   r   r!   )r.   r]   r   r=   r   r]      s   zWordExtractor.wordsc                 C   s   | j | j| j| j| j| j| j| j| j| j	| j
| jd}| j| j| j| jd}||d}t|d}t|| W d    d S 1 sBw   Y  d S )N)r   r   r   r0   r%   r&   r'   r(   r)   r*   r+   r,   )r    r!   aLaR)configurationdatawb)r   r   r   r$   r%   r&   r'   r(   r)   r*   r+   r,   r    r!   r"   r#   openpickledump)r.   fnamer   r   paramsr<   r   r   r   save   s0   "zWordExtractor.savec                 C   s   t |d}t|}W d    n1 sw   Y  |d }|d | _|d | _|d | _|d | _|d | _|d | _|d	 | _	|d
 | _
|d | _|d | _|d | _|d | _|d }|d | _|d | _|d | _|d | _~~~d S )Nrbr   r   r   r   r0   r%   r&   r'   r(   r)   r*   r+   r,   r   r    r!   r   r   )r   r   loadr   r   r   r$   r%   r&   r'   r(   r)   r*   r+   r,   r    r!   r"   r#   )r.   r   r<   r   r   r   r   r   r   r     s0   















zWordExtractor.load)Nr   r   r   r   r   r   r   r   r   r   r   r   F)r   Tr   )__name__
__module____qualname__r2   r-   rv   rf   rx   r   r   r   ry   r   rz   r   r]   r   r   r   r   r   r   r      s,    


<	(

r   )collectionsr   r   r   numpyr   r   rJ   soynlp.utilsr   r   r   r   r   r   r   r   <module>   s   
