o
    Qi"                     @   s2   d dl mZ d dlZd dlmZ G dd dZdS )    defaultdictN)get_process_memoryc                   @   sJ   e Zd ZdddZdddZd	d
 Zdd Zdd Zdd ZdddZ	dS )EojeolPatternTrainer
      Tc                 C   s4   || _ || _|| _|| _d | _d | _d | _d | _d S N)max_left_lengthmax_right_lengthmin_frequencyverboselrgraphrlgraph	wordset_l	wordset_r)selfr	   r
   r   r    r   W/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/tokenizer/_tokenizer_builder.py__init__
   s   
zEojeolPatternTrainer.__init__Nc                 C   s0   |r|s|  |\}}| |||\| _| _d S r   )_scan_vocabulary_build_graphr   r   )r   sentsr   r   r   r   r   train   s   zEojeolPatternTrainer.trainc           
         sv  t t|d }tdd }tdd }t|D ]s\}}|dD ]<}|s&q!t|}tdt j|d D ]}||d|   d7  < q5tdt j|D ]}||| d   d7  < qMq! j	r|| dkrd	t ||  d
dt ||   d| t| dt
 f}	tjd|	  q fdd| D } fdd| D } j	rtd tdt|t|t
 f  ||fS )z
        Parameters
        ----------
            sents: list-like iterable object which has string
            
        It computes subtoken frequency first. 
        After then, it builds lr-graph with sub-tokens appeared at least min count
        (   c                   S      dS Nr   r   r   r   r   r   <lambda>&       z7EojeolPatternTrainer._scan_vocabulary.<locals>.<lambda>c                   S   r   r   r   r   r   r   r   r   '   r       Nr   #-      Y@%z!scanning: %s%s (%.3f %s) %.3f Gbc                       h | ]\}}| j kr|qS r   r   .0wfr   r   r   	<setcomp>6       z8EojeolPatternTrainer._scan_vocabulary.<locals>.<setcomp>c                    r$   r   r%   r&   r*   r   r   r+   7   r,   zscanning completedz+(L,R) has (%d, %d) tokens. memory = %.3f Gb)intlenr   	enumeratesplitrangeminr	   r
   r   r   sysstdoutwriteitemsprint)
r   r   _ckptr   r   isenttoken	token_lenargsr   r*   r   r      s.   
8z%EojeolPatternTrainer._scan_vocabularyc                 C   sz  || _ || _| jd tt|d }tdd }tdd }t|D ]v\}}| D ]@}	|	s1q,t|	}
tdt	| j
|
d D ]+}|	d | }|	|d  }||vsV||vrWq@|| |  d7  < || |  d7  < q@q,| jr|| dkrdt||  d	dt||   d
| t| dt f}tjd|  q$| jrtjdt   dd | D }dd | D }||fS )N r   c                   S      t dd S )Nc                   S   r   r   r   r   r   r   r   r   C   r   EEojeolPatternTrainer._build_graph.<locals>.<lambda>.<locals>.<lambda>r   r   r   r   r   r   C       z3EojeolPatternTrainer._build_graph.<locals>.<lambda>c                   S   r?   )Nc                   S   r   r   r   r   r   r   r   r   D   r   r@   r   r   r   r   r   r   D   rA   r   r   r    r!   r"   r#   z4building lr-graph: %s%s (%.3f %s), memory = %.3f Gbz.building lr-graph completed. memory = %.3f Gbc                 S   $   i | ]\}}|d d |  D qS )c                 S      i | ]\}}||qS r   r   r'   rr)   r   r   r   
<dictcomp>Y       @EojeolPatternTrainer._build_graph.<locals>.<dictcomp>.<dictcomp>r6   r'   lrdictr   r   r   rF   Y      $ z5EojeolPatternTrainer._build_graph.<locals>.<dictcomp>c                 S   rB   )c                 S   rC   r   r   r'   rK   r)   r   r   r   rF   Z   rG   rH   rI   r'   rE   ldictr   r   r   rF   Z   rM   )r   r   addr-   r.   r   r/   r0   r1   r2   r	   r   r   r3   r4   r5   r6   )r   r   r   r   r8   r   r   r9   r:   r;   r<   rK   rE   r=   r   r   r   r   >   s8   8z!EojeolPatternTrainer._build_graphc              	   C   s.  t |ddd}|d| j| j| j| jrdndf  |d | j D ]*\}}|d|t|	 f  t
| d	d
 ddD ]\}}|d||f  qAq%|d | j D ]*\}}|d|t|	 f  t
| dd
 ddD ]\}}|d||f  qvqZW d    d S 1 sw   Y  d S )Nr(   utf-8encodingz%d %d %d %d
r   r   z
# lrgraph
z
> %s (%d)
c                 S      | d S Nr   r   xr   r   r   r   c       z+EojeolPatternTrainer.save.<locals>.<lambda>T)keyreversez  - %s: %d
z
# rlgraph
c                 S   rU   rV   r   rW   r   r   r   r   h   rY   )openr5   r	   r
   r   r   r   r6   sumvaluessortedr   )r   fnamer)   rK   rL   rE   freqrP   r   r   r   save]   s    &

"zEojeolPatternTrainer.savec                 C   sl  t |dd$}t| }| }z3dd |D }t|dkr(tdt| |d | _|d | _|d	 | _	|d
 dkrB| j
dknd W n tyY } ztdt| d }~ww tdd }tdd }t| }|dksvtd| d }	|D ]B}
|
d d }
|
sqz|
dkr n2|
d d	 dkr|
d	|
d  }	qz|
d d dkr|
dd  d\}}t|}|||	 |< qzt| | _|D ]=}
|
d d }
|
sq|
d d	 dkr|
d	|
d  }	q|
d d dkr|
dd  d\}}t|}|||	 |< qt| | _dd | D }dd | D }|| _|| _W d    d S 1 s/w   Y  d S )NrR   rS   c                 S   s   g | ]}t |qS r   )r-   )r'   ar   r   r   
<listcomp>p   s    z-EojeolPatternTrainer.load.<locals>.<listcomp>   z'first line should be parameter info, %sr   r         TFc                   S   r?   )Nc                   S   r   r   r   r   r   r   r   r   z   r   =EojeolPatternTrainer.load.<locals>.<lambda>.<locals>.<lambda>r   r   r   r   r   r   z   rA   z+EojeolPatternTrainer.load.<locals>.<lambda>c                   S   r?   )Nc                   S   r   r   r   r   r   r   r   r   {   r   rh   r   r   r   r   r   r   {   rA   z	# lrgraphzCannot find lrgraph data, %sz	# rlgraphz> (z  - z: c                 S   rB   )c                 S   rC   r   r   rD   r   r   r   rF      rG   8EojeolPatternTrainer.load.<locals>.<dictcomp>.<dictcomp>rI   rJ   r   r   r   rF      rM   z-EojeolPatternTrainer.load.<locals>.<dictcomp>c                 S   rB   )c                 S   rC   r   r   rN   r   r   r   rF      rG   rk   rI   rO   r   r   r   rF      rM   )r\   nextstripr0   r.   
ValueErrorstrr	   r
   r   r   	Exceptionr   rindexr-   setkeysr   r   r6   r   r   )r   r`   r)   paramr=   er   r   	load_typekey1rowkey2ra   r   r   r   loadk   sh   


$zEojeolPatternTrainer.load'  ?-C6?c                    s  dd }| d krt jt j} }|t|  fdd|  D |t| fdd| D t|D ]}i  |  D ]\}}	tfdd|	 D }
|
 |< q=| || i | D ]\}}|dkrhq_tfd	d| D }||< q_|||t jrt	j
d
|d |f  t fdd D }|tfdd D 7 } ||| k rt jrtd|d    nq5t jrtd|d   fS )Nc                    sD   || t |    d| | t|   fdd|  D }|S )Nr   c                    s&   i | ]\}}|d kr| |  qS r>   r   )r'   wordrankfactorrestartr   r   rF      s   & zFEojeolPatternTrainer.train_hits.<locals>.normalize.<locals>.<dictcomp>)r]   r^   r.   r6   )gsum_of_rankdfg_r   r   r   	normalize   s   z2EojeolPatternTrainer.train_hits.<locals>.normalizec                    s   i | ]}| qS r   r   )r'   rK   r   r   r   rF      s    z3EojeolPatternTrainer.train_hits.<locals>.<dictcomp>c                    s   i | ]	}|d kr| qS r~   r   )r'   rE   r   r   r   rF      s    c                    s$   h | ]\}}|d kr| |  qS r~   r   )r'   rE   ra   )rank_rr   r   r+      rM   z2EojeolPatternTrainer.train_hits.<locals>.<setcomp>r>   c                    s   h | ]
\}}| |  qS r   r   )r'   rK   ra   )rank_lr   r   r+      s    ztrain hits ... %d in %dr   c                        g | ]\}}t | |  qS r   absr'   r(   r   )next_rank_lr   r   rd           z3EojeolPatternTrainer.train_hits.<locals>.<listcomp>c                    r   r   r   r   )next_rank_rr   r   rd      r   z$graph was converged at %d iterationz%computation was done at %d iteration)r   r   r   r.   rs   r1   r6   r]   r   r3   r4   r5   r7   )r   r   r   decaying_factormax_iter	tolerancer   n_iterrK   rL   	sum_rrankrE   rP   	sum_lrankdiffr   )r   r   r   r   r   r   
train_hits   sB   


zEojeolPatternTrainer.train_hits)r   r   r   T)NN)NNr{   r|   r   r}   )
__name__
__module____qualname__r   r   r   r   rb   rz   r   r   r   r   r   r      s    


$6r   )collectionsr   r3   soynlp.utilsr   r   r   r   r   r   <module>   s   