o
    Qi?                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ e jj	e j
e jee jjdd Zdd Zdd Zd	d
 Zdd ZdddZG dd dZG dd dZG dd dZdS )    N)defaultdict)pairwise_distancesc                  C   s   t  } d| j | j S )z(It returns remained memory as percentaged   )psutilvirtual_memory	availabletotal)mem r   F/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/utils/utils.pyget_available_memory   s   r   c                  C   s   t t } |  jd S )z.It returns the memory usage of current processi   @)r   Processosgetpidmemory_inforss)processr   r   r   get_process_memory   s   r   c                 C   sL   t j| }|r |dkr"t j|s$t | td| d S d S d S d S )N.z
created {})r   pathdirnameexistsmakedirsprintformat)filepathr   r   r   r   
check_dirs   s
   
r   c                 C   sV  t jdd dkr-t| }dd |D }dd |D }W d    n1 s'w   Y  n$t| dd}d	d |D }d
d |D }W d    n1 sLw   Y  t jdd dkrt| d}t|D ]
}|d| qeW d    d S 1 s{w   Y  d S t| ddd}t|D ]
}|d| qW d    d S 1 sw   Y  d S )Nr   r   2c                 S      g | ]}|  qS r   strip.0docr   r   r   
<listcomp>"       z$sort_by_alphabet.<locals>.<listcomp>c                 S      g | ]}|r|qS r   r   r"   r   r   r   r%   #   r&   utf-8encodingc                 S   r   r   r    r"   r   r   r   r%   &   r&   c                 S   r'   r   r   r"   r   r   r   r%   '   r&   wz{}
)sysversionsplitopensortedwriter   )r   fdocsr$   r   r   r   sort_by_alphabet   s*   
""r4   
   c                    st   | | ddkrg S | dd}t||ddd    }|dkr-|d|d  } fdd|D }|S )	aM  
    :param query: str
        String type query word
    :param vector: numpy.ndarray or scipy.sparse.matrix
        Vector representation of row
    :param item_to_idx: dict
        Mapper from str type item to int type index
    :param idx_to_item: list
        Mapper from int type index to str type item
    :param topk: int
        Maximum number of similar items.
        If set top as negative value, it returns similarity with all words

    Returns
    ----------
    similars : list of tuple
        List contains tuples (item, cosine similarity)
        Its length is topk
    r      cosine)metricr   Nc                    s(   g | ]}|kr| d  |  fqS r6   r   )r#   idxdistidx_to_itemqr   r   r%   N   s   ( z most_similar.<locals>.<listcomp>)getreshaper   argsort)queryvectoritem_to_idxr=   topkqvecsim_idxssimilarsr   r;   r   most_similar1   s   rI   c                   @   s.   e Zd ZdddZdd Zdd	 Zd
d ZdS )DoublespaceLineCorpusr   Fr   c                 C   sJ   || _ d| _d| _|| _|| _|dks|dkr#| ||\| _| _d S d S Nr   )corpus_fnamenum_docnum_sent	iter_sentskip_header_check_length)selfrL   rM   rN   rO   rP   r   r   r   __init__R   s   zDoublespaceLineCorpus.__init__c           
   
   C   s0  d}z~zqt jdd dkrt| j}nt| jdd}t| jD ]}t| q t|D ]C\}}|dkrC||krC||f  W |	  W S |d}dd |D }|t
|7 }|dkrn||krn|d	 t||f  W |	  W S q+W |	  n|	  w |d	 |fW S  ty }	 zt|	 W Y d }	~	d
S d }	~	ww )Nr   r   r   r(   r)     c                 S   s   g | ]}|  r|qS r   r    )r#   sentr   r   r   r%   n   s    z7DoublespaceLineCorpus._check_length.<locals>.<listcomp>r6   )r   r   )r,   r-   r.   r/   rL   rangerP   next	enumeratecloselenmin	Exceptionr   )
rR   rM   rN   	num_sent_r2   _doc_idxr$   sentser   r   r   rQ   [   s6   


z#DoublespaceLineCorpus._check_lengthc           	   
   c   s:   zz~t jdd dkrt| j}nt| jdd}t| jD ]}t| qd\}}t|D ]B\}}|r6 nC| j	sM|
 V  | jdkrL|d | jkrLd}q.|d	D ]}| jdkrb|| jkrbd} n|
 }|ro|V  |d7 }qRq.W |  W d S W |  W d S |  w  ty } zt| W Y d }~d S d }~ww )
Nr   r   r   r(   r)   )r   Fr6   TrT   )r,   r-   r.   r/   rL   rV   rP   rW   rX   rO   r!   rM   rN   rY   r\   r   )	rR   r2   r^   rN   stopr_   r$   rU   ra   r   r   r   __iter__{   sD   

zDoublespaceLineCorpus.__iter__c                 C   s0   | j dkr| dd\| _ | _| jr| jS | j S )Nr   r   )rM   rQ   rN   rO   rR   r   r   r   __len__   s   
zDoublespaceLineCorpus.__len__N)r   r   Fr   )__name__
__module____qualname__rS   rQ   rc   re   r   r   r   r   rJ   Q   s
    
	 (rJ   c                   @   s   e Zd Z		d)ddZdd	 Zd
d Zdd Zdd Zedd Z	e	j
dd Z	edd Zedd Zd*ddZdd Zdd Zdd Zd+d!d"Zd+d#d$Zd%d& Zd'd( ZdS ),EojeolCounterNr6      r   Fc                 C   sb   || _ || _|| _|| _d| _|d u rdd }|| _|d ur%| || _ni | _d| _| 	  d S )N        c                 S   s   | S Nr   xr   r   r   <lambda>   s    z(EojeolCounter.__init__.<locals>.<lambda>r   )
	min_count
max_lengthfiltering_checkpointverbose	_coverage
preprocess_counting_from_sents_counter
_count_sum_set_count_sum)rR   r`   rp   rq   rr   rs   ru   r   r   r   rS      s   zEojeolCounter.__init__c                 C      | j |dS rK   rw   r?   rR   eojeolr   r   r   __getitem__      zEojeolCounter.__getitem__c                 C   
   t | jS rl   rZ   rw   rd   r   r   r   re         
zEojeolCounter.__len__c                 C   s   t | j | _d S rl   )sumrw   valuesrx   rd   r   r   r   ry      s   zEojeolCounter._set_count_sumc                    s  i }t |D ]a\}} |} jdkr/ jdkr/|dkr/| j dkr/ fdd| D }| D ]}|r>t| jkr?q3||dd ||< q3 j	rg|d dkrgt
dt||d dt  d	d
dd q fdd| D } j	rt
dt||d dt  d	d
d |S )Nr6   r   c                        i | ]\}}| j kr||qS r   rp   r#   kvrd   r   r   
<dictcomp>   s    
z6EojeolCounter._counting_from_sents.<locals>.<dictcomp>i i z9[EojeolCounter] n eojeol = {} from {} sents. mem={} Gb{}z%.3fz                    T )flushendc                    r   r   r   r   rd   r   r   r      s     )r   )rX   ru   rp   rr   itemsr.   rZ   rq   r?   rs   r   r   r   )rR   r`   rw   i_sentrU   r}   r   rd   r   rv      s6   


z"EojeolCounter._counting_from_sentsc                 C   s   | j S rl   )rt   rd   r   r   r   coverage   s   zEojeolCounter.coveragec                 C   s.   d|  krdkst d t d|| _d S )Nr   r6   zcoverage should be in [0, 1])
ValueErrorrt   )rR   valuer   r   r   r      s
   
c                 C   r   rl   r   rd   r   r   r   num_of_unique_uncovered_eojeols   s   
z-EojeolCounter.num_of_unique_uncovered_eojeolsc                 C   s   t | j S rl   )r   rw   r   rd   r   r   r   num_of_uncovered_eojeols   s   z&EojeolCounter.num_of_uncovered_eojeolsc                    s    fdd| j  D S )Nc                    s   i | ]\}}| kr||qS r   r   r   r   r   r   r          z7EojeolCounter.get_uncovered_eojeols.<locals>.<dictcomp>rw   r   )rR   rp   r   r   r   get_uncovered_eojeols   s   z#EojeolCounter.get_uncovered_eojeolsc                    s0    fdd| j  D | _ d| j| j  | _d S )Nc                    s   i | ]\}}| vr||qS r   r   r   eojeolsr   r   r      r   z8EojeolCounter.remove_covered_eojeols.<locals>.<dictcomp>r6   )rw   r   r   rx   r   )rR   r   r   r   r   remove_covered_eojeols   s   z$EojeolCounter.remove_covered_eojeolsc                 C   rz   rK   r{   r|   r   r   r   get_eojeol_count   r   zEojeolCounter.get_eojeol_countc                 C   s
   | j  S rl   r   rd   r   r   r   r      r   zEojeolCounter.itemsr5   	   c                 C   s   |  | j||S rl   )_to_lrgraphrw   )rR   l_max_lengthr_max_lengthignore_one_syllabler   r   r   
to_lrgraph  s   zEojeolCounter.to_lrgraphc                 C   s   t dd }| D ]:\}}|rt|dkrq
tdt|t|d D ] }|d | ||d  }	}
t|
|kr9q#||	 |
  |7  < q#q
dd | D }t|||d}|S )Nc                   S      t tS rl   r   intr   r   r   r   ro         z+EojeolCounter._to_lrgraph.<locals>.<lambda>r6   c                 S      i | ]	\}}|t |qS r   dictr#   lrdictr   r   r   r         z-EojeolCounter._to_lrgraph.<locals>.<dictcomp>)lrgraphr   r   )r   r   rZ   rV   r[   LRGraph)rR   counterr   r   r   _lrgraphr}   countra   r   rr   r   r   r   r     s   zEojeolCounter._to_lrgraphc                 C   s   t j|}|rt j|st | t|ddd"}t| j dd dD ]\}}|	d
|| q&W d    d S 1 s?w   Y  d S )Nr+   r(   r)   c                 S   s   | d  | d fS )Nr6   r   r   rm   r   r   r   ro     s    z$EojeolCounter.save.<locals>.<lambda>keyz{} {}
)r   r   r   r   r   r/   r0   rw   r   r1   r   )rR   r   r   r2   r}   r   r   r   r   save  s   
"zEojeolCounter.savec                 C   sp   d| _ i | _t|dd}|D ]}| \}}t|| j|< qW d    n1 s)w   Y  t| j | _d S )Nrk   r(   r)   )rt   rw   r/   r.   r   r   r   rx   )rR   r   r2   linewordr   r   r   r   load  s   zEojeolCounter.load)Nr6   rj   r   FN)r   )r5   r   F)rf   rg   rh   rS   r~   re   ry   rv   propertyr   setterr   r   r   r   r   r   r   r   r   r   r   r   r   r   ri      s0    







ri   c                   @   s   e Zd Zd$ddZdd Zdd	 Zd
d Zd%ddZd%ddZd%ddZ	d%ddZ
d&ddZd&ddZdd Zdd Zd'ddZd d! Zd"d# ZdS )(r   Nr5   r   c                 C   s   |dkr
t |tksJ |dkrt |tksJ || _|| _|r+|r&td| |}|r7| |\| _| _ni i | _| _dd | j	 D | _
d S )Nr6   r   zBInserted lrgraph will be ignored. Insert only one (lrgraph, sents)c                 S   $   i | ]\}}|d d |  D qS )c                 S      i | ]\}}||qS r   r   r#   r   cr   r   r   r   8      z/LRGraph.__init__.<locals>.<dictcomp>.<dictcomp>r   r   r   r   r   r   8      z$LRGraph.__init__.<locals>.<dictcomp>)typer   r   r   r   _construct_graph_check_lrgraph_lr_rlr   
_lr_origin)rR   r   r`   r   r   r   r   r   rS   &  s    
zLRGraph.__init__c                 C   s   t dd }|D ]<}| D ]5}| }tdtt|| jd D ]!}|d | ||d  }}t|| jkr8q!|| |  |7  < q!qqdd | D }|S )Nc                   S   r   rl   r   r   r   r   r   ro   <  r   z*LRGraph._construct_graph.<locals>.<lambda>r6   c                 S   r   r   r   r   r   r   r   r   E  r   z,LRGraph._construct_graph.<locals>.<dictcomp>)	r   r.   r!   rV   r[   rZ   r   r   r   )rR   r`   r   rU   r   ra   r   r   r   r   r   r   ;  s   zLRGraph._construct_graphc                 C   s   t |turzt|}W n   tdt |t t| d }|tkr1dd | D }n|tks<td|tdd }| D ]\}}| D ]\}}|sUqN|| |  |7  < qNqFdd | D }||fS )	Nz+lrgraph type should be dict of dict, not {}r   c                 S   r   r   r   r   r   r   r   r   Q  r   z*LRGraph._check_lrgraph.<locals>.<dictcomp>z(nested value type should be dict, not {}c                   S   r   rl   r   r   r   r   r   ro   U  r   z(LRGraph._check_lrgraph.<locals>.<lambda>c                 S   r   r   r   )r#   r   ldictr   r   r   r   [  r   )r   r   r   r   listr   r   r   )rR   r   nested_dict_typerlgraphr   r   r   r   r   r   r   r   H  s.   zLRGraph._check_lrgraphc                 C   s0   | j sd S | dd | j  D \| _| _d S )Nc                 S   r   )c                 S   r   r   r   r   r   r   r   r   c  r   z4LRGraph.reset_lrgraph.<locals>.<dictcomp>.<dictcomp>r   r   r   r   r   r   c  r   z)LRGraph.reset_lrgraph.<locals>.<dictcomp>)r   r   r   r   r   rd   r   r   r   reset_lrgraph^  s   zLRGraph.reset_lrgraphr6   c                 C   s8   | j | |  |7  < |r| j| |  |7  < d S d S rl   )r   r   )rR   r   r   r   r   r   r   add_lr_pairg  s   zLRGraph.add_lr_pairc                 C   D   t dt|d D ]}|d | ||d  }}| ||| q	d S Nr6   )rV   rZ   r   rR   r}   r   ir   r   r   r   r   
add_eojeoll     zLRGraph.add_eojeolc                 C   s   || j v r-| j | }||v r-||  |8  < || dkr-|| t|dkr-| j | || jv r\| j| }||v r^||  |8  < || dkr`|| t|dkrb| j| d S d S d S d S d S rK   )r   poprZ   r   )rR   r   r   r   r   r   r   r   r   remove_lr_pairq  s(   





zLRGraph.remove_lr_pairc                 C   r   r   )rV   rZ   r   r   r   r   r   remove_eojeol  r   zLRGraph.remove_eojeolc                 C   6   t | j|i  dd d}|dkr|d | }|S )Nc                 S   
   | d  S r   r   rm   r   r   r   ro        
 zLRGraph.get_r.<locals>.<lambda>r   r   )r0   r   r?   r   )rR   r   rE   rlistr   r   r   get_r     zLRGraph.get_rc                 C   r   )Nc                 S   r   r   r   rm   r   r   r   ro     r   zLRGraph.get_l.<locals>.<lambda>r   r   )r0   r   r?   r   )rR   r   rE   llistr   r   r   get_l  r   zLRGraph.get_lc                 C   s
   d| _ dS )zeRemove self._lr_origin. Be careful.
        When you excute freeze, you cannot reset_lrgraph anynore.N)r   rd   r   r   r   freeze  s   
zLRGraph.freezec                 C   s6   t | j| jd}| dd | j D \|_|_|S )z8It returns original LRGraph which has no self._lr_origin)r   r   c                 S   r   )c                 S   r   r   r   r   r   r   r   r     r   zFLRGraph.copy_compatified_lrgraph_origin.<locals>.<dictcomp>.<dictcomp>r   r   r   r   r   r     r   z;LRGraph.copy_compatified_lrgraph_origin.<locals>.<dictcomp>)r   r   r   r   r   r   r   r   )rR   lr_graphr   r   r   copy_compatified_lrgraph_origin  s   z'LRGraph.copy_compatified_lrgraph_originFc           	      C   sd   |r| j n| j}i }| D ]\}}| D ]
\}}|||| < qqtd }||_t| |_|S rl   )r   r   r   ri   rw   r   r   rx   )	rR   r   lrr   r   r   r   r   eojeol_counterr   r   r   to_EojeolCounter  s   zLRGraph.to_EojeolCounterc              
   C   s   t j|}|rt j|st | t|ddd*}t| j D ]\}}t| D ]\}}|	d
||| q,q"W d    d S 1 sGw   Y  d S )Nr+   r(   r)   z	{} {} {}
)r   r   r   r   r   r/   r0   r   r   r1   r   )rR   r   r   r2   r   r   r   r   r   r   r   r     s   
"zLRGraph.savec                 C   s   i | _ t|ddU}d}i }|D ]?}| }|d |ks%|r%|| j |< i }|d }t|dkr8t|d |d< qt|dkrIt|d ||d < qtd	||rW|| j |< W d    n1 saw   Y  | d
d | j  D \| _	| _
d S )Nr(   r)   r   r      r      r6   zWrong lr-graph format: {}c                 S   r   )c                 S   r   r   r   r   r   r   r   r     r   z+LRGraph.load.<locals>.<dictcomp>.<dictcomp>r   r   r   r   r   r     r   z LRGraph.load.<locals>.<dictcomp>)r   r/   r.   rZ   r   r   r   r   r   r   r   )rR   r   r2   r   r   r   sepr   r   r   r     s2   

zLRGraph.load)NNr5   r   r9   r5   )F)rf   rg   rh   rS   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   $  s     

	





	r   r   )r   r   r,   collectionsr   sklearn.metricsr   r   r   joinr   realpath__file__r.   installpathr   r   r   r4   rI   rJ   ri   r   r   r   r   r   <module>   s    $
 W|