o
    Qi)                     @   s`   d dl mZmZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 eddZG dd	 d	ZdS )
    )defaultdict
namedtupleN)normalize_sent_for_lrgraph)WordExtractor)LRGraphNounScore_v1zfrequency score known_r_ratioc                   @   s   e Zd Z			d%ddZd	d
 Z		d&ddZd'ddZd'ddZdd Zd&ddZ	dd Z
dd Zd(ddZd)ddZdd  Zd!d" Zd#d$ ZdS )*LRNounExtractor
      NT   Fc           
      C   s   i | _ || _|| _|| _d | _d | _i | _|| _|| _|s@dd l	}d
|jtdddd d }d| g}|r@td |D ]}	|rQtd|	dd   | |	 qB|rdtd	t| j   d S d S )
Nr   /\z'%s/trained_models/noun_predictor_sejongzE[Noun Extractor] used default noun predictor; Sejong corpus predictorz[Noun Extractor] used %sz-[Noun Extractor] All %d r features was loaded)coefficientverbosemax_left_lengthmax_right_lengthlrgraphwords_substring_countermin_num_of_featuresensure_normalizedosjoinpathabspath__file__replacesplitprint_load_predictorlen)
selfr   r   predictor_fnamesr   r   r   r   	directoryfname r'   J/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/noun/_noun_ver1.py__init__   s,   (
zLRNounExtractor.__init__c              
   C   s  zrt jjdkrt|}nt|dd}z[z.t|D ]'\}}| d\}}t|}|| jv r:t	| j| || j|< q|| j|< qW n t
y] } ztd|d | W Y d }~nd }~ww W |  W d S W |  W d S |  w  t
y } zt| W Y d }~d S d }~ww )N   zutf-8)encoding	z5[Noun Extractor] predictor parsing error line {} = {}r   )sysversion_infomajoropen	enumeratestripr   floatr   max	Exceptionr    formatclose)r#   r&   fnum_linelinerscoreer'   r'   r(   r!   *   s0   

 
zLRNounExtractor._load_predictor      ?   c                 C   s   |  || | |||S N)trainextract)r#   sentsmin_noun_scoremin_noun_frequencynoun_candidatesr'   r'   r(   train_extract?   s   zLRNounExtractor.train_extractc                 C   s2   |  ||\}}| |||}t|| _|| _d S r@   )_scan_vocabulary_build_lrgraphr   r   r   )r#   rC   rE   	wordset_l	wordset_rr   r'   r'   r(   rA   E   s   

zLRNounExtractor.trainc           
         s`  t dd }t dd }t|D ]i\}}| jst|}|dD ]<}|s%q t|}tdt| j|d D ]}||d|   d7  < q4tdt| j	|D ]}||| d   d7  < qLq | j
ry|d dkryd	|d t|}	td
|	dd q fdd| D | _t| j } fdd| D }| j
rdt|t|f }	td|	 ||fS )z
        Parameters
        ----------
            sents: list-like iterable object which has string
            
        It computes subtoken frequency first. 
        After then, it builds lr-graph with sub-tokens appeared at least min count
        c                   S      dS Nr   r'   r'   r'   r'   r(   <lambda>U       z2LRNounExtractor._scan_vocabulary.<locals>.<lambda>c                   S   rL   rM   r'   r'   r'   r'   r(   rN   V   rO    r   N    zscanning {} / {} sents[Noun Extractor] {} endc                    s   i | ]\}}| kr||qS r'   r'   .0wr8   min_frequencyr'   r(   
<dictcomp>g   s    z4LRNounExtractor._scan_vocabulary.<locals>.<dictcomp>c                    s   h | ]
\}}| kr|qS r'   r'   rW   rZ   r'   r(   	<setcomp>i   s    z3LRNounExtractor._scan_vocabulary.<locals>.<setcomp>z(L,R) has (%d, %d) tokensz&[Noun Extractor] scanning was done {})r   r1   r   r   r   r"   rangeminr   r   r   r6   r    itemsr   setkeys)
r#   rC   r[   rJ   rK   isenttoken	token_lenmessager'   rZ   r(   rH   K   s2   
z LRNounExtractor._scan_vocabularyc                 C   s  t dd }t|D ]i\}}| jst|}| D ]=}|sqt|}tdt| j|d D ](}|d | }	||d  }
|	|vr@q-t|
dkrK|
|vrKq-||	 |
  d7  < q-q| j	rs|d dkrsd
|d t|}td
|d	d
 q
| j	r~td
d dd | D }|S )Nc                   S   s   t dd S )Nc                   S   rL   rM   r'   r'   r'   r'   r(   rN   r   rO   zBLRNounExtractor._build_lrgraph.<locals>.<lambda>.<locals>.<lambda>)r   r'   r'   r'   r(   rN   r   s    z0LRNounExtractor._build_lrgraph.<locals>.<lambda>r   r   rQ   rR   z%building L-R graph from {} / {} sentsrS   rT   rU   z-[Noun Extractor] building L-R graph was donez                    c                 S   s$   i | ]\}}|d d |  D qS )c                 S   s   i | ]\}}||qS r'   r'   )rX   r;   r8   r'   r'   r(   r\      s    z=LRNounExtractor._build_lrgraph.<locals>.<dictcomp>.<dictcomp>)r`   )rX   lrdictr'   r'   r(   r\      s   $ z2LRNounExtractor._build_lrgraph.<locals>.<dictcomp>)r   r1   r   r   r   r"   r^   r_   r   r   r6   r    r`   )r#   rC   rJ   rK   r   rc   rd   re   nrh   r;   rg   r'   r'   r(   rI   q   s2   	zLRNounExtractor._build_lrgraphc                 C   s   |s| j }i }t|dd dD ]}t|dkrq| ||}|d |k r%q|||< q| |||}| |}| jrBtdt| |S )Nc                 S   s   t | S r@   r"   )rY   r'   r'   r(   rN      s    z)LRNounExtractor.extract.<locals>.<lambda>keyr   r   z'[Noun Extractor] {} nouns are extracted)	r   sortedr"   predict_postprocess_to_NounScorer   r    r6   )r#   rD   rE   rF   nounswordr<   nouns_r'   r'   r(   rB      s   

zLRNounExtractor.extractc                 C   s    | j |d}dd |D }|S )Nr   c                 S   s   g | ]}|d  r|qS )r   r'   )rX   featurer'   r'   r(   
<listcomp>   s    z3LRNounExtractor._get_r_features.<locals>.<listcomp>)r   get_r)r#   rs   featuresr'   r'   r(   _get_r_features   s   zLRNounExtractor._get_r_featuresc           
      C   s   i }t dt|D ]>}|d | }||d  }||v r/||v r/|| }|| }	t||	||< q	||v rG| j|d|krG| j|ddf||< q	|sLdS t| dd dd d S )Nr           r   )rz   r   c                 S   s   | d d  S )Nr   r   r'   xr'   r'   r(   rN      s    z4LRNounExtractor._get_subword_score.<locals>.<lambda>rl   )r^   r"   r4   r   getrn   r`   )
r#   rs   rD   rr   subword_scoresr=   subwordsuffixscore1score2r'   r'   r(   _get_subword_score   s   z"LRNounExtractor._get_subword_scorec                 C   s   |  |d |kS rM   )ro   )r#   rs   rD   r'   r'   r(   is_noun   s   zLRNounExtractor.is_nounc                 C   sF   |  |}t|| jkr| ||}|S |du ri }| |||}|S )z,Returns (noun_score, known_r_ratio)
        N)ry   r"   r   _predictr   )r#   rs   rD   rr   rx   r<   r'   r'   r(   ro      s   
zLRNounExtractor.predictc           	         s    fdd}	 d}d}d}|D ] \}}| j v r+|||s*|| j |  7 }||7 }q||7 }q|dkr6dn|| || dkrBdfS |||  fS )Nc                    s>   t t| d ddD ]}| |d  | }| jv r dS q
dS )Nr   r   TF)r^   r"   r   )rs   r;   r=   r   r#   r'   r(   exist_longer_r_feature   s   
z8LRNounExtractor._predict.<locals>.exist_longer_r_featurer   )r   )	r#   rx   rs   r   r<   normunknownr;   freqr'   r   r(   r      s"   



zLRNounExtractor._predictc                    s   fdd}fdd  fdd}t  D ]C}|d dks'|d d	kr-| qt|}|d
ks9||r:qtd
t|D ]}|d | }	||d  }
||	|
r[|  nqAqfdd D }|S )Nc                    s   | v oj |d kS )Nrz   )r   r}   )rh   r;   )rD   rr   r#   r'   r(   is_Noun_Josa   s   z2LRNounExtractor._postprocess.<locals>.is_Noun_Josac                    sL    j | d d}t| }|r|dkrdS t j | d| d|d  S )Nr   r   )r   r}   r"   mathpow)rs   baserj   r   r'   r(   cohesion   s
   "z.LRNounExtractor._postprocess.<locals>.cohesionc                    s    |  | d d kS )Nr   r'   )rs   )r   r'   r(   longer_has_larger_cohesion   s   z@LRNounExtractor._postprocess.<locals>.longer_has_larger_cohesionr   .,r*   c                    s"   i | ]\}}| v d kr||qS )Fr'   )rX   rs   r<   )removalsr'   r(   r\     s   " z0LRNounExtractor._postprocess.<locals>.<dictcomp>)ra   addr"   r^   r`   )r#   rr   rD   rE   r   r   rs   rj   r=   rh   r;   rt   r'   )r   rD   rr   r   r#   r(   rp      s*   


zLRNounExtractor._postprocessc           	      C   s   i }t |dd dD ]$}| j|d}tdd |D ||< |D ]\}}| j|| | q q
| j  i }| D ]\}}t|| |d |d ||< q:|S )	Nc                 S   s
   t |  S r@   rk   r{   r'   r'   r(   rN     s   
 z/LRNounExtractor._to_NounScore.<locals>.<lambda>rl   r   c                 s   s    | ]\}}|V  qd S r@   r'   )rX   rY   cr'   r'   r(   	<genexpr>  s    z0LRNounExtractor._to_NounScore.<locals>.<genexpr>r   r   )rn   r   rw   sumremove_eojeolreset_lrgraphr`   r   )	r#   rr   noun_frequenciesrs   r_countr;   countrt   r<   r'   r'   r(   rq     s   
zLRNounExtractor._to_NounScore)r	   r
   NTr   F)r>   r?   N)r?   )r>   )r>   N)__name__
__module____qualname__r)   r!   rG   rA   rH   rI   rB   ry   r   r   ro   r   rp   rq   r'   r'   r'   r(   r      s(    



&


 r   )collectionsr   r   r   r-   soynlp.normalizerr   soynlp.wordr   soynlp.utilsr   r   r   r'   r'   r'   r(   <module>   s   
