o
    QiUJ                     @   s0   d dl mZ eddZd dlZG dd dZdS )    )
namedtupleNewsNounScorezjscore frequency feature_proportion eojeol_proportion n_positive_feature unique_positive_feature_proportionNc                   @   s   e Zd Z		d:ddZdd Zd	d
 Z		d;ddZdd Zdd Z		d<ddZ	d=ddZ
		d>ddZdd Z		d?ddZd@d!d"ZdAd#d$ZdBd%d&ZdCd(d)Zd*d+ Zd,d- Zd.d/ ZdDd1d2ZdEd4d5ZdFd6d7Zd8d9 ZdS )GNewsNounExtractor
      NTc           	      C   s   || _ || _|| _i | _|rtni | _dd l}d|jt	
dddd d }|s8d| g}|r8td |D ]}| | q:dd | j D | _| jd	h t| t | _| j| d
|  | j| d|  d S )Nr   /\z'%s/trained_models/noun_predictor_sejongzCused default noun predictor; Sejong corpus based logistic predictorc                 S   s   h | ]
\}}|d kr|qS 皙? ).0rsr   r   J/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/noun/_noun_news.py	<setcomp>   s    z-NewsNounExtractor.__init__.<locals>.<setcomp>   는z!%s/pos/dictionary/sejong/Verb.txtz&%s/pos/dictionary/sejong/Adjective.txt)max_left_lengthmax_right_lengthverboser_scoresnoun_dictionaryosjoinpathabspath__file__replacesplitprint_load_predictoritemsjosa_dictionaryupdateset_vdictionary_load_dictionary)	selfr   r   predictor_fnamesr   base_noun_dictionaryr   	directoryfnamer   r   r   __init__	   s&   (
zNewsNounExtractor.__init__c              
   C   s   zKzEt jjdkrt|}nt|dd}t|D ]'\}}| d\}}t|}|| jv r9t	| j| || j|< q|| j|< qW |
  W d S |
  w  tyb } zt| W Y d }~d S d }~ww )N   utf-8encoding	)sysversion_infomajoropen	enumeratestripr   floatr   maxclose	Exceptionr   )r'   r+   fnum_lineliner   scoreer   r   r   r    %   s"   

z!NewsNounExtractor._load_predictorc              
   C   s   z&z t jjdkrt|}nt|dd}dd |D }W |  |W S |  w  ty? } zt| t W  Y d }~S d }~ww )Nr-   r.   r/   c                 S   s   h | ]}|  d d qS )r1   r   )r7   r   )r   wordr   r   r   r   @       z5NewsNounExtractor._load_dictionary.<locals>.<setcomp>)r2   r3   r4   r5   r:   r;   r   r$   )r'   r+   r<   wordsr@   r   r   r   r&   9   s   

z"NewsNounExtractor._load_dictionary   皙?333333?c                 C   s   |  | | ||||S N)trainextract)r'   sentsmin_frequencymin_noun_scorenoun_candidatesmin_feature_proportionr   r   r   train_extractH   s   
zNewsNounExtractor.train_extractc                 C   s   | j rtd | |\| _| _| _dd | j D | _dd | j D | _| j r?td	t
| jt
| jt
| j d S d S )Nzscan vocabulary ... c                 S      i | ]\}}|t | qS r   sumvaluesr   kdr   r   r   
<dictcomp>T   rB   z+NewsNounExtractor.train.<locals>.<dictcomp>c                 S   rP   r   rQ   rT   r   r   r   rW   U   rB   z(done (Lset, Rset, Eojeol) = ({}, {}, {}))r   r   _build_graphlrgraphrlgrapheojeolsr!   lcountrcountformatlen)r'   rJ   r   r   r   rH   O   s   
zNewsNounExtractor.trainc                    s   ddl m  ddl m} dd }| j| j |fdd|D }  fdd}  fd	d}| D ]A\}}t|}	td
t| j|	d
 D ]-}
|	|
 | jkrRqH|d |
 ||
d  }}|| |  |7  < |ru|| |  |7  < qHq5|||||fS )Nr   defaultdict)Counterc                 S   s   dd |   D S )Nc                 S   s   i | ]	\}}|t |qS r   )dictrT   r   r   r   rW   ^       zDNewsNounExtractor._build_graph.<locals>.<lambda>.<locals>.<dictcomp>)r!   )ddr   r   r   <lambda>^   s    z0NewsNounExtractor._build_graph.<locals>.<lambda>c                 3   s.    | ]}|  D ]}t| kr|V  qqd S rG   )r   r_   )r   senteojeol)max_eojeol_lengthr   r   	<genexpr>b   s    z1NewsNounExtractor._build_graph.<locals>.<genexpr>c                          dd S )Nc                   S      dS Nr   r   r   r   r   r   rf   f       BNewsNounExtractor._build_graph.<locals>.<lambda>.<locals>.<lambda>r   r   r`   r   r   rf   f       c                      rk   )Nc                   S   rl   rm   r   r   r   r   r   rf   g   rn   ro   r   r   r`   r   r   rf   g   rp      )	collectionsra   rb   r   r   r!   r_   rangemin)r'   rJ   rb   dictdictizer[   rY   rZ   rh   countnr@   lr   r   )ra   ri   r   rX   [   s*   zNewsNounExtractor._build_graphc                    s*     |sdd j D } fdd|D }i }t|D ]&\}}|||< jrG|d d dkrGd}tj|	|d t
| q!jrRtd	d	 td
t
| |||}tdt
|   jD ]}	|	jvr~|	j|	< qoj D ]	\}	}
|
j|	< q``jS )Nc                 S   s    g | ]\}}t |d kr|qS )r-   r_   r   rx   cr   r   r   
<listcomp>y        z-NewsNounExtractor.extract.<locals>.<listcomp>c                    s,   g | ]}j |d  kr|jvr|qS r   )r\   getr   r   rx   rK   r'   r   r   r|   z   s    rq     r   z"predicting noun score ... {} / {}z!predicting noun score was done{}z(                                        zbefore postprocessingzafter postprocessing)_pre_eojeol_analysisr\   r!   r6   predictr   r2   stdoutwriter^   r_   r   _postprocessing_post_eojeol_analysisr   _noun_scores_postprocessed_noun_scores_)r'   rL   rK   rM   rN   noun_scoresirx   messagenounr?   r   r   r   rI   s   s4   



zNewsNounExtractor.extractGz?c                    s   dfdd	} fddj  D }t| D ]=\}\}}jrC|d d d	krCtj|d t|f}d
}	tj|	j	|  ||}
|
sJqj 
|
d	 d	j|
d	 < qjrld}	tj|		tj d S d S )N      ?rE   c                    sn   t | }|dk r
d S td|d D ]#}| d | | |d  } } | d |kr4| jv r4| |f  S qd S )N   r-   rq   r   )r_   rs   r   r%   )rx   max_eojeol_proportionrL   rw   r@   r   r'   r   r   eojeol_to_NV   s   z<NewsNounExtractor._pre_eojeol_analysis.<locals>.eojeol_to_NVc                    4   i | ]\}}|krj |d |  kr||qS r~   r[   r   rz   min_eojeol_proportionrK   r'   r   r   rW          z:NewsNounExtractor._pre_eojeol_analysis.<locals>.<dictcomp>rq   r   r   z@extracting {} nouns using verb/adjective dictionary ... {} / {}z3extracted {} nouns using verb/adjective dictionary)r   rE   )r\   r!   r6   r   r_   r   r2   r   r   r^   r   )r'   rK   r   r   
candidatesr   rx   r{   argsr   nvr   r   r   r      s   	z&NewsNounExtractor._pre_eojeol_analysisc                    s   fddj  D }tj}t| D ]Q\}\}}jrC|d d dkrCtj| |d t|f}	d}
tj|
j	|	  |j
v rIq|rOq|rUq|r[q|rj|rj|j|< qjrd}
tj|
	tj|  d S d S )Nc                    r   r~   r   rz   r   r   r   rW      r   z;NewsNounExtractor._post_eojeol_analysis.<locals>.<dictcomp>rq   r   r   z1extracting {} compounds from eojeols ... {} / {}z$extracted {} compounds from eojeols)r\   r!   r_   r   r6   r   r2   r   r   r^   r   
_is_NJsubJ_is_NJ_is_NV_hardrule_suffix_filter_is_compound)r'   rK   r   rL   r   beginr   rx   r{   r   r   r   r   r   r      s,   





 z'NewsNounExtractor._post_eojeol_analysisc                 C   s   d\}}}}}| j |i  D ],\}}||7 }|| jvrq||7 }||| j|  7 }|d7 }|| j| dkr9dnd7 }q|rC|| nd}| j|d}	||	 dkrX|||	  nd}
|dkrb|	| nd}|dkrjdn|| }t|||
|||S )N)r   r   r   r   r   rq   r   )rY   r   r!   r   r[   r   )r'   rx   normr?   _totaln_positive_feature	n_featurer   	frequencyn_eojeolfeature_proportioneojeol_proportion"unique_positive_feature_proportionr   r   r   r      s   
zNewsNounExtractor.predictc           
         sT  t t fdd| _tdtj jr"d}tj	| fddjD fddjD }fddjD }jrJtj	d	 i _
tj D ]K\}\}}	jrv|d
 d dkrvd}tj	||d
 tj |v s||v s||v rqT|s|sqT|sqT|sqT|	j
|< qTjrtd j
S )Nc                    s0   | d d ko| d d  kot | d dkS )Nrq   r   ry   )x)rN   rL   r   r   rf      s    z3NewsNounExtractor._postprocessing.<locals>.<lambda>r   u[   finding NJsubJ (대학생(으)+로), NsubJ (떡볶+(이)), NVsubE (사기(당)+했다) ... c                    s   h | ]	}  |r|qS r   )r   r   r   r   r   r      rd   z4NewsNounExtractor._postprocessing.<locals>.<setcomp>c                    s(   h | ]} |D ]}| vr	|q	qS r   )_find_NsubJ)r   rx   l0)njsubjsr'   r   r   r      s   ( c                    s.   h | ]}  |r |r |s|qS r   )
_is_NVsubE	_is_NWsubr   r   r   r   r   r      s   . donerq   r   r   zchecking hardrules ... {} / {}zchecking hardrules ... done)rc   filterr!   r   r   r_   r   r2   r   r   r   r6   r^   _hardrule_unijosa_filterr   r   _hardrule_dang_hada_filter)
r'   r   rL   rN   r   nsubsnvsubesr   r   r?   r   )rN   rL   r   r'   r   r      s@   	

z!NewsNounExtractor._postprocessing皙?ffffff?c                    s   fddfddt dtD  fddD s#dS dt }|d	kr4dS | |kr?dS fd
dji  D }tfdd| D t|  }||kS )u!   ### NJsub + J: 대학생으 + 로c                    s&    D ]}| |d  j v r dS qdS )NTFr"   )tokenr@   )l0_candidatesr'   r   r   match_NJsubJ  s
   z2NewsNounExtractor._is_NJsubJ.<locals>.match_NJsubJc                    s   h | ]} d | qS rG   r   r   r@   rx   r   r   r   #  s    z/NewsNounExtractor._is_NJsubJ.<locals>.<setcomp>r-   c                    s0   h | ]}|j v s|d   krt|qS r~   )r   r   r_   r   r   )candidate_noun_thresholdr'   r   r   r   $     0 FNr   c                    s   i | ]	\}} | |qS r   r   r   r   r{   r   r   r   rW   ,  rd   z0NewsNounExtractor._is_NJsubJ.<locals>.<dictcomp>c                 3   s     | ]\}} |r|V  qd S rG   r   )r   r   r{   )r   r   r   rj   -  s    z/NewsNounExtractor._is_NJsubJ.<locals>.<genexpr>)	rs   r_   l_frequencyr9   rY   r   r!   rR   rS   )r'   rx   r   njsub_proportion_thresholdmin_frequency_dropratebasetokenspropr   )r   rx   r   r   r'   r   r     s    &zNewsNounExtractor._is_NJsubJc                    sH   fddfddt dtD } fdd|D }|S )u   ### Nsub + J: 떡볶 + 이c                    s     |   | S rG   r   )r   rx   r   r   r   rf   2  s    z/NewsNounExtractor._find_NsubJ.<locals>.<lambda>c                    s*   h | ]} |d  j v r d | qS rG   r   r   rx   r'   r   r   r   3  s   * z0NewsNounExtractor._find_NsubJ.<locals>.<setcomp>r-   c                    s0   h | ]} |d   kr|kr|qS r~   )r   r   r   rx   nsubj_proportion_threshold
proportionr'   r   r   r   4  r   )rs   r_   )r'   rx   r   r   r   r   r   r   r   0  s   zNewsNounExtractor._find_NsubJc                    s   fdd j |i }|sdS t|}td|dkrdndD ]7}|d|  || d }|jvs<|jvr=q!t fdd	| D t|  }|krX d
S q!dS )uH   is_NVsubE('성심당') # False
           is_NVsubE('폭행당') # True c                    s:   t j| i  }|sdS tdj| d|  kS )NFrq   r   )rR   rY   r   rS   rt   r[   )wsum_)r   r'   r   r   r   ;  s   $z7NewsNounExtractor._is_NVsubE.<locals>.eojeol_proportionFrq   rD   r-   Nc                    s    g | ]\}} | | qS r   r   r   )r   r0r   r   r|   H  r}   z0NewsNounExtractor._is_NVsubE.<locals>.<listcomp>T)	rY   r   r_   rs   r   r   rR   r!   rS   )r'   rx   r   r_extensionsrw   br   r_extension_as_eojeolr   )r   r   r   r'   r   r   7  s   (zNewsNounExtractor._is_NVsubEr   c                    sv    fdd}t dt dkrdndD ]$} d |   | d  }}|jv s/|jv r8|||k r8 dS qdS )Nc                    s&    | dkr	dS    |  S rm   r   )r   r   r   r   frequency_droprateN  s   &z7NewsNounExtractor._is_NWsub.<locals>.frequency_dropraterq   rD   r-   TF)rs   r_   r   r   )r'   rx   r   r   r   r   r   r   r   r   r   M  s    zNewsNounExtractor._is_NWsubc           	      C   s  t |}|dk r
dS td|D ]&}|d | ||d  }}|| jv s(|| jv r5|| jv s2|| jv r5 dS q|dkrtd|d D ]D}t|d |d D ]8}|d | ||| ||d  }}}|| jv sl|| jv r|| jv sv|| jv r|| jv s|| jv r  dS qLqAdS )Nr   Fr-   T   rD   rq   )r_   rs   r   r   )	r'   rx   rw   r@   r   l1e1e2l2r   r   r   r   W  s    ((<zNewsNounExtractor._is_compoundc                 C   sf   t dt|d D ]'}|d | ||d  }}|| jv s"|| jv r0|r-| j|ddkr0 dS q	dS )Nr-   rq   r   TF)rs   r_   r   r   r   r   )r'   r   r@   rx   r   r   r   r   r   h  s   *zNewsNounExtractor._is_NJc                 C   sT   t dt|D ] }|d | | jv s|d | | jv r'||d  | jv r' dS qdS )Nr-   TF)rs   r_   r   r   r%   )r'   rx   r@   r   r   r   r   o  s
   6zNewsNounExtractor._is_NVrq   c                    s   dd  |j vrdS j | d |krSj | d |krSj|i }|s(dS t fddj|  D }td	d j|  D }|rM|| nd
}|dkS dS )Nc                 S   s   h d}| |v S )N>*      들   를   뿐   임   한   할   되고   되는   되다   된다   들에   들의   로는   로도   로서   부터   뿐만   뿐이   뿐인   에게   에도   에서   에와   이나   이라   이었   처럼   하다   했고   했다   했던	   으로의   과r      로   와   은   의   이   인   들이   으로r   )r   passsetr   r   r   has_passsetv  s   z?NewsNounExtractor._hardrule_unijosa_filter.<locals>.has_passsetTrq   r   Fc                 3   s6    | ]\}}|r |d d s |r|V  qd S )Nr-   )r   r   r  r'   r   r   rj     s   4 z=NewsNounExtractor._hardrule_unijosa_filter.<locals>.<genexpr>c                 s   s    | ]	\}}|r|V  qd S rG   r   r   r   r   r   rj         r   r   )r   rY   r   rR   r!   )r'   rx   rK   max_num_of_josardict
n_passjosan_nonemptyrpassset_propr   r  r   r   u  s   

$"z*NewsNounExtractor._hardrule_unijosa_filterr   c           
      C   s   ddl m} |d dks |d d | jv s|d d | jv r dS | j|i }tdd | D }d}| D ]\}}|s?q8||d }	|	rQ|	d dkrQ||7 }q8|dkrXdS || |k S )	Nr   )	decomposeu   당Tc                 s   s    | ]	\}}|r|V  qd S rG   r   r   r   r   r   rj     r  z?NewsNounExtractor._hardrule_dang_hada_filter.<locals>.<genexpr>u   ㅎ)soynlp.hangler  r   r   rY   r   rR   r!   )
r'   rx   max_h_proportionr  r	  n_basen_hr   r{   
rdecomposer   r   r   r     s   0z,NewsNounExtractor._hardrule_dang_hada_filterc                    s   fdd}| j v rdS h d}|dd  |v rdS |d dkr9||d||d	 ||d
 ||d dkr9dS |d dkrH||ddkrHdS |d dkrf||d||d ||d ||d dkrfdS |d dks|d dks|d dks|d dks|d dkr|d d  jvrdn j|d j|d d   }||k rdS |dd  dks|dd  dks|dd  dks|dd  dkr|d d  jvrdn j|d j|d d   }||k rdS |dd  dkrt|dks
|d d  jv s
|d d  jv rdS d S )!Nc                    s<   t  j| i  }|dkrdS  j| i |d| S rm   )rR   rY   r   rS   )rx   r   r   r   r   r   prop_r  s   &z9NewsNounExtractor._hardrule_suffix_filter.<locals>.prop_rF>      갔다   겠지   보는   이는r	   r  u   으r   u   로써u   로만u   로의r   u   지u   만u   없r   r   u   다u   었다r  r   r   r   r   r   r  u   들은u   들도u   들을r  rD   T)r%   r\   r   r_   r   r   )r'   rx   r   r  stoplsubdroprater   r   r   r     s,   
88<6@6Hz)NewsNounExtractor._hardrule_suffix_filterc                 C   s   | j |dS rm   )r\   r   )r'   rx   r   r   r   r     s   zNewsNounExtractor.l_frequency)r   r   NTN)rD   rE   NrF   )rE   rD   NrF   )rD   r   )rD   r   rE   )rE   rF   )rE   r   r   )r   r   )r   r
   )r   rq   )r   )r   )__name__
__module____qualname__r,   r    r&   rO   rH   rX   rI   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      sB    



*


0






r   )rr   r   r   r2   r   r   r   r   r   <module>   s   
