o
    QiF                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dl	mZ d dlmZ d d	lmZ d d
lmZ d dlmZ G dd dZdS )    )defaultdict)	decompose)lemma_candidate)	conjugate)LRNounExtractor_v2)load_default_adverbs)stem_to_adverb)
Predicator)PredicatorExtractor)MaxScoreTokenizer)LRGraphc                   @   s   e Zd Zd7ddZ					d8d	d
Z					d8ddZdd Z		d9ddZ		d:ddZdd Z	dd Z
dd Zd;ddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6S )<NewsPOSExtractorTc                 C   s   || _ || _|| _d S N)_verbose_ensure_normalized_extract_eomi)selfverboseensure_normalizedextract_eomi r   H/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/pos/_news_pos.py__init__   s   
zNewsPOSExtractor.__init__      333333?   Fc                 C   s$   |  |||||||||	|

 |  S r   )trainextractr   sentsmin_num_of_noun_features!max_frequency_when_noun_is_eojeolmin_noun_scoremin_noun_frequencymin_eojeol_frequencymin_predicator_frequencymin_num_of_eomi_featuresmin_eomi_scoremin_eomi_frequencydebugr   r   r   train_extract   s   zNewsPOSExtractor.train_extractc                 C   s   |  ||||||| _| |||||	|
\| _| _| jj| _| jj| _	| jj
| _| jj| _t | _| jt| j | jt| j	 | jjjdd| _| jj| _d S )NTreset_lrgraph)_train_noun_extractornouns_train_predicator_extractor
adjectivesverbspredicator_extractor_adjective_stemsadjective_stems_verb_stems
verb_stems_eomiseomis_josasjosasr   adverbsupdater   noun_extractorlrgraphto_EojeolCountereojeols_counterr   r   r   r   r   )   s"   



zNewsPOSExtractor.trainc           	      C   sh   |   \}}}}}}}| || j| j| j}| || j| j| j}|| j|| j|| j|| j||d
}|S )N)
NounEomi	AdjectiveAdjectiveStemVerbVerbStemAdverbJosaIrrecognizableConfusedNouns)_count_matched_patterns_as_predicatorr1   r5   r9   r2   r7   r;   )	r   r/   r1   r2   r<   r;   irrecognizedconfused_nounswordtagsr   r   r   r   G   s&   zNewsPOSExtractor.extractc              	   C   s@   t ddd| j| j||d| _| j|| | jj||dd}|S )NF)extract_pos_featureextract_determinerextract_compoundr   r   min_num_of_featuresr"   r,   )r   r   r   r>   r   r   )r   r    rU   r"   r#   r$   r%   r/   r   r   r   r.   _   s   
z&NewsPOSExtractor._train_noun_extractor   c                 C   sz   t | jjj}dd | jjD }|dd | jjD  t| j|| j	d| j
d| _| j||dd |d|||	\}	}
|	|
fS )Nc                 S      h | ]}|qS r   r   .0rr   r   r   	<setcomp>{       z?NewsPOSExtractor._train_predicator_extractor.<locals>.<setcomp>c                 S   rW   r   r   rX   r   r   r   r[   |   r\   F)r   extract_stemr   i T)r   r>   r?   _lr_pos_featuresr=   _common_featuresr
   r/   r   r   r3   r+   )r   r    r&   r%   r'   r(   r)   predicator_lrgraphnoun_pos_featuresr1   r2   r   r   r   r0   s   s    z,NewsPOSExtractor._train_predicator_extractorc              	      s  | j }t| }| |\}} | || | j\}} }| | \} | | \} | || |\}} }| 	|}| 
|| |\}} fdd| D fdd| D }| jrz| || || | ||fS )Nc                    s.   i | ]\}}| v s|v s|v r||qS r   r   rY   wordcount)r1   r<   r2   r   r   
<dictcomp>   s    z<NewsPOSExtractor._count_matched_patterns.<locals>.<dictcomp>c                       i | ]\}}| vr||qS r   r   rc   )rP   r   r   rf      s    )rA   sumvalues_match_word_match_noun_and_wordr;   _match_predicator_compounds_lemmatizing_predicators_match_syllable_noun_and_r_remove_irregular_words_match_compound_nounsitemsr   _print_stats)r   rA   total_frequencyr/   r;   r   )r1   r<   rP   r2   r   rM      s4   





z(NewsPOSExtractor._count_matched_patternsc           	      C   sJ   i }|  D ]\}}||v r|| j}n| |||}t||||< q|S r   )rq   lemma
_lemmatizer	   )	r   counter
lemma_dictstemr9   predicatorsrd   re   rt   r   r   r   rN      s   zNewsPOSExtractor._as_predicatorc                 C   s   ||v S r   r   )r   rd   	referencer   r   r   _match   s   zNewsPOSExtractor._matchc                 C   sN   t |t|D ]}|d | ||d  }}||v r$||v r$||f  S qd S r   )rangelen)r   rd   lsetrsetbeginilrZ   r   r   r   _separate_lr   s   zNewsPOSExtractor._separate_lrc                 C   s&   |D ]\}}| |d| ||< q|S )Nr   )get)r   rv   
word_countrd   re   r   r   r   _cumulate_counter   s   z"NewsPOSExtractor._cumulate_counterc                    s    fdd|  D S )Nc                    rg   r   r   rc   removalsr   r   rf          z7NewsPOSExtractor._remove_recognized.<locals>.<dictcomp>rq   )r   rA   r   r   r   r   _remove_recognized   s   z#NewsPOSExtractor._remove_recognizedc                    sj    fdd}t t|d ddD ]!}zt|d | ||d  }W n   Y q||}|r2|  S qd S )Nc                    s    fdd| D S )Nc                    s(   g | ]}|d  v r|d  v r|qS )r   r   r   )rY   rt   r9   stemsr   r   
<listcomp>   s    zDNewsPOSExtractor._lemmatize.<locals>.only_knowns.<locals>.<listcomp>r   )lemmasr   r   r   only_knowns   s   z0NewsPOSExtractor._lemmatize.<locals>.only_knownsr   r   )r|   r}   r   )r   rd   r   r9   r   r   r   r   r   r   ru      s   zNewsPOSExtractor._lemmatizec           
         s   dd  t  }i }i }| D ]J\}}|||d u rq|d  j}	fdd|	D }	 fdd|	D }	fdd|	D }	|	sDqt||	||< |dd |	D  |||< q|||fS )	Nc                 S   s   t | d }t |d }td}td}|d |v r"|d |v r"dS |d dkrB|d d	krB|d d
krB|d dks@|d dkrBdS dS )Nr   r   u   ㄹㅂu   ㄴㄹㅁㅂrV   Fr   u   ㅡ u   ㅇu   ㅓu   ㅏT)r   set)rx   eomir   rZ   	jongcho_l	jongcho_rr   r   r   check_suffix_prefix   s   <zINewsPOSExtractor._parse_predicator_compounds.<locals>.check_suffix_prefixr   c                    s    h | ]\}} d  | |fqS r   r   rY   rx   r   )lrr   r   r[           z?NewsPOSExtractor._parse_predicator_compounds.<locals>.<setcomp>c                    s"   h | ]\}} ||r||fqS r   r   r   )r   r   r   r[         " c                    s,   h | ]\}}| j vr| jvr||fqS r   )r7   r5   r   r   r   r   r[      s    c                 S   s   h | ]\}}|qS r   r   )rY   rx   _r   r   r   r[          )r   rq   r   rt   r	   r=   )
r   rA   ry   baser   predicator_compoundsrv   rd   re   r   r   )r   r   r   r   _parse_predicator_compounds   s$   

z,NewsPOSExtractor._parse_predicator_compoundsc                    s    j rtdt|  fdd| D } fdd| D } fdd| D } fdd| D }||||fD ]} ||}q>|||||fS )NzL[POS Extractor] matching "Noun, Adjective, Verb, and Adverb" from {} eojeolsc                    $   i | ]\}}  | jr||qS r   )r{   r/   rc   r   r   r   rf         $ z0NewsPOSExtractor._match_word.<locals>.<dictcomp>c                    r   r   )r{   r1   rc   r   r   r   rf      r   c                    r   r   )r{   r2   rc   r   r   r   rf     r   c                    r   r   )r{   r<   rc   r   r   r   rf     r   )r   printformatr}   rq   r   )r   rA   r/   r1   r2   r<   r   r   r   r   rj      s   zNewsPOSExtractor._match_wordc                    s    j rtdt|  fdd}||||i t \}}}||||||\}}}||||||\}}} ||}|||||fS )NzE[POS Extractor] matching "Noun + Josa/Adjective/Verb" from {} eojeolsc                    sr    fdd|   D }dd |D } dd |D  |dd |D }|dd |D   ||fS )Nc                    s"   g | ]\}} | |fqS r   )r   rc   )r/   r   r   r   r   r     r   zPNewsPOSExtractor._match_noun_and_word.<locals>.match_process.<locals>.<listcomp>c                 S   s    g | ]\}}|d ur||fqS r   r   rc   r   r   r   r     r   c                 S      g | ]
\}}|d  |fqS r   r   rc   r   r   r   r         c                 S   r   r   r   rc   r   r   r   r     r   c                 S   s   h | ]	\}}d  |qS ) )join)rY   rd   r   r   r   r   r[     s    zONewsPOSExtractor._match_noun_and_word.<locals>.match_process.<locals>.<setcomp>)rq   r   r=   )rA   r/   r   rcounterr   noun_rr   )r/   r   r   match_process  s   
z<NewsPOSExtractor._match_noun_and_word.<locals>.match_process)r   r   r   r}   r   r   )r   rA   r/   r1   r2   r;   r   r   r   r   r   rk   	  s   z%NewsPOSExtractor._match_noun_and_wordc                 C   s4  | j rtdt| t| j }|t| j  t| jt| j}}| 	||| j\}}}	| j| | j
| | ||	 }| 	||| j\}}}	| j| | j| | ||	 }dd |D }
|
dd |D  | ||
}| j rt| jt| j}}td||||f  |||fS )NzF[POS Extractor] matching "Predicator + Adjective/Verb" from {} eojeolsc                 S   rW   r   r   rY   rd   r   r   r   r[   3  r\   z?NewsPOSExtractor._match_predicator_compounds.<locals>.<setcomp>c                 S   rW   r   r   r   r   r   r   r[   4  r\   z'    adjective: %d -> %d, verb: %d -> %d)r   r   r   r}   r   r1   keysr=   r2   r   r5   r   rq   r7   r   )r   rA   r1   r2   ry   
before_adjbefore_verb	compoundsr   rv   r   	after_adj
after_verbr   r   r   rl     s4   


z,NewsPOSExtractor._match_predicator_compoundsc                    s   dfdd	}j rtdt| ||jjj   fdd| D }|| }j	  ||j
jj fdd| D }|| }j	 ||}||}|||fS )	NTc           
         st   i }t | }t|  D ]+\}\}}|r$|d dkr$td||dd  |||}	|	d u r0qt||	||< q|S )Ni  r   z    lemmatizing {} / {}r   )end)r}   	enumeraterq   r   r   ru   r	   )
rA   r   r9   r   
predicatornr   rd   re   r   r   r   r   	lemmatize>  s   z<NewsPOSExtractor._lemmatizing_predicators.<locals>.lemmatizez.    lemmatizing Adjective/Verb from {} eojeolsc                       i | ]\}}| v r||qS r   r   rc   )new_adjectivesr   r   rf   N  r   z=NewsPOSExtractor._lemmatizing_predicators.<locals>.<dictcomp>c                    r   r   r   rc   )	new_verbsr   r   rf   S  r   )T)r   r   r   r}   r5   r9   rq   r   r1   r=   r7   r2   r   )r   rA   r1   r2   r   counter_adjcounter_verbr   )r   r   r   r   rm   =  s   
z)NewsPOSExtractor._lemmatizing_predicatorsc                    sl    fdd}| j rtdt  || j|t }|| j||}|| j||}|  |  |||fS )Nc                    sj      D ].\}}|d d |dd  }}||v r2|d| |< ||d| ||< || q|S )Nr   r   )rq   r   add)r   r   r   rd   re   r   rZ   rA   r/   r   r   syllable_noun_and_r]  s   
zHNewsPOSExtractor._match_syllable_noun_and_r.<locals>.syllable_noun_and_rzF[POS Extractor] parse 1 syllable Noun + Adj/Verb/Josa from {} eojeols)	r   r   r   r}   r1   r   r2   r;   r   )r   rA   r/   r1   r2   r;   r   r   r   r   r   rn   \  s   	z+NewsPOSExtractor._match_syllable_noun_and_rc                    s"   fdd  fdd|  D S )Nc                    s    |  j v p|  jv pt| dkS )Nr   )r;   r9   r}   )rd   r   r   r   remover  s    z8NewsPOSExtractor._remove_irregular_words.<locals>.removec                    s   i | ]\}} |s||qS r   r   rc   )r   r   r   rf   t  r   z<NewsPOSExtractor._remove_irregular_words.<locals>.<dictcomp>r   )r   rA   r   )r   r   r   ro   q  s   z(NewsPOSExtractor._remove_irregular_wordsc           	      C   s   | j rtdt| dd |D }|dd |D  |dd |D  |dd |D  | |||\}}| || }| ||}||fS )Nz6[POS Extractor] extract compound nouns from {} eojeolsc                 S   rW   r   r   r   r   r   r   r[   z  r\   z9NewsPOSExtractor._match_compound_nouns.<locals>.<setcomp>c                 S   rW   r   r   r   r   r   r   r[   {  r\   c                 S   rW   r   r   r   r   r   r   r[   |  r\   c                 S   rW   r   r   r   r   r   r   r[   }  r\   )	r   r   r   r}   r=   _extract_compound_nounsr   rq   r   )	r   rA   r/   r1   r2   r;   suffixr   r   r   r   r   rp   v  s   z&NewsPOSExtractor._match_compound_nounsc                    s   t d  fdd}t|||dft|||dft|||dft|||dft|||dft|||d	fg}	|	D ]	}
t d
j|
  q>d S )Nz
[POS Extractor] ## statisticsc                    s   dt |     S )Nd   )rh   ri   )dicrs   r   r   <lambda>  r   z/NewsPOSExtractor._print_stats.<locals>.<lambda>zNoun + [Josa/Predicator]z[Noun] + Adjectivez[Noun] + Verbz[Noun] + JosarI   rK   z*[POS Extractor] ({}, {:.3f} %) words in {})r   r}   r   )r   rs   r/   r1   r2   r<   r;   rA   
as_percentstatsargsr   r   r   rr     s   zNewsPOSExtractor._print_statsc                    s    fdd}t dd |D d}i t }}| D ]$\}}	|j|ddd }
||
}|d ur>||d|	 ||< || q||fS )	Nc                    s   | d d D ]}|d dkr d S qt | dkr.| d d  v r.ddd | d d D S | d d dkr@ddd | D S d S )Nr      r   r   c                 s       | ]}|d  V  qdS r   Nr   rY   tr   r   r   	<genexpr>      zSNewsPOSExtractor._extract_compound_nouns.<locals>.parse_compound.<locals>.<genexpr>c                 s   r   r   r   r   r   r   r   r     r   )r}   r   )tokenstokenr   r   r   parse_compound  s   z@NewsPOSExtractor._extract_compound_nouns.<locals>.parse_compoundc                 S   s   i | ]}t |d kr|d qS r   )r}   )rY   nounr   r   r   rf     r   z<NewsPOSExtractor._extract_compound_nouns.<locals>.<dictcomp>)scoresF)flattenr   )r   r   rq   tokenizer   r   )r   rA   r/   r   r   	tokenizerr   r   rd   re   r   r   r   r   r   r     s   
z(NewsPOSExtractor._extract_compound_nounsN)TTT)
r   r   r   r   r   r   r   r   r   F)r   r   r   r   r   )r   rV   r   r   r   )rV   )__name__
__module____qualname__r   r+   r   r   r.   r0   rM   rN   r{   r   r   r   ru   r   rj   rk   rl   rm   rn   ro   rp   rr   r   r   r   r   r   r      sL    




#

!r   N)collectionsr   soynlp.hangler   soynlp.lemmatizerr   r   soynlp.nounr   
soynlp.posr   r   soynlp.predicatorr	   r
   soynlp.tokenizerr   soynlp.utilsr   r   r   r   r   r   <module>   s    