o
    QiH                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ eddZG dd dZdS )u6   TERM DEFINITION
(l, r) : L and R position subwords
stem : stem of Adjective and Verb
ending : suffix, canonical form of ending

stems : set of stem including Adjectives and Verbs
composable_stems : stems that can be compounded with other prefix
    - [] + 하다 : 덕질+하다, 냐옹+하다, 냐옹+하냥
endings : set of ending
pos_l_features : canonical form set of stems (L subwords)
pos_composable_l_features : canonical form set of composable stems (L subwords)
lrgraph : L-R graph including [stem + Ending], Adverbs, 
          and maybe some Noun + Josa
    )defaultdict)
namedtuplecharacter_is_complete_korean)EojeolCounter)get_process_memory)LRGraphinstallpath)	conjugate)lemma_candidate)_conjugate_stem)normalize_sent_for_lrgraph   )EomiExtractor)StemExtractor)conjugate_as_present)conjugate_as_imperative)conjugate_as_pleasure)rule_classify
Predicatorzfrequency lemmac                   @   s   e Zd Z			d8ddZdd Zd9d	d
Zd:ddZdd Zdd Zd;ddZ	e
dd Z						d<dd Z				d=d!d"Z		d>d#d$Zd9d%d&Zd?d'd(Zd)d* Z		d@d+d,Z		dAd-d.Zd?d/d0ZdBd1d2Zd3d4 ZdCd6d7ZdS )DPredicatorExtractorNFTc
           
      C   s   |s|   }|d u s|d u r|  \}}|d u r|  }|| _|| _|| _dd |D | _| j|| _|| _|| _	|| _
|| _|	| _|  | _d | _| |}|| _dd | jD | _d S )Nc                 S      h | ]}|qS  r   .0stemr   r   Q/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/predicator/_predicator.py	<setcomp>6       z/PredicatorExtractor.__init__.<locals>.<setcomp>c                 S   r   r   r   r   eomir   r   r   r   C   r   )_load_default_josa_load_default_stems_load_default_eomis_josas_adjective_stems_verb_stems_stemsunion_eomisverboseextract_eomiextract_stemensure_normalized_transform_stem_as_surfaces_stem_surfaceseojeol_counter_remove_stem_prefix_nouns_eomis_)
selfnounsjosas
adjectivesverbseomisr,   r-   r+   r.   r   r   r   __init__&   s*   

zPredicatorExtractor.__init__c                 C   sH   dt  }t|dd}dd |D }W d    |S 1 sw   Y  |S )Nz2%s/postagger/dictionary/default/Josa/josa_chat.txtutf-8encodingc                 S   s   h | ]}|  qS r   )strip)r   wordr   r   r   r   H   s    z9PredicatorExtractor._load_default_josa.<locals>.<setcomp>)r
   open)r5   pathfr7   r   r   r   r"   E   s   
z&PredicatorExtractor._load_default_josa   c                    s4    fdd}dt  }|d| }|d| }||fS )Nc                    sf   t  }t| dd }|D ]}| \}}t| k rq|| qW d    |S 1 s,w   Y  |S )Nr<   r=   )setrA   splitintadd)rB   stemsrC   liner@   	frequencymin_frequencyr   r   loadL   s   
z5PredicatorExtractor._load_default_stems.<locals>.loadz%%s/lemmatizer/dictionary/default/Stemz%s/Adjective.txtz%s/Verb.txtr	   )r5   rM   rN   dirsr8   r9   r   rL   r   r#   K   s
   
z'PredicatorExtractor._load_default_stems   c                 C   sn   dt  }t }t|dd }|D ]}| \}}t||k rq|| qW d    |S 1 s0w   Y  |S )Nz.%s/lemmatizer/dictionary/default/Eomi/Eomi.txtr<   r=   )r
   rE   rA   rF   rG   rH   )r5   rM   rB   r:   rC   rJ   r@   rK   r   r   r   r$   [   s   
z'PredicatorExtractor._load_default_eomisc                    s2    fddfdd| j D fdd D S )Nc                    s<   | d d  v r| d d S | d d  v r| d d S d S )Nr   )r   )r6   r   r   
parse_noung   s
   z;PredicatorExtractor._remove_stem_prefix.<locals>.parse_nounc                    s    h | ]} |d ur |qS Nr   r   )rS   r   r   r   n   s    z:PredicatorExtractor._remove_stem_prefix.<locals>.<setcomp>c                       h | ]}| vr|qS r   r   )r   noun)removalsr   r   r   p       r(   )r5   r6   r   )r6   rS   rW   r   r2   f   s   z'PredicatorExtractor._remove_stem_prefixc                 C   sf   t  }| jD ]*}zt|D ]}|| qW q ty0 } ztd|| W Y d }~qd }~ww |S )NzException stem = {}, {})rE   r(   r   rH   	Exceptionprintformat)r5   surfacesr   ler   r   r   r/   r   s   
z/PredicatorExtractor._transform_stem_as_surfacesc                 C   sL   d}|rt d|||rdnddd d S t d|||rdnddd d S )Nz[Predicator Extractor]z{} {}
 T)endflushz{} {})r[   r\   )r5   messagereplacenewlineheaderr   r   r   _print}   s   

zPredicatorExtractor._printc                 C   s   | j S rT   )lrgraphr5   r   r   r   
is_trained   s   zPredicatorExtractor.is_trained順 r      333333?
         ?      ?ffffff?d   c                 C   s.   |  ||||||	|
|||| | ||}|S rT   )trainextract)r5   inputsmin_eojeol_frequencyfiltering_checkpoint
candidatesmin_predicator_frequencyreset_lrgraphmin_num_of_featuresmin_eomi_scoremin_eomi_frequencymin_num_of_unique_R_charmin_entropy_of_R_charmin_entropy_of_Rmin_stem_scoremin_stem_frequencypredicatorsr   r   r   train_extract   s   
	z!PredicatorExtractor.train_extractc                 C   s   t |tr| | | nt |tr| || n| ||| | js'| jr+|  }| jr6| 	|||| | jrJ| jr@|
  | ||||	|
| | jrYd}| j|ddd d S d S )Nzhas been trainedFTre   rf   )
isinstancer   _train_with_eojeol_counterto_EojeolCounterr   _train_with_sentencesr,   r-   _prepare_predicator_lrgraph_extract_eomir{   _extract_stemr+   rh   )r5   rv   rw   rx   r|   r}   r~   r   r   r   r   r   ri   rd   r   r   r   rt      s8   

zPredicatorExtractor.trainc                 C   sV   |dk}| j rd}| j|ddd | jrdd }nt}t||| j |d}| | d S )Nr   zcounting eojeols ... Fr   c                 S   s   | S rT   r   )xr   r   r   <lambda>       z;PredicatorExtractor._train_with_sentences.<locals>.<lambda>)	min_countr+   
preprocess)r+   rh   r.   r   r   r   )r5   	sentencesrw   rx   checkrd   r   r1   r   r   r   r      s   
z)PredicatorExtractor._train_with_sentencesc                    s|    fdd|j  D |_ |  t|| _d| _|j| _d| _|| _	| j
r<dt  }d| j|}| j|ddd d S d S )Nc                    s   i | ]\}}| kr||qS r   r   r   eojeolcountrw   r   r   
<dictcomp>   s
    zBPredicatorExtractor._train_with_eojeol_counter.<locals>.<dictcomp>r   %.3fz#eojeols={}, mem={} GbTr   )_counteritems_set_count_sumlen_num_of_eojeols_num_of_covered_eojeols
_count_sum_count_of_eojeols_count_of_covered_eojeolsr1   r+   r   r\   rh   )r5   r1   rw   memrd   r   r   r   r      s   


z.PredicatorExtractor._train_with_eojeol_counterc                 C   s(   d| _ | ||}| |\}}||fS )z*candidates is EojeolCounter or dict formatr   )r   _extract_predicator_separate_adjective_verb)r5   ry   rz   r   r8   r9   r   r   r   ru      s   zPredicatorExtractor.extractc                    sH   fdd dd j j D } fdd| D }t |}|S )Nc                    s8   t | }td|d D ]}| d |  jv r dS qdS )NrD   r   TF)r   ranger3   )r   nr_   rj   r   r   contains_noun   s   zFPredicatorExtractor._prepare_predicator_lrgraph.<locals>.contains_nounc                 S   s   i | ]\}}||qS r   r   )r   kvr   r   r   r     s    zCPredicatorExtractor._prepare_predicator_lrgraph.<locals>.<dictcomp>c                    s*   i | ]\}}t |d kr |s||qS r   r   r   )r   r   r   r     s    )r1   r   r   r   _to_lrgraph)r5   eojeolsri   r   )r   r5   r   r      s
   z/PredicatorExtractor._prepare_predicator_lrgraphc           
         s   t | j j| jd d}|jd ||dd} fdd|D }t j} j| t j} jrBd||}	 j	|	ddd d S d S )	N)ri   rI   r6   r|   r+   logpathT)	conditionr}   r~   r{   c                       h | ]	}| j vr|qS r   )r*   r    rj   r   r   r         z4PredicatorExtractor._extract_eomi.<locals>.<setcomp>zeomis: {} -> {}Fr   )
r   r(   r3   r+   ru   r   r*   updater\   rh   )
r5   ri   r|   r}   r~   eomi_extractorextracted_eomisn_beforen_afterrd   r   rj   r   r     s,   	

z!PredicatorExtractor._extract_eomic                    s   t | j j|||d}|jd ||d} fdd|D }t j}	 j| t j}
 jr@d|	|
} j|ddd d S d S )	N)ri   rI   r:   r   r   r   )L_ignorer   r   c                    r   r   rY   r   rj   r   r   r   7  r   z4PredicatorExtractor._extract_stem.<locals>.<setcomp>zstems: {} -> {}FTr   )	r   r(   r*   ru   r   r   r+   r\   rh   )r5   ri   r   r   r   r   r   stem_extractorextracted_stemsr   r   rd   r   rj   r   r   %  s*   	

z!PredicatorExtractor._extract_stemc                    s`   dd  |d u s
|s fdd| j  D }| |}| jr.dt|}| j|ddd |S )Nc                 S   s   | D ]	}t |s dS qdS )NFTr   )scr   r   r   !all_character_are_complete_koreanC  s
   zRPredicatorExtractor._extract_predicator.<locals>.all_character_are_complete_koreanc                    s&   i | ]\}}|kr |r||qS r   r   r   r   rM   r   r   r   J  s
    z;PredicatorExtractor._extract_predicator.<locals>.<dictcomp>z{} predicators are extractedTr   )r1   r   _as_lemma_candidatesr+   r\   r   rh   )r5   r1   rM   lemmasrd   r   r   r   r   A  s   
z'PredicatorExtractor._extract_predicatorc                    s   fdd}d _ d _i }tdd }t|}t| D ]\}\}} jr=|d dkr=d|d	 |}	 j|	d
dd ||rBqt|}
t	 }t
d	|
d	 D ]*}|d | ||d  }}t||D ]\}}| jv ry| jv ry|||f qdqPt	 }|D ]\}}|t||v r|||f q|rt||||< |D ]\}}|| ||f q  j d	7  _   j|7  _q ||} jrdd j  j  }d}	 j|	d
d
d |S )Nc                    sB   t dt| D ]}| d |  jv r| |d   jv r dS qdS )Nr   TF)r   r   r3   r%   )r   irj   r   r   is_noun_josa\  s   z>PredicatorExtractor._as_lemma_candidates.<locals>.is_noun_josar   c                   S   s   g S rT   r   r   r   r   r   r   g  r   z:PredicatorExtractor._as_lemma_candidates.<locals>.<lambda>i  i  zlemmatizing {} / {} wordsr   TFr   r   rs   zlemma candidating was done)r   r   r   r   	enumerater   r+   r\   rh   rE   r   r   r(   r*   rH   r   r   append_remove_wrong_eomisr   )r5   r1   r   r   eomi_to_word_count
num_eojeolr   r   r   rd   r   lemma_candidatesr^   rr   r!   lemma_candidates_percr   rj   r   r   Z  sN   z(PredicatorExtractor._as_lemma_candidatesc                    sP  fdd}t  i }| D ]9\ }t dkrq||\}}|dk r%q|dkr8  tdd |D }nt fdd|D }|| < qfd	d
jD _jrrdd
 | D }	dtt|	}
j	|
ddd | D ]/\ }	|	D ](}||vrq||| }t|j
dkr|| q|t|j fdd
|j
D ||< q|qv|S )Nc                    sZ   t dd | D }t  fdd| D }d}|dkr)|| }|t dd | D  }||fS )Nc                 s   s$    | ]\}}t |d krdV  qdS rD   r   Nr   r   wr   r   r   r   	<genexpr>     " zSPredicatorExtractor._remove_wrong_eomis.<locals>.noun_proportion.<locals>.<genexpr>c                 3   s.    | ]\}}| j v rt|d krdV  qdS r   )r3   r   r   rj   r   r   r     s   , r   c                 s   s    | ]\}}d V  qdS )r   Nr   r   r   r   r   r     s    )sum)
word_countsum_prop	prop_len2rj   r   r   noun_proportion  s   z@PredicatorExtractor._remove_wrong_eomis.<locals>.noun_proportion   rp   r   c                 s   s$    | ]\}}t |d kr|V  qdS )rD   Nr   r   r@   _r   r   r   r     r   z:PredicatorExtractor._remove_wrong_eomis.<locals>.<genexpr>c                 3   sF    | ]\}}t |d kr|jv st |dkrt  d kr|V  qdS )rD   r   N)r   r3   r   )r!   r5   r   r   r     s    c                    rU   r   r   r    )remove_eomisr   r   r     rX   z:PredicatorExtractor._remove_wrong_eomis.<locals>.<setcomp>c                 S   s   h | ]	}|D ]}|qqS r   r   )r   wordsr@   r   r   r   r     r   z,{} eomis are removed, {} words are modified.Tr   c                    s   h | ]
}|d   kr|qS r   r   )r   lemma)r!   r   r   r     s    )rE   r   r   rH   tupler*   r+   valuesr\   rh   r   popr   rK   )r5   r   r   r   remove_morphsr   r   r   remove_wordsr   rd   r@   
predicatorr   )r!   r   r5   r   r     sH   

	
z'PredicatorExtractor._remove_wrong_eomisr   c                    s4  i }i }   D ]\}}|j}|j}t }	t }
|D ]h}|d | jv r)|
| q|d | jv r6|	| qt|d }|du rF|
| q|du rP|	| qt|d }|	t
|d  |	t|d   fdd|D }t|dkr}|	| q|
| q|	rt||	||< |
rt||
||< q||fS )Nr   Verb	Adjectivec                    s   h | ]}| v r|qS r   r   )r   surfacer   r   r   r     s    z?PredicatorExtractor._separate_adjective_verb.<locals>.<setcomp>r   )r   rK   r   rE   r'   rH   r&   r   r   r   r   r   r   r   )r5   r   num_thresholdr8   r9   r@   r   rK   r   adjr   r   answerr]   r   r   r   r     sD   



z,PredicatorExtractor._separate_adjective_verb)NNNNFFTF)rD   )rP   )FT)rD   rl   Nr   Trm   rn   r   ro   rp   rq   rr   rs   )
rD   rl   rm   rn   r   ro   rp   rq   rr   rs   )rD   rl   )Nr   )rm   rn   r   )ro   rp   rq   rr   rs   rT   )r   )__name__
__module____qualname__r;   r"   r#   r$   r2   r/   rh   propertyrk   r   rt   r   r   ru   r   r   r   r   r   r   r   r   r   r   r   r   $   sR    



	


&






4Br   N)__doc__collectionsr   r   soynlp.hangler   soynlp.utilsr   r   r   soynlp.utils.utilsr
   soynlp.lemmatizerr   r   r   soynlp.normalizerr   _eomir   _stemr   _adjective_vs_verbr   r   r   r   r   r   r   r   r   r   <module>   s(    
