o
    Qiz#                     @   s2   d dl Z d dlmZ d dlmZ G dd dZdS )    N)	conjugate)lemma_candidatec                   @   s   e Zd Z		d&ddZd'dd	Zd
d Z		d(ddZdd Zd)ddZdd Z	dd Z
dd Zdd Zdd Zd d! Zd"d# Zd$d% ZdS )*StemExtractor
         ?      ?Tc                 C   sN   || _ || _|| _|| _|| _|| _|| _| |||\| _| _	h d| _
d S )N>      게   고   는   아   은   거나   게는   게도   고도   고만   다가   서는   다)lrgraphstemseomismin_num_of_unique_R_charmin_entropy_of_R_charmin_entropy_of_Rverbose_conjugate_stem_and_eomiLR_josa)selfr   r   r   r   r   r   r    r!   K/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/predicator/_stem.py__init__   s   zStemExtractor.__init__Fc                 C   sL   d}|rt d|||rdnddd d S t d|||rdnddd d S )Nz[Stem Extractor]z{} {}
 T)endflushz{} {})printformat)r    messagereplacenewlineheaderr!   r!   r"   _print   s   

zStemExtractor._printc                 C   s  |  }t }t }t|}t|}t|D ]X\}	}
| jr2|	d dkr2d|	||}| j|ddd t|
}|D ]5}z.t|
|D ]&}|| dksNt||krOq@|d | ||d  }}|| || q@W q8   Y q8q| jrdt|t|d}| j|ddd ~||fS )	Nd   r   z0Checking combination of {} / {} stems + {} eomisTFr+   r,   z2Initializing was done with {} stems and {} eomis{}z
          )	to_EojeolCountersetlen	enumerater   r)   r.   r   add)r    r   r   r   eojeol_counterstem_surfaceseomi_surfacesn_stemsn_eomisistemr*   stem_leneomiwordlrr!   r!   r"   r   $   s>   
z&StemExtractor._conjugate_stem_and_eomiNffffff?r/   c           
         s   |d u ri }i }| j D ]"}| j|dD ]\}}|| jv s!||v r"q||d| ||< qq fdd| D }| jrKdt|}| j	|ddd | 
|| }	| |	\| _| _| | j| _| jr}d	t| jt| jt| j}| j	|ddd | jS )
Nr   c                    s   i | ]\}}| kr||qS r!   r!   ).0r@   countmin_stem_frequencyr!   r"   
<dictcomp>U       z)StemExtractor.extract.<locals>.<dictcomp>z"batch prediction for {} candidatesFTr0   z){} stems, {} surfacial stems, {} removals)r   r   get_lr   getitemsr   r)   r3   r.   _batch_prediction_post_processingr7   removals_to_stemr   )
r    L_ignoremin_stem_scorerG   
candidatesrA   r@   rE   r*   r7   r!   rF   r"   extractG   s4   
zStemExtractor.extractc                    s   dd  j D }t|dd dD ]5}| j v s,| jv s,t|dks,|d dks,||v r-q |||\}}||k s>||k r?q||f||< q fd	d| D }|S )
Nc                 S   s   i | ]}|d qS Nr!   )rD   r@   r!   r!   r"   rH   p   s    z3StemExtractor._batch_prediction.<locals>.<dictcomp>c                 S   s
   t |  S rU   r3   xr!   r!   r"   <lambda>s   s   
 z1StemExtractor._batch_prediction.<locals>.<lambda>key   rC   r   c                    s    i | ]\}}| j vr||qS r!   )r   rD   r@   scorer    r!   r"   rH           )r   sortedr   r3   predictrL   )r    rS   rR   min_frequency	extractedr@   r^   freqr!   r_   r"   rM   l   s    

zStemExtractor._batch_predictionr\   c              	   C   s   | j |d}| |}t|}| t| || }| ||\}	}
}|	|
 dkr4|	|
 |	|
  nd}||kr<|	n|
| }|rPt	d
|	|
|t||| || jk sZ|| jk r\dS ||k rdd|fS ||fS )NrC   r   z?pos={}, neg={}, unk={}, n_features_={}, n_char={}, entropy_r={}r   r   )r   get_r_count_first_charsr3   _entropytuple_select_pos_featuresvalues_predictr(   r)   r   r   )r    r@   rR   rc   debugfeatures
char_countunique_of_charentropy_of_charposnegunkr^   re   r!   r!   r"   rb      s&   
 

zStemExtractor.predictc                 C   s   d\}}}|D ]4\}}|| j v rq|s||7 }q|| jv r"||7 }q| |r,||7 }q| ||r7||7 }q||7 }q|||fS )N)r   r   r   )r   r   _r_is_predicator_exist_longer_eomi)r    r@   ro   rs   rt   ru   rA   re   r!   r!   r"   rm      s   









zStemExtractor._predictc                 C   s8   dd |D }i }|D ]\}}| |d| ||< q|S )Nc                 S   s    g | ]\}}|r|d  |fqS )r   r!   rD   rA   rE   r!   r!   r"   
<listcomp>   r`   z4StemExtractor._count_first_chars.<locals>.<listcomp>r   )rK   )r    ro   rp   countercharrE   r!   r!   r"   rh      s
   z StemExtractor._count_first_charsc                    sD   t |dkrdS t|  fdd|D }dtdd |D  }|S )Nr\   r   c                    s   g | ]}|  qS r!   r!   )rD   vsum_r!   r"   ry      s    z*StemExtractor._entropy.<locals>.<listcomp>rC   c                 s   s    | ]
}|t | V  qd S rU   )mathlog)rD   pr!   r!   r"   	<genexpr>   s    z)StemExtractor._entropy.<locals>.<genexpr>)r3   sum)r    countsentropyr!   r}   r"   ri      s   zStemExtractor._entropyc                    s    fdd  fdd|D S )Nc                    s.   | j v s| s | rdS | jv S )NF)r   rv   rw   r   )rA   )r@   r    r!   r"   is_pos   s    
z2StemExtractor._select_pos_features.<locals>.is_posc                    s   i | ]\}} |r||qS r!   r!   rx   )r   r!   r"   rH      s    z6StemExtractor._select_pos_features.<locals>.<dictcomp>r!   )r    r@   ro   r!   )r   r@   r    r"   rk      s   z"StemExtractor._select_pos_featuresc                 C   sF   t |}td|D ]}|d | | jv r ||d  | jv r  dS q	dS Nr\   TFr3   ranger   r   )r    rA   nr;   r!   r!   r"   rv      s   $zStemExtractor._r_is_predicatorc                 C   s:   t dt|d D ]}|| d  | | jv r dS q	dS r   )r   r3   r   )r    r@   rA   r;   r!   r!   r"   rw      s
   z StemExtractor._exist_longer_eomic                    sn    fdd} fdd}t  t dd dD ]}||s"||r'| qfdd	  D   fS )
Nc                    sr   t | }td|D ]-}| d | jv s| d |  v sq	t|d |d D ]}| || jv r5  dS q&q	dS r   r   )r@   r   r;   j)rd   r    r!   r"   is_stem_and_eomi   s   "z8StemExtractor._post_processing.<locals>.is_stem_and_eomic                    s.   t dt| D ]}| d |  v r dS qdS )N   TF)r   r3   )r@   r;   )rd   r!   r"   exist_subword   s
   z5StemExtractor._post_processing.<locals>.exist_subwordc                 S   s   t | S rU   rV   rW   r!   r!   r"   rY      s    z0StemExtractor._post_processing.<locals>.<lambda>rZ   c                    s   i | ]\}}| vr||qS r!   r!   r]   )rO   r!   r"   rH      rI   z2StemExtractor._post_processing.<locals>.<dictcomp>)r2   ra   r5   rL   )r    rd   r   r   r@   r!   )rd   rO   r    r"   rN      s   

zStemExtractor._post_processingc              	   C   s   dd }i }|  D ]8\}\}}| j|dD ]*\}}z!t||D ]\}	}
|
| jv r+q!|||g||	dR  ||	< q!W q   Y qq
|S )Nc                 S   s    | | ||  ||  | |  fS rU   r!   )freq0score0freq1score1r!   r!   r"   merge_score   s    z+StemExtractor._to_stem.<locals>.merge_scorerC   rf   )rL   r   rg   r   r   rK   )r    surfacesr   r   r@   r   r   rA   rE   r<   r>   r!   r!   r"   rP      s$   


zStemExtractor._to_stem)r   r   r   T)FT)NrB   r/   )rB   r\   F)__name__
__module____qualname__r#   r.   r   rT   rM   rb   rm   rh   ri   rk   rv   rw   rN   rP   r!   r!   r!   r"   r      s&    

	#
%
 r   )r   soynlp.lemmatizerr   r   r   r!   r!   r!   r"   <module>   s    