o
    Qi^[                     @   s   d dl mZ d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d d	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ eddZG dd dZdS )    )defaultdict)
namedtupleN)normalize_sent_for_lrgraph)
check_dirs)EojeolCounter)LRGraph)get_process_memory)MaxScoreTokenizer   )extract_domain_pos_features)detaching_features)ignore_features)check_N_is_NJ	NounScorezfrequency scorec                   @   s
  e Zd Z					dBd	d
Zedd Zdd Zdd Zdd Z		dCddZ	dDddZ
dDddZdd ZdEddZdd  Z			!	"	$	%dFd&d'ZdGd(d)Zd*d+ Zd,d- Zd.d/ ZdHd0d1Zd2d3 ZdId4d5Z	dJd6d7ZdJd8d9Zd:d; Zd<d= Zd>d? Zd@dA ZdS )KLRNounExtractor_v2
   	   NTr
        Fc                 C   s   || _ || _d | _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|r*t| |s1g d}n	t|tkr:|g}|| _|sC|  }| | d S )N)r   r   	ignore_NJ)max_left_lengthmax_right_lengthlrgraphverbosemin_num_of_features!max_frequency_when_noun_is_eojeol#eojeol_counter_filtering_checkpointextract_compoundextract_pos_featureextract_determinerensure_normalizedlogpathr   
isinstancestrpostprocessing_set_default_predictor_header_load_predictor)selfr   r   predictor_headersr   r   r   r   r   r   r   r    r$   r!    r)   J/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/noun/_noun_ver2.py__init__   s,   
zLRNounExtractor_v2.__init__c                 C   s   | j S N)r   r'   r)   r)   r*   
is_trained9   s   zLRNounExtractor_v2.is_trainedc                 C   sF   | j rtd dtjtdddd d }d	|g}|S )Nz'[Noun Extractor] use default predictors/\z%{}/trained_models/noun_predictor_ver2)
r   printjoinospathabspath__file__replacesplitformat)r'   dirnamepredictor_headerr)   r)   r*   r%   =   s
   (z0LRNounExtractor_v2._set_default_predictor_headerc              	      s&  t |tkr	|g}t t }}|D ]L}d|}t|dd}|dd |D  W d    n1 s4w   Y  d|}t|dd}|dd |D  W d    n1 sYw   Y  q||  fdd|D } fd	d|D }| jrtd
t	|t	|t	  || _
|| _ | _d S )Nz{}_posutf-8encodingc                 S      h | ]}|  qS r)   strip.0featurer)   r)   r*   	<setcomp>R       z5LRNounExtractor_v2._load_predictor.<locals>.<setcomp>z{}_negc                 S   r@   r)   rA   rC   r)   r)   r*   rF   W   rG   c                       h | ]}| vr|qS r)   r)   rC   commonr)   r*   rF   ]       c                    rH   r)   r)   rC   rI   r)   r*   rF   ^   rK   z8[Noun Extractor] num features: pos={}, neg={}, common={})typer#   setr:   openupdateintersectionr   r2   len_pos_features_neg_features_common_features)r'   headersposnegheaderpos_pathfneg_pathr)   rI   r*   r&   G   s.   



z"LRNounExtractor_v2._load_predictorc                    s   fdd}| \}}}|dkr'fdd|D  j  fdd|D  n&|dkrBfdd|D  j fd	d|D  n|d
krI| ntdj  | \}}}	jrod||||||	}
td|
 d S d S )Nc                      s   t  jt  jt  jfS r,   )rQ   rR   rS   rT   r)   r-   r)   r*   check_feature_sizej   s   z?LRNounExtractor_v2._append_features.<locals>.check_feature_sizerV   c                       h | ]	}| j v r|qS r)   )rS   rD   rZ   r-   r)   r*   rF   s       z6LRNounExtractor_v2._append_features.<locals>.<setcomp>c                    rH   r)   r)   r^   commonsr)   r*   rF   u   rK   rW   c                    r]   r)   )rR   r^   r-   r)   r*   rF   x   r_   c                    rH   r)   r)   r^   r`   r)   r*   rF   z   rK   rJ   z3Feature type was wrong. Choice = [pos, neg, common]z+pos={} -> {}, neg={} -> {}, common={} -> {}z&[Noun Extractor] features appended. {})rR   rO   rS   
ValueErrorrT   r   r:   r2   )r'   feature_typefeaturesr\   n_posn_negn_commonn_pos_n_neg_	n_common_messager)   )ra   r'   r*   _append_featuresh   s.   z#LRNounExtractor_v2._append_features333333?c                 C   s   |  || | |||S r,   )trainextract)r'   inputsmin_noun_scoremin_noun_frequencymin_eojeol_frequencyreset_lrgraphr)   r)   r*   train_extract   s   z LRNounExtractor_v2.train_extractc                 C   s@   t |tr| | d S t |tr| | d S | || d S r,   )r"   r   _train_with_lrgraphr   _train_with_eojeol_counter_train_with_sentences)r'   rp   rs   r)   r)   r*   rn      s
   

zLRNounExtractor_v2.trainc                 C   sP   | j rtd | jsdd }nt}t||| j| j | j| j |d}| | d S )Nz![Noun Extractor] counting eojeolsc                 S   s   | S r,   r)   xr)   r)   r*   <lambda>   s    z:LRNounExtractor_v2._train_with_sentences.<locals>.<lambda>)	min_count
max_lengthfiltering_checkpointr   
preprocess)	r   r2   r    r   r   r   r   r   rw   )r'   	sentencesrs   r   eojeol_counterr)   r)   r*   rx      s   

	z(LRNounExtractor_v2._train_with_sentencesc                 C   s4   | | j| j}|j}| jrtd | || d S )Nz4[Noun Extractor] complete eojeol counter -> lr graph)
to_lrgraphr   r   
_count_sumr   r2   rv   )r'   r   r   num_of_eojeolsr)   r)   r*   rw      s   z-LRNounExtractor_v2._train_with_eojeol_counterc                 C   sH   || _ d| _|dkr| j}|| _| jr"td|dt   d S d S )Nr   r   z9[Noun Extractor] has been trained. #eojeols={}, mem={} Gb%.3f)	r   _num_of_covered_eojeolsto_EojeolCounterr   _num_of_eojeolsr   r2   r:   r   )r'   r   r   r)   r)   r*   rv      s   


z&LRNounExtractor_v2._train_with_lrgraphc                 C   s   t r,   )NotImplementedr-   r)   r)   r*   _extract_determiner   s   z&LRNounExtractor_v2._extract_determinerd              ?      ?c                 C   s   | j rtd |s|  }| ||}| j  t|| j| j|||||||	|
| _|r2| 	d| j | j rAtd
t| j d S d S )Nz<[Noun Extractor] batch prediction for extracting pos featurerV   z/[Noun Extractor] {} pos features were extracted)r   r2   '_noun_candidates_from_positive_features_batch_predicting_nounsr   rt   r   rR   _pos_features_extractedrl   r:   rQ   )r'   noun_candidatesr   append_extracted_featuresrq   rr   min_pos_scoremin_pos_feature_frequencymin_num_of_unique_lastcharmin_entropy_of_lastcharmin_noun_entropyprediction_scoresr)   r)   r*   r      s8   

z.LRNounExtractor_v2.extract_domain_pos_featuresc              
      s  d| _ |  }| jr| jrtd | | | |}| jr[t| jd ddd*}|	d t
| dd	 d
D ]\}}|	d||d |d  q9W d    n1 sVw   Y  | jrqdd | jj D }	| |	|}
ni }
fdd| D }||
  fdd| D }| |||
}| jrtdt|t|
 dd tdddd | | || _|r| j  dd | D }|S )Nr   z0[Noun Extractor] extract and append pos featuresz_prediction_score.logwr=   r>   znoun score frequency
c                 S   s   | d d  S )Nr
   r)   ry   r)   r)   r*   r{         z,LRNounExtractor_v2.extract.<locals>.<lambda>keyz	{} {} {}
r
   c                 S   s*   i | ]\}}t |d kr|t| qS )r   )rQ   sumvaluesrD   lrdictr)   r)   r*   
<dictcomp>
  s    z.LRNounExtractor_v2.extract.<locals>.<dictcomp>c                    "   i | ]\}}|d   kr||qS r
   r)   rD   nounscorerq   r)   r*   r         c                    r   r   r)   r   )rr   r)   r*   r     r   z>[Noun Extractor] {} nouns ({} compounds) with min frequency={}Tflushz[Noun Extractor] flushing ...  r   endc                 S   s$   i | ]\}}|t |d  |d qS )r   r
   )r   r   r)   r)   r*   r   ,  s   $ )r   r   r   r   r2   r   r   r!   rN   writesorteditemsr:   r   r   _lrextract_compoundsrO   _post_processingrQ   _check_covered_eojeols_nounsrt   )r'   rq   rr   rt   r   r   rZ   wordr   
candidates	compoundsnounsnouns_r)   )rr   rq   r*   ro      sR   





zLRNounExtractor_v2.extractc                    s    fdd|D S )Nc                    s@   g | ]\}}| j v r |r| jv r |s|qS r)   )rR   _exist_longer_posrS   _exist_longer_neg)rD   r_r'   r   r)   r*   
<listcomp>0  s
    z=LRNounExtractor_v2._get_nonempty_features.<locals>.<listcomp>r)   )r'   r   rd   r)   r   r*   _get_nonempty_features/  s   z)LRNounExtractor_v2._get_nonempty_featuresc                 C   :   t t|d ddD ]}||d  | | jv r dS q
dS Nr
   r   TF)rangerQ   rR   r'   r   r   er)   r)   r*   r   4  
   z$LRNounExtractor_v2._exist_longer_posc                 C   r   r   )r   rQ   rS   r   r)   r)   r*   r   :  r   z$LRNounExtractor_v2._exist_longer_negc              	   C   s  | j |d}| ||\}}}}}	|| }
|
dkrdn|| |
 }||kr,||	 | n||	 | }| ||}t|}|rJtd|||||	| || jkrS||fS || | | |	 }|dkre|dfS |	| jkrr||krr||fS |dksz|dkr|	| dkr||kr|| |	 }||| fS t	 }|D ]/\}}|sq|| j
v s|| jv r| ||s||d  || j
v s|| jv s||d  qt|dkr|| |	 }||| fS |dfS )Nr   r   z9pos={}, common={}, neg={}, unk={}, end={}, n_features_={}rm      )r   get_r_predictr   rQ   r2   r:   r   r   rM   rR   rT   r   add)r'   r   rq   debugrd   rV   rJ   rW   unkr   baser   support	features_n_features_sum_first_charsr   r   r)   r)   r*   predict@  sF    
$	zLRNounExtractor_v2.predictc           
      C   s   d\}}}}}|D ]=\}}	|dkr||	7 }q	|  ||rq	| ||r$q	|| jv r.||	7 }q	|| jv r8||	7 }q	|| jv rB||	7 }q	||	7 }q	|||||fS )N)r   r   r   r   r   r   )r   r   rT   rR   rS   )
r'   r   rd   rV   rJ   rW   r   r   r   freqr)   r)   r*   r   }  s"   






zLRNounExtractor_v2._predictc                    sv    fdd}i }| j D ]-}| j|dD ]#\}} s%||d| ||< q||t s-q||d| ||< qq|S )Nc                    s   | d |  kS r,   r)   )r   r   	conditionr)   r*   satisfy  s   zKLRNounExtractor_v2._noun_candidates_from_positive_features.<locals>.satisfyr   r   )rR   r   get_lgetrQ   )r'   r   r   N_from_Jr   r   cr)   r   r*   r     s   
z:LRNounExtractor_v2._noun_candidates_from_positive_featuresc                 C   s   i }t |}tt|dd dD ]I\}}| jr2|d dkr2dd|d  |  }td	||d
dd | ||\}}	||	f||< |	|krY| j|dD ]\}
}| j	||
 | qKq| jrftd|d
d |S )Nc                 S   
   t |  S r,   rQ   ry   r)   r)   r*   r{        
 z<LRNounExtractor_v2._batch_predicting_nouns.<locals>.<lambda>r   r     r   r   r
   z'  -- batch prediction {} % of {} wordsTr   r   r   z=[Noun Extractor] batch prediction was completed for {} wordsr   )
rQ   	enumerater   r   r2   r:   r   r   r   remove_eojeol)r'   r   rq   r   nir   
percentager   r   r   countr)   r)   r*   r     s.   z*LRNounExtractor_v2._batch_predicting_nounsc                    s  fdd  D td| _fdd| jj  D }t|}i }i  i }tt|  dd dD ]\}\}}	| jrU|d d	krUd
d| |  }
t	d
|
ddd | jj|ddd }| |}|rd|}|||< tfdd|D }t||d|||<  |d|	  |< tdt|D ]}|d | }||vrq||d|	 ||< q| j| q4| jrt	d
t|  fdd|  D }|| _|S )Nc                    s2   i | ]\}}|d   krt |d kr|t |qS r   r   r   r   r)   r*   r     s    z8LRNounExtractor_v2.extract_compounds.<locals>.<dictcomp>)scoresc                    s2   i | ]\}}t |d kr| vr||ddqS )r   r   r   )rQ   r   r   )noun_scoresr)   r*   r     s    c                 S   s   t | d  S )Nr   r   ry   r)   r)   r*   r{     r   z6LRNounExtractor_v2.extract_compounds.<locals>.<lambda>r   r   r   %.2fr   z  -- check compound {} %Tr   r   Fflattenr   c                 3   s     | ]}  |d d V  qdS ))r   r   r
   Nr   rD   t)r   r)   r*   	<genexpr>  s    z7LRNounExtractor_v2.extract_compounds.<locals>.<genexpr>r   z<[Noun Extractor] checked compounds. discovered {} compoundsc                    s"   i | ]\}}||  |d fqS r   r   r   )compounds_countsr)   r*   r     s    )r   r	   _compound_decomposerr   
_lr_originrQ   r   r   r   r2   r:   tokenize_parse_compoundr3   maxr   r   r   _compounds_components)r'   r   r   rq   r   compounds_scorescompounds_componentsr   r   r   r   tokenscompound_partsr   compound_scorer   subwordr   r)   )r   rq   r   r   r*   r     sF   $


z$LRNounExtractor_v2.extract_compoundsc                 C   s,   | j j|ddd }| |}|s|fS |S )NFr   r   )r   r   r   )r'   r   r   r   r)   r)   r*   decompose_compound  s   
z%LRNounExtractor_v2.decompose_compoundc                 C   s   |dd D ]}|d dkr dS qt |dkr.|d d | jv r.tdd |dd D S |d d dkr?tdd |D S dS )zCheck Noun* or Noun*JosaNr      r   c                 s       | ]}|d  V  qdS r   Nr)   r   r)   r)   r*   r         z5LRNounExtractor_v2._parse_compound.<locals>.<genexpr>c                 s   r  r  r)   r   r)   r)   r*   r      r  )rQ   rR   tuple)r'   r   tokenr)   r)   r*   r     s   z"LRNounExtractor_v2._parse_compoundc                 C   s  dd }| j r| j d nd }|r,t|ddd}|d W d    n1 s'w   Y  | jD ]W}|dkrKd	}t|| j||\}}	| jrJ||||	 q/|d
krodd | jD }
|
| j t	||
|\}}	| jrn||||	 q/|dkrt
|| j|d\}}	| jr||||	 q/|S )Nc                 S   s*   t |}|t | }td| || d S )Nz-[Noun Extractor] postprocessing {} : {} -> {})rQ   r2   r:   )methodr   removalsn_aftern_beforer)   r)   r*   print_status'  s
   
z9LRNounExtractor_v2._post_processing.<locals>.print_statusz_postprocessing.logr   r=   r>   r   r   z6## Ignore noun candidates from detaching pos features
r   c                 S   s   h | ]}|qS r)   r)   r^   r)   r)   r*   rF   A  s    z6LRNounExtractor_v2._post_processing.<locals>.<setcomp>r   )r!   )r!   rN   r   r$   r   rR   r   rO   rT   r   r   r   )r'   r   r   r   r  r!   rZ   r	  	logheaderr
  rd   r)   r)   r*   r   %  s8   
z#LRNounExtractor_v2._post_processingc           
      C   sL  | j   |  }t|}tt|dd dD ]g\}}| jr8|d dkr8dd|d  |  }td	|d
dd ||vr=q| j 	|dD ]9\}}t|dkr_| j 
|| | |  j|7  _qD|dksm|| jv sm|| jv r}| j 
|| | |  j|7  _qDq| jrtddt  dd
d dd| j | j  }	td|	d
d d S d S )Nc                 S   r   r,   r   ry   r)   r)   r*   r{   Y  r   z;LRNounExtractor_v2._check_covered_eojeols.<locals>.<lambda>r   r   r   r   r   r
   z$[Noun Extractor] flushing ...  {} %Tr   r   r   z0[Noun Extractor] flushing was done. mem={} Gb{}z                    r   r   z)[Noun Extractor] {} % eojeols are covered)r   rt   r   rQ   r   r   r   r2   r:   r   r   r   rR   rT   r   r   )
r'   r   r   r   r   r   r   r   r   coverager)   r)   r*   r   R  sF   




z)LRNounExtractor_v2._check_covered_eojeols)r   r   NTr
   r   r   TFFFNN)rm   r
   r
   Tr   )r   )
NNTrm   r   rm   r   r   r   r   )rm   r
   T)rm   Fr,   )rm   )__name__
__module____qualname__r+   propertyr.   r%   r&   rl   ru   rn   rx   rw   rv   r   r   ro   r   r   r   r   r   r   r   r   r  r   r   r   r)   r)   r)   r*   r      sR    
&

!$





'?
=


#:-r   )collectionsr   r   r4   soynlp.normalizerr   soynlp.utilsr   r   r   r   soynlp.tokenizerr	   _josar   _noun_postprocessingr   r   r   r   r   r)   r)   r)   r*   <module>   s    
