o
    Qib>                     @   s|   d dl Z e jdkree  e d d dlmZ d dlZd dlZG dd dZG dd dZ	G d	d
 d
Z
G dd dZdS )    N)      zutf-8)pprintc                   @   s2   e Zd Zdd ZdddZdddZdd	d
ZdS )RegexTokenizerc              	   C   sf   dt dt jfdt dt jfdt dt jfdt dt jfd	t d
t jfg| _t d| _d S )Nnumberz[-+]?\d*[\.]?[\d]+|[-+]?\d+koreanu
   [가-힣]+jaumu
   [ㄱ-ㅎ]+moumu
   [ㅏ-ㅣ]+zenglish & latinu&   [a-zA-ZÀ-ÿ]+[[`']?s]*|[a-zA-ZÀ-ÿ]+z\s+)recompileUNICODE	_patternsdoublewhite_patternself r   O/home/ubuntu/.local/lib/python3.10/site-packages/soynlp/tokenizer/_tokenizer.py__init__   s   zRegexTokenizer.__init__Tc                 C      |  |||S Ntokenize)r   sdebugflattenr   r   r   __call__      zRegexTokenizer.__call__Fc                    .    fdd|  D }|rdd |D }|S )u  
        Usage
        
        s = "이거에서+3.12같은34숫자나-1.2like float해해 같은aÀÿfafAis`s-1찾아서3.1.2.1해ㅋㅋㅜㅠ봐 Bob`s job.1"
        tokenizer = RegularTokenizer()
        tokenizer.tokenize(s)

        [['이거에서', '+3.12', '같은', '34', '숫자나', '-1.2', 'like'],
         ['float', '해해'],
         ['같은', 'aÀÿfafAis`s', '-1', '찾아서', '3.1', '.2', '.1', '해', 'ㅋㅋ', 'ㅜㅠ', '봐'],
         ['Bob`s'],
         ['job', '.1']]
        c                    s   g | ]} | qS r   	_tokenize.0tr   r   r   r   
<listcomp>)   s    z+RegexTokenizer.tokenize.<locals>.<listcomp>c                 S      g | ]}|D ]}|r|qqS r   r   r!   tokensubtokenr   r   r   r$   +       split)r   r   r   r   tokensr   r#   r   r      s   zRegexTokenizer.tokenizec                 C   s   | j D ]i\}}||}|sq|rtd|  t| |d}t|}d}d}	t|D ]=\}
}|	|
kr5q,||
|
|  |kre|d||
|
|   7 }|
| }	|s[|||	d  7 } n|d}t|}q,||7 }q,|}q| jd| 	 }|S )Nz
%sr    z %s  )
r   findallprintpoplen	enumerater   substripr+   )r   r   r   namepatternfoundsfound	len_founds_bicr   r   r   r   .   s6   



zRegexTokenizer._tokenizeNTTFTF)__name__
__module____qualname__r   r   r   r   r   r   r   r   r      s
    

r   c                   @   s*   e Zd ZdddZdddZdd	d
ZdS )
LTokenizerN        c                 C   s   |r|ni | _ || _d S r   )_scores_ds)r   scoresdefault_scorer   r   r   r   Y   s   
zLTokenizer.__init__TFc                 C   s   |  ||||S r   r   )r   sentence	tolerancer   remove_rr   r   r   r   ]   s   zLTokenizer.__call__c                    sV   d	 fdd	fdd|  D }|rdd |D }|r)|dkr)dd |D }|S )
NrF   c                    s   t }|dkrdfS fddtd|d D }fdd|D }dkrFtdd |D   fd	d|D }t|d
d ddd }nt|dd ddd }|d |d fS )Nr   r-   c                    s$   g | ]} d |  |d  fqS r   r   )r!   e)r'   r   r   r$   e      $ z<LTokenizer.tokenize.<locals>.token_to_lr.<locals>.<listcomp>   c                    s.   g | ]} j |d   j|d  |d fqS )r   rP   rG   getrH   r    r   r   r   r$   f   s   . r   c                 S      g | ]}|d  qS r   r   r!   r>   r   r   r   r$   h       c                    s    g | ]} |d   kr|qS rT   r   rU   )	max_scorerL   r   r   r$   i   s     c                 S   s   t | d S NrP   r2   xr   r   r   <lambda>j   s    z:LTokenizer.tokenize.<locals>.token_to_lr.<locals>.<lambda>T)keyreversec                 S   s   | d t | d fS Nr   rP   rY   rZ   r   r   r   r\   l   rV   )r2   rangemaxsorted)r'   rL   length
candidatesbestr   )rW   r'   rL   r   token_to_lrb   s   z(LTokenizer.tokenize.<locals>.token_to_lrc                    s   g | ]} |qS r   r   r!   r'   )rf   rL   r   r   r$   o       z'LTokenizer.tokenize.<locals>.<listcomp>c                 S   rS   rT   r   rg   r   r   r   r$   r   rV   Fc                 S   r%   r   r   r&   r   r   r   r$   u   r)   )rF   r*   )r   rK   rL   r   rM   r,   r   )r   rf   rL   r   r   `   s   zLTokenizer.tokenize)NrF   )rF   TF)rB   rC   rD   r   r   r   r   r   r   r   rE   W   s    

rE   c                   @   s\   e Zd ZdddZdddZdd	d
ZdddZdd Zdd Zdd Z	dd Z
dd ZdS )MaxScoreTokenizerN
   rF   c                 C   s   |r|ni | _ || _|| _d S r   )rG   _max_lengthrH   )r   rI   
max_lengthrJ   r   r   r   r   |   s   
zMaxScoreTokenizer.__init__Tc                 C   s   |  ||S r   r   )r   rK   r   r   r   r   r      s   zMaxScoreTokenizer.__call__c                    s,    fdd|  D }|rdd |D }|S )Nc                    s   g | ]}  |qS r   )_recursive_tokenizerg   r   r   r   r$      rh   z.MaxScoreTokenizer.tokenize.<locals>.<listcomp>c                 S   s   g | ]}|D ]}|d  qqS rT   r   r&   r   r   r   r$      r)   r*   )r   rK   r   r,   r   r   r   r      s   zMaxScoreTokenizer.tokenizer   Fc                 C   s   t |}|dkr|d|| j|fgS |dkrt| j|}| |||}|r(t| | |}| ||}|d d |krC|| ||7 }|d d dkrS|| 	||7 }t
|| dd dS )Nr   r   rP   c                 S      | d S rX   r   rZ   r   r   r   r\          z7MaxScoreTokenizer._recursive_tokenize.<locals>.<lambda>r]   )r2   rH   minrk   _initializer   _find_add_inter_subtokens_add_last_subtoken_add_first_subtokenrb   )r   r'   range_lr   rc   rI   resultaddsr   r   r   rm      s   
z%MaxScoreTokenizer._recursive_tokenizec           
   	   C   s~   g }t d|d D ]-}t d|d D ]#}|| }||krq||| }| j|| j}	|||||	|f qq	t|dd dS )Nr   rP   r   c                 S   s   | d  | d  | d fS )N      rP   r   rZ   r   r   r   r\          z/MaxScoreTokenizer._initialize.<locals>.<lambda>rq   )r`   rG   rR   rH   appendrb   )
r   r'   rx   rc   rI   r<   rrN   r(   scorer   r   r   rs      s   
zMaxScoreTokenizer._initializec                 C   s   g }d}|rW| d\}}}}}||||||f |sn:g }	t|D ]\}
\}}}}}||k r4||k s<||k rA||krA|	|
 q#t|	D ]}
||
= qF|d7 }|dkrUn|st|dd dS )Nr   rP   d   c                 S   ro   rX   r   rZ   r   r   r   r\      rp   z)MaxScoreTokenizer._find.<locals>.<lambda>rq   )r1   r~   r3   reversedrb   )r   rI   ry   num_iterwordr<   rN   r   r   removalsr=   _1b_e__2_3r   r   r   rt      s$    

zMaxScoreTokenizer._findc           	   	   C   sz   g }t |d d D ]0\}}|d ||d  d krq
|d }||d  d }||| }||||| j|| f q
|S )Nrn   r   rP   )r3   r~   rH   )	r   r'   ry   rz   r=   baser<   rN   r(   r   r   r   ru      s   z&MaxScoreTokenizer._add_inter_subtokensc                 C   s8   |d d }|d| }| j || j}|d|||fgS r_   rQ   )r   r'   ry   rN   r(   r   r   r   r   rw      s   z%MaxScoreTokenizer._add_first_subtokenc                 C   s@   |d d }||d  }| j || j}||t||t|fgS )Nrn   r   )rG   rR   rH   r2   )r   r'   ry   r<   r(   r   r   r   r   rv      s   z$MaxScoreTokenizer._add_last_subtoken)Nrj   rF   )T)r   F)rB   rC   rD   r   r   r   rm   rs   rt   ru   rw   rv   r   r   r   r   ri   z   s    



ri   c                   @   s   e Zd Z					d$ddZd%ddZd&d
dZd'ddZdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zd d! Zd"d# ZdS )(MaxLRScoreTokenizerN333333?      ?c                    sp  dd  |r|ni | _  fdd| j  D | _|r|| j ni i f\| _| _|s+i }|s/i }t|tks<dd |D }| j| t|tksOdd |D }| j| |rY|ni | _|r`|ni | _	| jD ]}|| jvrrd| j|< qf| j	D ]}|| jvrd| j|< qv| jrt
dd	 | jD nd
| _| jrt
dd	 | jD nd
| _t| jd| _|| _|| _|	| _|
| _d S )Nc                    s"   t |    fdd|  D S )Nc                    s   i | ]	\}}||  qS r   r   )r!   r   r>   sum_r   r   
<dictcomp>   r}   z>MaxLRScoreTokenizer.__init__.<locals>.norm.<locals>.<dictcomp>)sumvaluesitems)rdictr   r   r   norm   s   z*MaxLRScoreTokenizer.__init__.<locals>.normc                    s   i | ]	\}}| |qS r   r   )r!   lr   r   r   r   r      r}   z0MaxLRScoreTokenizer.__init__.<locals>.<dictcomp>c                 S      i | ]}|d qS       ?r   )r!   r   r   r   r   r          c                 S   r   r   r   )r!   r   r   r   r   r      r   r   c                 s       | ]}t |V  qd S r   rY   r!   wr   r   r   	<genexpr>      z/MaxLRScoreTokenizer.__init__.<locals>.<genexpr>r   c                 s   r   r   rY   r   r   r   r   r     r   )rI   )lrgraphr   lrgraph_normDlDrtypedictupdatePlPrra   lmaxrmaxri   base_tokenizermax_lscore_differencemax_lscore_diffratioensurable_score_lensurable_score_lr_diff)r   r   r   preference_lpreference_rr   tokenizer_builderr   r   r   r   r   r   r   r   r   r      s:   





  
zMaxLRScoreTokenizer.__init__Tc                 C   r   r   r   )r   sentr   r   r   r   r   r     r   zMaxLRScoreTokenizer.__call__Fc                    r   )Nc                    s   g | ]
}|r | qS r   r   r    r#   r   r   r$     s    z0MaxLRScoreTokenizer.tokenize.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r   r   )r!   wordsr   r   r   r   r$     r}   r*   )r   r   r   r   sent_r   r#   r   r     s   zMaxLRScoreTokenizer.tokenizec                 C   sj   |  |}| |}| |}| |}|r| ||}n| |d}|s3dd |D }dd |D }|S )Nr   c                 S   s$   g | ]}|d  df|d dfgqS )r   LrP   Rr   )r!   pr   r   r   r$   *  rO   z1MaxLRScoreTokenizer._tokenize.<locals>.<listcomp>c                 S   s"   g | ]}|D ]}|d  r|qqS rT   r   )r!   r   r   r   r   r   r$   +     " )rs   _remove_l_subset_score
_find_best_postprocessing_base_tokenizing_subword)r   r"   r   rd   candidates_rI   re   postr   r   r   r     s   



zMaxLRScoreTokenizer._tokenizec                 C   s   |  |}| ||}|S r   )_initialize_L_initialize_LR)r   r"   rd   r   r   r   rs   .  s   
zMaxLRScoreTokenizer._initializec              	   C   sp   t |}g }t|D ]+}t|d t||| j d D ]}||| }|| jvr)q|||||| g qq
|S rX   )r2   r`   rr   r   r   r~   )r   r"   nrd   r<   rN   r   r   r   r   r   3  s   "
	z!MaxLRScoreTokenizer._initialize_Lc                 C   s   t |}g }|D ]?\}}}}tt| j|| d D ],}	|dkr%|	dkr%q||||	  }
|
r5|
| jvr5q|||
||||	 ||	||	 g qqt|dd dS )NrP   r   c                 S   ro   )Nr|   r   rZ   r   r   r   r\   U  rp   z4MaxLRScoreTokenizer._initialize_LR.<locals>.<lambda>rq   )r2   r`   rr   r   r   r~   rb   )r   r"   rd   r   expandedr   r<   rN   len_llen_rr   r   r   r   r   B  s(   z"MaxLRScoreTokenizer._initialize_LRc           	      C   s  |D ]}| | j|d d | | j|d d qt|dd d}g }|r|d}|d |d |d }}}d	}|D ]<}|d |ksZ|d |k sZ|d |k s[|d |ks[q@||d  | jk sx| jd
 |k r||d |d d  | jk r|d} nq@|s| | |s)|S )Nr   rP   c                 S   
   | d  S )Nr   rZ   r   r   r   r\   [     
 z6MaxLRScoreTokenizer._remove_l_subset.<locals>.<lambda>rq   r   r{   r   Fr   gh㈵>T)	r~   r   rR   r   rb   r1   r   r   r   )	r   rd   r>   r   re   r<   rN   lscoreexist_longerr   r   r   r   W  s,   
0
z$MaxLRScoreTokenizer._remove_l_subsetc              
   C   s8  ddl m} |dd }|D ]}||d  | qt|}g }t|dd d}|r|d}|\
}}}}	}
}}}}}|rvd}t|	|
D ].}|rJ n)||g D ]!}|d	 | j|d d | }| j	|d	 ksm|| j
krqd
} nqPqD|rvq(|s||d n|| | j|d | j|d }|| || |s*|S )Nr   )defaultdictc                   S   s   g S r   r   r   r   r   r   r\   t  s    z,MaxLRScoreTokenizer._score.<locals>.<lambda>r   c                 S   s"   | d  | d  | d | d  fS )Nr   rn   r      r   rZ   r   r   r   r\   {  r   rq   Fr   T)collectionsr   r~   r   rb   r1   r`   rR   r   r   r   r   )r   rd   r   begin_to_wordsr>   scoredr   r   p0p1p2r   r   len_lrscore_lscore_roverlapppedr<   r   
score_difftotal_scorer   r   r   r   q  s:   
0

zMaxLRScoreTokenizer._scorec                    s   g }t |dd d}|r9||d |d d |d d   fdd	t|D }t|D ]}||= q1|st |d
d dS )Nc                 S   r   )Nrn   r   rZ   r   r   r   r\     r   z0MaxLRScoreTokenizer._find_best.<locals>.<lambda>rq   r   rn   r   r|   c                    s,   g | ]\}} |d  k r|d kr|qS )r|   r   r   )r!   r=   r>   r<   rN   r   r   r$     s   , z2MaxLRScoreTokenizer._find_best.<locals>.<listcomp>c                 S   ro   Nr   r   rZ   r   r   r   r\     rp   )rb   r~   r1   r3   r   )r   rI   re   sorted_r   idxr   r   r   r     s   zMaxLRScoreTokenizer._find_bestc                 C   s   t |}g }|r|d d dkr|| ||7 }|r+|d d |k r+|| |||7 }|| ||7 }dd |D | }t|dd d	S )
Nr   r   rn   r{   c                 S   s   g | ]}|qS r   r   r   r   r   r   r$     s    z7MaxLRScoreTokenizer._postprocessing.<locals>.<listcomp>c                 S   ro   r   r   rZ   r   r   r   r\     rp   z5MaxLRScoreTokenizer._postprocessing.<locals>.<lambda>rq   )r2   _add_first_subword_add_last_subword_add_inter_subwordsrb   )r   r"   r   r   rz   r   r   r   r   r     s   z#MaxLRScoreTokenizer._postprocessingc           	      C   sp   g }t |d d D ]+\}}|d ||d  d krq
|d }||d  d }||| }|| ||7 }q
|S )Nrn   r|   rP   r   )r3   r   )	r   r"   r   rz   r=   r   r<   rN   subwordr   r   r   r     s   z'MaxLRScoreTokenizer._add_inter_subwordsc                 C   s$   |d d }||d  }|  ||S )Nrn   r{   r   )r   r"   r   r   r<   r   r   r   r   r        z%MaxLRScoreTokenizer._add_last_subwordc                 C   s$   |d d }|d| }|  |dS )Nr   r   r   )r   r"   r   rN   r   r   r   r   r     r   z&MaxLRScoreTokenizer._add_first_subwordc                 C   s   | j |}g }d}|D ]:}t|}|| jv r1|d||| || || d||d| j j| g
 q||d|| || || |d|ddg
 q|S )Nr   r-   )r   r   r2   r   r~   rI   )r   r"   r<   r   words_r   r   r   r   r   r   r     s   
4,z,MaxLRScoreTokenizer._base_tokenizing_subword)
NNNNNNr   r   r   r   r?   r@   rA   )rB   rC   rD   r   r   r   r   rs   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      s*    

1

"r   )sysversion_inforeloadsetdefaultencodingr   r
   numpynpr   rE   ri   r   r   r   r   r   <module>   s   

L#i