o
    
im                     @   sR  d dl Z ddlmZmZ d dlmZmZ d dl Z ddlmZ d dl	m
Z
 ddlmZ 	 i ddd	d	d
ddddddddddddddddddddddddddddd d!d"d#d$d%d&d&dd'd(d$d)Zedd$d*d d+ i d,d,d-d,d.d,d/d,d0d,d1d,d2d3d4d3d5d3d6d3d7d3d8d3d9d9d:d9d;d9d<d9d=d9i d>d9d?d@dAd@dBd@dCd@dDd@dEd@dFd?dGd?dHd?dId?dJd?dKd?dLdLdMdLdNdLdOdLi dPdLdQdLdRdSdTdSdUdSdVdSdWdSdXdSdYdRdZdRd[dRd\dRd]dRd^dRd_d`dad`dbd`i dcd`ddd`ded`dfdfdgdfdhdfdidfdjdfdkdfdldmdndmdodmdpdmdqdmdrdmdsdLdtdLi dudLdvdLdwdLdxdLdydydzdyd{dyd|dyd}dyd~dyddddddddddddddi ddddddddddddddddddddddddddddddddddi ddSddSddSddSddSddSddSddSddSddSddSddSddRddRddRddRddRi ddRddRddRddRddRddRddRddddddddddddddddddddi ddddddddddddddddddddddddddddddɓddɓddɓdddddddddd̜	Zi ddΓddΓddΓddΓddΓddΓddՓddՓddՓddՓddՓddՓddܓddܓddܓddܓddܓi ddܓddddddddddddddddddddddddddddddddi dddddyddzdd{dd|dd}dd~ddddddddddddddd dd dd i dd dd dd ddddd	dd
dddddddddddddddddddddi ddddddddddddddddddddd d!d"d!d#d!d$d!d%d!d&d!d'd(i d)d(d*d(d+d(d,d(d-d(d.d/d0d/d1d/d2d/d3d/d4d/d5d6d7d6d8d6d9d6d:d6d;d6i d<d6d=d6d>d6d?d6d@d6dAd6dBdCdDdCdEdCdFdCdGdCdHdCdIdJdKdJdLdJdMdJdNdJdJdOdOdOdOdOdOdPZi dQdRdSdRdTdRdUdRdVdRdWdRdXdRdYdRdZdRd[dRd\dRd]d^d_d^d`d^dad^dbd^dcd^i dddedfdedgdedhdedidedjdedddedkdedldedmdedndedodeddRdpdRdqdRdrdRdsdRi dtdRdud^dvd^dwd^dxd^dyd^dzd^d{d|d}d|d~d|dd|dd|dd|ddeddeddeddei ddeddeddddddddddddddddddddddddddddddi ddddddddddddddddddddddddddddddddddi ddddddddddddddddddddddddddddddddddddddddZi ddddfddgddhddiddjdddΓddΓddΓddΓddΓddΓddՓddՓddՓddՓddՓi ddՓddܓddܓddܓddܓddܓddܓddddddddddddddddddddi ddddddddddddddddddΓddΓddΓddΓddΓddΓddՓddՓddՓdddddddddd	Zd"d	d&dddddd&d	Zi d-dd.dd/dd0dd1dd4dd5dd6dd7dd8dd:dd;dd<dd=dd>ddAddBdi dCddDddEddGddHddIddJddKddMddNddOddPddQddTddUddVddWdi dXddZdd[dd\dd]dd^ddaddbddcdddddeddgddhddiddjddkddnddddddddddd	ZdddddddZdddddddZi d,dddddddd?dLddddd!ddLd͓d$dd&dddydddddRdd"dddd,d dddldddddd
Zi d,d,d9d-d2dadd	dd
ddddd?d?dFdFddddd!ddLdLd$dd&dddddi dddRdRdYdYd_d_d"dddddddd	ddfdfdldlddd*ddddsdLd dZg dZe ejZe dej d e jZe d d d! ejD e jZ!d"d# Z"d$d% Z#d&d' Z$G d(d) d)Z%e%egZ&dS (*      N   )G2P
LINK_REGEX)tokenizeregex_tokenize)	ViCleaner)deque)MTokenbtth   tʰu   đdchckhxg   ɣlmnngh   ŋnh   ɲngphfvszhpkwj   ʈk   ʐ   ʂ)r"   r#   qugitrr'   r   ghrr    r+   w)r   r%   r/   r!   a   á   àu   ả   ãu   ạ   â   ɤ̆u   ấu   ầu   ẩu   ẫu   ậ   ău   ắu   ằu   ẳu   ẵu   ặe   ɛ   é   èu   ẻu   ẽu   ẹ   êu   ếu   ều   ểu   ễu   ệi   í   ìu   ỉu   ĩu   ịo   ɔ   ó   òu   ỏ   õu   ọ   ôu   ốu   ồu   ổu   ỗu   ộu   ơ   ɤu   ớu   ờu   ởu   ỡu   ợu   ú   ùu   ủu   ũu   ụ   ư   ɯu   ứ   ừ   ử   ữ   ựy   ý   ỳ   ỷ   ỹ   ỵeou   éou   èou   ẻou   ẽou   ẹou   êu   ɛuu   ếuu   ềuu   ểuu   ễuu   ệuia   iəu   íau   ìau   ỉau   ĩau   ịau   iáu   iàu   iảu   iãu   iạu   iêu   iếu   iều   iểu   iễu   iệoou   óou   òou   ỏou   õou   ọou   oóu   oòu   oỏu   oõu   oọu   ôôu   ốôu   ồôu   ổôu   ỗôu   ộôu   ôốu   ôồu   ôổu   ôỗu   ôộua   uəu   úau   ùau   ủau   ũau   ụau   uôu   uốu   uồu   uổu   uỗu   uộu   ưa   ɯəu   ứau   ừau   ửau   ữau   ựau   ươu   ướu   ườu   ưởu   ưỡu   ượu   yê   iɛu   yếu   yề)	u   yểu   yễu   yệ   uơ   uở   uờr_      uỡ   uợaiaju   áiu   àiu   ảiu   ãiu   ạiay   ăju   áyu   àyu   ảyu   ãyu   ạyaoawu   áou   àou   ảou   ãou   ạoau   ăwu   áuu   àuu   ảuu   ãuu   ạuu   ây   ɤ̆ju   ấyu   ầyu   ẩyu   ẫyu   ậyu   âu   ɤ̆wu   ấuu   ầuu   ẩuu   ẫuu   ậuewiuiwu   íuu   ìuu   ỉuu   ĩuu   ịuoi   ɔju   óiu   òiu   ỏiu   õiu   ọiu   ôioju   ốiu   ồiu   ổiu   ỗiu   ộiuiuju   úiu   ùiu   ủiu   ũiu   ụiuy   ʷiu   úyu   ùyu   ủyu   ũyu   ụyu   uýu   uỳu   uỷu   uỹu   uỵu   ơi   ɤju   ớiu   ờiu   ởiu   ỡiu   ợiu   ưi   ɯju   ứiu   ừiu   ửiu   ữiu   ựiu   ưu   ɯwu   ứuu   ừuu   ửuu   ữuu   ựuu   iêu   iəwu   iếuu   iềuu   iểuu   iễuu   iệuu   yêuu   yếuu   yềuu   yểuu   yễuu   yệuu   uôi   uəju   uốiu   uồiu   uổiu   uỗiu   uộiu   ươi   ɯəju   ướiu   ườiu   ưởiu   ưỡi   ɯəw)u   ượiu   ươuu   ướuu   ườuu   ưởuu   ưỡuu   ượuoa   ʷau   oáu   oàu   oảu   oãu   oạu   óau   òau   ỏau   õau   ọau   oă   ʷău   oắu   oằu   oẳu   oẵu   oặoe   ʷɛu   oéu   oèu   oẻu   oẽu   oẹu   óeu   òeu   ỏeu   õeu   ọeu   uáu   uàu   uảu   uãu   uạu   uău   uắu   uằu   uẳu   uẵu   uặu   uâ   ʷɤ̆u   uấu   uầu   uẩu   uẫu   uậueu   uéu   uèu   uẻu   uẽu   uẹu   uê   ʷeu   uếu   uều   uểu   uễu   uệr^      ʷɤu   uớr`   r_   ra   rb   uya   ʷiəu   uyáu   uyàu   uyảu   uyãu   uyạu   uyêu   uyếu   uyều   uyểu   uyễu   uyệuyu   ʷiuu   uyúu   uyùu   uyủu   uyũu   uyụu   uýuu   uỳuu   uỷuu   uỹuu   uỵuoen   ʷenu   oénu   oènu   oẻnu   oẽnu   oẹnoet   ʷet)u   oétu   oètu   oẻtu   oẽtu   oẹt   ɛjoaiu   oáiu   oàiu   oảiu   oãiu   oạioayu   oáyu   oàyu   oảyu   oãyu   oạyoaou   oáou   oàou   oảou   oãou   oạooeou   oéou   oèou   oẻou   oẽou   oẹou   óeou   òeou   ỏeou   õeou   ọeoueou   uéou   uèou   uẻou   uẽou   uẹouaiu   uáiu   uàiu   uảiu   uãiu   uạiuayu   uáyu   uày)	u   uảyu   uãyu   uạyu   uâyu   uấyu   uầyu   uẩyu   uẫyu   uậy   tʃ)	r#   r   r   r   r   r   r   r   r'                  )	rK   rL   rM   rN   rP   rQ   rR   rS   rT   zi)r+   u   gí   gìr   u   gĩu   gịkwi)quyu   qúyu   qùyu   qủyu   qũyu   qụybisiu   điu   épgiyu   hếchu   giâyu   câyemenpiqkiuu   éttidiuviu
   đắp liuu   ítquaiu   giét)
r.   r    r   rF   rI   r   r/   r   rO   r!   u   bêu   cêu   dêu   đêu   phờu   gờu   hờkau   lờu   mờu   nờu   pờr   r.   u   rờu   sờu   tờu   xờgia)r|   rk   r   rl   r}   r   rz   r{   r   u   tʰwr   r   r      ŋ͡m   k͡pcwjwr[   u   eəbwrr   rv   vwrj   u   ʈwu   ʂwu   aʊfwrV   r   r   u   ɔɪxwr   r5   u   ŋwu   ʊər   r   dwu   eɪu   aɪrm   rX   u   ɣwzwrx   r   ry   rw   u   ɔ:u   əʊr   mwu   ɑ:hwrq   rt   lwu   ɪərf   zu:rh   r   ro   rd   u   ɜ:r$   nwu   ɲwrU   swtwu   ʐwr]   r   zi:r\   r   u   θu   ʌr   r/   1u   ɪrJ   r   r#   u   ərF   r?   3r   !   ðu   ʧ6u   ʒr(   r!   r   r   r6      ærE   2u   ʤr<   .r
   r"   r   r)   r@   r8   r'   r   5 r   r%   r   r&   ,4u   ʊr    r   r0   u   ʃ?r.   :r   ;r7   r   '   –u   ^[!-~“”–z]+$|c                 c   s"    | ]}|  st|V  qd S )N)isasciireescape).0r    r   =/home/ubuntu/.local/lib/python3.10/site-packages/misaki/vi.py	<genexpr>   s     r   c                    s  t tttttttf\}}}}	}
}}}|s|rt}| d}d}d}d}d}d}t	}|dkrdd |v rB|dd  }d}n!dd |v rU|dd  }d}nd |v rc|d  }d}|d | |v rz||d |  }d}n|d  |v r||d   }d}dd |v r|rt	dkrd}d}n|||  }||v r|dkr|dkrǈd |vrd||  }n|| }n|| }n|| }n||	v r|d	kr|	| }|r|d
 }nod
}nl||	v r|d	kr|	| }n_||v r|| d }|| dd }|d	kr|r|d
 }n?d
}n<||
v r+|
| d }|
| d d }n(|v r=| d }| d }n|v rQ| d d }| d }ndS |dkr|dkru|dkri|dkrid}|dkru|dkrud}|dvr|dkrd}n|dkr|dv r|dkrd}|dkr|dkr|dv rd}n'|dv r|dkrd}|dkrd}n|dv r|dkrd}|dkrd}|dkr|dv r|dkrd}|dkrd}|dkrd} fdd t
d|D }|rt|t	|d  }n|s|s|dkrtd!}n	td"}ntd#}|dkr|dks.|dkr:|d$kr:|d%v r:d&}|dkrD|d'ksN|dkrU|d(krU|d%v rUd)}|dkrr|d*krf|d%v rfd+}|d,krr|d%v rrd-}|d.v r|dkr~d/}|dkrd0}||||fS d S )1N r   r   r   r   r<   r!   u   ʔr$   r/   )NNNNr   r0   r'   r8   r   )r<   r7   r8   r   )r<   r7   r   r   )
rX   r\   r[   rF   rJ   rE   r?   r@   r6   r5   r    )r   r#   rX   r[   rF   r\   rJ   c                    s$   g | ]}|  v r |  qS r   r   )r   r<   toneswordr   r   
<listcomp>  s   $ ztrans.<locals>.<listcomp>3533r   21g)r#   r   r'   21241345r   5br   6b)rF   r?   r@   r   r   )
Cus_onsets
Cus_nuclei	Cus_codasCus_onglidesCus_offglidesCus_onoffglidesCus_quCus_giCus_tones_plenrangestr)r   dialectglottalphamcaopalatalsonsetsnucleicodasonglides	offglidesonoffglidesr*   r+   tones_ponsnuccodtonoOffsetcOffsetr   nucltonelistr   r   r   trans   s   $
 

























(2



 r  c                 C   s   d}d}d}	d}
d}z/t | |||||\}}}	}
d|||	|
fv r)d|  d }W |S |dd |||	|
fD }W |S  tyC   Y |S w )z,Convert a single orthographic string to IPA.r   r   N[]c                 s   s    | ]}|pd V  qdS )r   Nr   )r   r#   r   r   r   r   O  s    zconvert.<locals>.<genexpr>)r  join	TypeError)r   r   r   r   r   r   delimitr   r   r  r  seqr   r   r   convertA  s    r  c           
      C   s   d}| dkr
t  } | jdtd d}d}t|D ]M\}}d}|dkr(|d }q| D ]*}	t|	t||d  krT|	|||t|	  krT|||	 7 }d}t|	d } nq*|dkrft|dv r`q||| 7 }q| | S )	Nr   defaultT)reversekeyr   r   r   )u   ˈu   ˌ*)vi_symscopysortr   	enumerater   rstrip)
	listParsetextr  undefine_symboloutputskipiccharcheckr   r   r   r   ParsingW  s0   0r   c                   @   s<   e Zd Zddddddddddi fddZdd Zdd	 Zd
S )VIG2Pnorthr   Tc                 C   s   || _ || _|| _|| _|| _|| _|dv r|d | _ntd| | jdkr,d| _nd| _d|d< |r<td
i |ndd | _	t
|	|
d	| _d S )N)r"  centralsouthr   zVietnamese dialect r      ❓unkc                 S   s   dg fS )Nr%  r   _r   r   r   <lambda>  s    z VIG2P.__init__.<locals>.<lambda>)
clean_abbrclean_acronymr   )r   r   r   r   substr_tokenize	tone_typer   NotImplementedErrorr   en_g2pr   cleaner)selfr   r   r   r   r   r,  r-  enable_en_g2pr*  r+  en_g2p_kwargsr   r   r   __init__t  s   
zVIG2P.__init__c              	      s  d|vr
d|fgS    kr*tdurtnt  fdd  D S }  tdu rH\}}d|vrHd|fgS jsQd|fgS t }t }t }rt	dkr}	t
t|	|	jjjjjd}
|| ||	 ||
 n]d\}}tt	d d	d	D ]-}|d }t	|dkr|nt||}t
|jjjjjd}
d|
vr|}|
}q|d	kr|| || ||d  d| nns\tt|||S )
uC  
        Approximation of foreign name pronunciation
        Return (parent, text, phonemes)
        Example:
            Y:  /i/
            
            Blôk: 
            - k -> /k/
            - ôk -> /ok͡p1/
            - lôk -> /lok͡p1/
            - Blôk -> ❌
            - B -> Bờ -> bɤ2
            => /bɤ2 lok͡p1/
            
            Êban:
            - n -> /nɤ2/
            - an -> /an1/
            - ban -> /ban1/
            - Êban -> ❌
            - Ê -> /e1/
            => /e1 ban1/
        
        => /i bɤ2 lok͡p1 e1 ban1/
        r  Nc                    s8   g | ]}|t  ||jjjjjd fqS )/)r  getr   r   r   r   r   )r   r  mappingr1  tkr   r   r     s    $z$VIG2P.substr2ipa.<locals>.<listcomp>r%  r   r5  )r   r   r   )lowerupperVI_ONLYsearchVIENr/  r,  r   r   r  r6  r   r   r   r   r   
appendleftr   listzip)r1  r9  ipaorig_tkengr(  parentspartssub_ipar  _ipastartconverted_ipar<   tkcsub_tkr   r7  r   
substr2ipa  s^    




zVIG2P.substr2ipac                 C   s  | j r|dddd}i }tt|D ]}| \}}|dd||  < ||d|}q| j	
|}t|}g }|D ]}||ddd	d
  qB|}g }	g }
|D ]}t|d u r}|	d| d  |
t|ddd| d  q\|dv r|dv rd}|	| |
t|dd| q\|dv rd}|	| |
t|dd| q\|dv r|dv rd}|	| |
t|dd| q\||  }|d ur|	| |
t|dd| q\t| | j| j| j| j| jd}| ||}|D ]?\}}}d|v r|d\}}}}d||||g}n	dgd \}}}}|	| |
t|dd|tj|||||dd qq\d|	|
fS )Nr(  r   -r5  r   r   r   z . r   z , r  r	  )	r   r   r   r   r   r   )}r	  )rP  rQ  r	  rP  )({r  rR  )"r   r      “   ”)rU  rV  rT  r   )parentr   r   r   toner'  )r,  replacer   finditerr   groupsr:  stripgroupr0  
clean_textr   extendsplitEN_VI_REGEXmatchappendr	   r6  r  r   r   r   r   r   rN  r
  
Underscore)r1  r  custom_dictr   r   custom_phonemeTNTKnew_TKIPAmtokensr9  
custom_ipa	first_tryparent_tk_ipasrW  rC  r   r   r   rX  r   r   r   __call__  st    



"

zVIG2P.__call__N)__name__
__module____qualname__r4  rN  ro  r   r   r   r   r!  s  s    

Rr!  )'r   r   r   r   "underthesea.pipeline.word_tokenizer   r   
vi_cleanerr   collectionsr   tokenr	   r   updater   r   r   r   r   r   r   r   r?  r>  r  compilenumberNUMBER_REGEXVIETNAMESE_CHARACTERS_LOWER
IGNORECASEra  r
  r<  r  r  r   r!  __all__r   r   r   r   <module>   s    
&						





&						






>						






>
*					


  
& ( .