o
    ´©i‚  ã                   @   sª   d dl Z d dlZd dlZd dlmZ d dlmZ e  d¡Zdd„ Z	dd„ Z
d	d
„ Zdefdd„Zddefdd„Zdefdd„Zdedefdd„Zdd„ ZG dd„ dƒZdS ) é    N)Úpartial)Ú
Normalizerz[\u4e00-\u9fff]+c                 C   s   t t | ¡ƒS ©N)ÚboolÚchinese_char_patternÚsearch©Útext© r
   úO/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/utils/text_normalize.pyÚcontains_chinese   s   r   c                 C   s@   |   dd¡} |   dd¡} |   dd¡} |   dd¡} |   d	d
¡} | S )Nõ   Â²u   å¹³æ–¹õ   Â³u   ç«‹æ–¹u   âˆšu   æ ¹å·u   â‰ˆu	   çº¦ç­‰äºŽú<u   å°äºŽ©Úreplacer   r
   r
   r   Úreplace_corner_mark   s   r   c                 C   sL   |   dd¡  dd¡} |   dd¡  dd¡} |   dd¡  dd¡} |   dd¡} | S )	Nu   ï¼ˆú u   ï¼‰u   ã€u   ã€‘ú`Ú u   â€”â€”r   r   r
   r
   r   Úremove_bracket   s
   r   r	   c                 C   sœ   g }d }t | ƒD ](\}}| ¡ s*|d ur$| | ||… ¡}| |¡ d }| |¡ q|d u r0|}q|d urI|t| ƒk rI| | |d … ¡}| |¡ d |¡S )Nr   )Ú	enumerateÚisdigitÚnumber_to_wordsÚappendÚlenÚjoin)r	   Úinflect_parserÚnew_textÚstÚiÚcÚnum_strr
   r
   r   Úspell_out_number#   s    
€

r#   ÚzhéP   é<   é   Fc                    s¬  dt f‡ ‡fdd„}dt f‡ ‡‡fdd„}ˆ dkrg d¢}	ng d¢}	|r+|	 d	d
g¡ d}
g }t| ƒD ]F\}}||	v ryt| |
|… ƒdkrP| | |
|… | ¡ |d t| ƒk ru| |d  dv ru| d¡}| || |d   ¡ |d }
q3|d }
q3t|ƒdkr“ˆ dkrŒ| | d ¡ n| | d ¡ g }d}|D ]}||| ƒ|kr°||ƒ|kr°| |¡ d}|| }q™t|ƒdkrÔ||ƒrÏt|ƒdkrÏ|d | |d< |S | |¡ |S )NÚ_textc                    s   ˆ dkrt | ƒS t ˆ| ƒƒS ©Nr$   ©r   ©r(   )ÚlangÚtokenizer
   r   Úcalc_utt_length;   s   z(split_paragraph.<locals>.calc_utt_lengthc                    s$   ˆ dkr
t | ƒˆk S t ˆ| ƒƒˆk S r)   r*   r+   ©r,   Ú	merge_lenr-   r
   r   Úshould_mergeA   s   z%split_paragraph.<locals>.should_merger$   )
õ   ã€‚u   ï¼Ÿu   ï¼u   ï¼›u   ï¼šu   ã€Ú.ú?ú!ú;)r3   r4   r5   r6   ú:u   ï¼Œú,r   é   )ú"u   â€éÿÿÿÿé   r2   r3   r   )ÚstrÚextendr   r   r   Úpop)r	   r-   r,   Útoken_max_nÚtoken_min_nr0   Úcomma_splitr.   r1   Úpouncr   Úuttsr    r!   ÚtmpÚ
final_uttsÚcur_uttÚuttr
   r/   r   Úsplit_paragraph:   sH   
 

€


þrI   c                 C   s~   g }t | ƒD ]3\}}|dkr4| |d   ¡ r3| |d  dkr3| |d   ¡ r3| |d  dkr3| |¡ q| |¡ qd |¡S )Nr   r9   r   )r   Úisasciir   r   )r	   Úout_strr    r!   r
   r
   r   Úreplace_blanko   s    ÿ
€
rL   Úmd_textÚreturnc                 C   sŽ   t jdd| t jd} t  dd| ¡} t  dd| ¡} t  dd| ¡} t jdd| t jd} t  d	d| ¡} t jd
d| t jd} t  dd| ¡} |  ¡ } | S )Nz	```.*?```r   ©Úflagsz`[^`]*`z!\[[^\]]*\]\([^\)]+\)z\[([^\]]+)\]\([^)]+\)z\1z
^(\s*)-\s+z<[^>]+>z
^#{1,6}\s*z\n\s*\nÚ
)ÚreÚsubÚDOTALLÚ	MULTILINEÚstrip)rM   r
   r
   r   Úclean_markdownz   s   rW   c                 C   sH   t | ƒ} tjdtjd d| ¡} |  dd¡} |  dd¡} |  dd¡} | S )	Nz&\p{Emoji_Presentation}|\p{Emoji}\uFE0FrO   r   rQ   r   ú	r:   u   \â€œ)rW   ÚregexÚcompileÚUNICODErS   r   r   r
   r
   r   Ú
clean_text—   s   r\   c                   @   s    e Zd Zddd„Zddd„ZdS )	ÚTextNormalizerNc                 C   s2   || _ tdddd| _tddd| _t ¡ | _d S )Nr$   ÚtnT)r,   ÚoperatorÚremove_erhuaÚen)r,   r_   )Ú	tokenizerr   Úzh_tn_modelÚen_tn_modelÚinflectÚenginer   )Úselfrb   r
   r
   r   Ú__init__£   s   zTextNormalizer.__init__Fc                 C   s”   t |ƒrdnd}t|ƒ}|dkr6| dd¡}t d|¡r#t dd|¡}| j |¡}t|ƒ}t	|ƒ}t
|ƒ}n| j |¡}t|| jƒ}|du rH|S d S )	Nr$   ra   ú=u   ç­‰äºŽu   ([\d$%^*_+â‰¥â‰¤â‰ Ã—Ã·?=])z(?<=[a-zA-Z0-9])-(?=\d)z - F)r   r\   r   rR   r   rS   rc   Ú	normalizerL   r   r   rd   r#   r   )rg   r	   Úsplitr,   r
   r
   r   rj   ©   s   
ÿzTextNormalizer.normalizer   )F)Ú__name__Ú
__module__Ú__qualname__rh   rj   r
   r
   r
   r   r]   ¢   s    
r]   )r$   r%   r&   r'   F)rR   rY   re   Ú	functoolsr   Úwetextr   rZ   r   r   r   r   r=   r#   rI   rL   rW   r\   r]   r
   r
   r
   r   Ú<module>   s   

	5