import logging
import re
from abc import ABC, abstractmethod
from functools import reduce
from typing import Dict, List, Optional

import jieba
from lhotse import CutSet
from pypinyin import Style, lazy_pinyin
from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials

from zipvoice.tokenizer.normalizer import ChineseTextNormalizer, EnglishTextNormalizer

try:
    from piper_phonemize import phonemize_espeak
except Exception as ex:
    raise RuntimeError(
        f"{ex}\nPlease run\n"
        "pip install piper_phonemize -f "
        "https://k2-fsa.github.io/icefall/piper_phonemize.html"
    )

jieba.default_logger.setLevel(logging.INFO)


class Tokenizer(ABC):
    """Abstract base class for tokenizers, defining common interface."""

    @abstractmethod
    def texts_to_token_ids(self, texts: List[str]) -> List[List[int]]:
        """Convert list of texts to list of token id sequences."""
        raise NotImplementedError

    @abstractmethod
    def texts_to_tokens(self, texts: List[str]) -> List[List[str]]:
        """Convert list of texts to list of token sequences."""
        raise NotImplementedError

    @abstractmethod
    def tokens_to_token_ids(self, tokens: List[List[str]]) -> List[List[int]]:
        """Convert list of token sequences to list of token id sequences."""
        raise NotImplementedError


class SimpleTokenizer(Tokenizer):
    """The simplest tokenizer: treats every character as a token,
    without text normalization.
    """

    def __init__(self, token_file: Optional[str] = None):
        """
        Args:
          token_file: the file that contains information that maps tokens to ids,
            which is a text file with '{token}\t{token_id}' per line.
        """
        self.has_tokens = False
        if token_file is None:
            logging.debug(
                "Initialize Tokenizer without tokens file, "
                "will fail when mapping to ids."
            )
            return
        self.token2id: Dict[str, int] = {}
        with open(token_file, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split("\t")
                token, id = info[0], int(info[1])
                assert token not in self.token2id, token
                self.token2id[token] = id
        self.pad_id = self.token2id["_"]
        self.vocab_size = len(self.token2id)
        self.has_tokens = True

    def texts_to_token_ids(self, texts: List[str]) -> List[List[int]]:
        return self.tokens_to_token_ids(self.texts_to_tokens(texts))

    def texts_to_tokens(self, texts: List[str]) -> List[List[str]]:
        tokens_list = [list(texts[i]) for i in range(len(texts))]
        return tokens_list

    def tokens_to_token_ids(self, tokens_list: List[List[str]]) -> List[List[int]]:
        assert self.has_tokens, "Please initialize Tokenizer with a tokens file."
        token_ids_list = []
        for tokens in tokens_list:
            token_ids = []
            for t in tokens:
                if t not in self.token2id:
                    logging.debug(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            token_ids_list.append(token_ids)
        return token_ids_list


class EspeakTokenizer(Tokenizer):
    """A simple tokenizer with Espeak g2p function."""

    def __init__(self, token_file: Optional[str] = None, lang: str = "en-us"):
        """
        Args:
          token_file: the file that contains information that maps tokens to ids,
            which is a text file with '{token}\t{token_id}' per line.
          lang: the language identifier, see
            https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md
        """
        self.has_tokens = False
        self.lang = lang
        if token_file is None:
            logging.debug(
                "Initialize Tokenizer without tokens file, "
                "will fail when mapping to ids."
            )
            return
        self.token2id: Dict[str, int] = {}
        with open(token_file, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split("\t")
                token, id = info[0], int(info[1])
                assert token not in self.token2id, token
                self.token2id[token] = id
        self.pad_id = self.token2id["_"]
        self.vocab_size = len(self.token2id)
        self.has_tokens = True

    def g2p(self, text: str) -> List[str]:
        try:
            tokens = phonemize_espeak(text, self.lang)
            # phonemize_espeak returns one phoneme list per sentence;
            # flatten them into a single sequence.
            tokens = reduce(lambda x, y: x + y, tokens)
            return tokens
        except Exception as ex:
            logging.warning(f"Tokenization of {self.lang} texts failed: {ex}")
            return []

    def texts_to_token_ids(self, texts: List[str]) -> List[List[int]]:
        return self.tokens_to_token_ids(self.texts_to_tokens(texts))

    def texts_to_tokens(self, texts: List[str]) -> List[List[str]]:
        tokens_list = [self.g2p(texts[i]) for i in range(len(texts))]
        return tokens_list

    def tokens_to_token_ids(self, tokens_list: List[List[str]]) -> List[List[int]]:
        assert self.has_tokens, "Please initialize Tokenizer with a tokens file."
        token_ids_list = []
        for tokens in tokens_list:
            token_ids = []
            for t in tokens:
                if t not in self.token2id:
                    logging.debug(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            token_ids_list.append(token_ids)
        return token_ids_list


class EmiliaTokenizer(Tokenizer):
    def __init__(self, token_file: Optional[str] = None, token_type: str = "phone"):
        """
        Args:
          token_file: the file that contains information that maps tokens to ids,
            which is a text file with '{token}\t{token_id}' per line.
        """
        assert (
            token_type == "phone"
        ), f"Only support phone tokenizer for Emilia, but get {token_type}."
        self.english_normalizer = EnglishTextNormalizer()
        self.chinese_normalizer = ChineseTextNormalizer()
        self.has_tokens = False
        if token_file is None:
            logging.debug(
                "Initialize Tokenizer without tokens file, "
                "will fail when mapping to ids."
            )
            return
        self.token2id: Dict[str, int] = {}
        with open(token_file, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split("\t")
                token, id = info[0], int(info[1])
                assert token not in self.token2id, token
                self.token2id[token] = id
        self.pad_id = self.token2id["_"]
        self.vocab_size = len(self.token2id)
        self.has_tokens = True

    def texts_to_token_ids(self, texts: List[str]) -> List[List[int]]:
        return self.tokens_to_token_ids(self.texts_to_tokens(texts))

    def preprocess_text(self, text: str) -> str:
        return self.map_punctuations(text)

    def texts_to_tokens(self, texts: List[str]) -> List[List[str]]:
        for i in range(len(texts)):
            texts[i] = self.preprocess_text(texts[i])

        phoneme_list = []
        for text in texts:
            segments = self.get_segment(text)
            all_phoneme = []
            for index in range(len(segments)):
                seg = segments[index]
                if seg[1] == "zh":
                    phoneme = self.tokenize_ZH(seg[0])
                elif seg[1] == "en":
                    phoneme = self.tokenize_EN(seg[0])
                elif seg[1] == "pinyin":
                    phoneme = self.tokenize_pinyin(seg[0])
                elif seg[1] == "tag":
                    phoneme = [seg[0]]
                else:
                    logging.warning(
                        "No English or Chinese characters found, "
                        f"skipping segment of unknown language: {seg}"
                    )
                    continue
                all_phoneme += phoneme
            phoneme_list.append(all_phoneme)
        return phoneme_list

    def tokens_to_token_ids(self, tokens_list: List[List[str]]) -> List[List[int]]:
        assert self.has_tokens, "Please initialize Tokenizer with a tokens file."
        token_ids_list = []
        for tokens in tokens_list:
            token_ids = []
            for t in tokens:
                if t not in self.token2id:
                    logging.debug(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            token_ids_list.append(token_ids)
        return token_ids_list

    def tokenize_ZH(self, text: str) -> List[str]:
        try:
            text = self.chinese_normalizer.normalize(text)
            segs = list(jieba.cut(text))
            full = lazy_pinyin(
                segs,
                style=Style.TONE3,
                tone_sandhi=True,
                neutral_tone_with_five=True,
            )
            phones = []
            for x in full:
                # A valid TONE3 pinyin is letters followed by a tone digit in
                # 1-5; anything else (e.g. punctuation) is kept as a token.
                if not (x[0:-1].isalpha() and x[-1] in "12345"):
                    phones.append(x)
                    continue
                else:
                    phones.extend(self.seperate_pinyin(x))
            return phones
        except Exception as ex:
            logging.warning(f"Tokenization of Chinese texts failed: {ex}")
            return []

    def tokenize_EN(self, text: str) -> List[str]:
        try:
            text = self.english_normalizer.normalize(text)
            tokens = phonemize_espeak(text, "en-us")
            tokens = reduce(lambda x, y: x + y, tokens)
            return tokens
        except Exception as ex:
            logging.warning(f"Tokenization of English texts failed: {ex}")
            return []

    def tokenize_pinyin(self, text: str) -> List[str]:
        try:
            assert text.startswith("<") and text.endswith(">")
            text = text.lstrip("<").rstrip(">")
            if not (text[0:-1].isalpha() and text[-1] in "12345"):
                logging.warning(
                    "Strings enclosed with <> should be pinyin, "
                    f"but got: {text}. Skipped it. "
                )
                return []
            else:
                return self.seperate_pinyin(text)
        except Exception as ex:
            logging.warning(f"Tokenize pinyin failed: {ex}")
            return []

    def seperate_pinyin(self, text: str) -> List[str]:
        """
        Separate pinyin into initial and final
        """
        pinyins = []
        initial = to_initials(text, strict=False)
        final = to_finals_tone3(text, strict=False, neutral_tone_with_five=True)
        if initial != "":
            # Initials are marked with a trailing "0".
            pinyins.append(initial + "0")
        if final != "":
            pinyins.append(final)
        return pinyins

    def map_punctuations(self, text: str) -> str:
        text = text.replace("，", ",")
        text = text.replace("。", ".")
        text = text.replace("！", "!")
        text = text.replace("？", "?")
        text = text.replace("；", ";")
        text = text.replace("：", ":")
        text = text.replace("、", ",")
        text = text.replace("‘", "'")
        text = text.replace("“", '"')
        text = text.replace("”", '"')
        text = text.replace("’", "'")
        text = text.replace("⋯", "…")
        text = text.replace("···", "…")
        text = text.replace("・・・", "…")
        text = text.replace("...", "…")
        return text

    def get_segment(self, text: str) -> List[str]:
        """
        Split a text into segments based on language types
        (Chinese, English, Pinyin, tags, etc.)

        Args:
            text (str): Input text to be segmented

        Returns:
            List[str]: Segmented text parts with their language types

        Example:
            Input: 我们是小米人,是吗? Yes I think so!霍...啦啦啦
            Output: [('我们是小米人,是吗? ', 'zh'),
                ('Yes I think so!', 'en'), ('霍...啦啦啦', 'zh')]
        """
        segments = []
        types = []
        temp_seg = ""
        temp_lang = ""

        # Split the text into parts: special strings enclosed in <> or [],
        # or single characters.
        _part_pattern = re.compile(r"[<[].*?[>\]]|.")
        parts = _part_pattern.findall(text)

        for part in parts:
            if self.is_chinese(part) or self.is_pinyin(part):
                types.append("zh")
            elif self.is_alphabet(part):
                types.append("en")
            else:
                types.append("other")

        assert len(types) == len(parts)

        for i in range(len(types)):
            if i == 0:
                # The first part initializes the current segment.
                temp_seg += parts[i]
                temp_lang = types[i]
            elif temp_lang == "other":
                temp_seg += parts[i]
                temp_lang = types[i]
            elif types[i] in (temp_lang, "other"):
                temp_seg += parts[i]
            else:
                segments.append((temp_seg, temp_lang))
                temp_seg = parts[i]
                temp_lang = types[i]

        segments.append((temp_seg, temp_lang))
        segments = self.split_segments(segments)
        return segments

    def split_segments(self, segments: List):
        """
        Split segments into smaller parts if special strings enclosed by [] or <>
        are found, where <> denotes pinyin strings, [] denotes other special
        strings.

        Args:
            segments (list): A list of tuples where each tuple contains:
                - temp_seg (str): The text segment to be split.
                - temp_lang (str): The language code associated with the segment.

        Returns:
            list: A list of smaller segments.
        """
        result = []
        for temp_seg, temp_lang in segments:
            parts = re.split(r"([<[].*?[>\]])", temp_seg)
            for part in parts:
                if not part:
                    continue
                if self.is_pinyin(part):
                    result.append((part, "pinyin"))
                elif self.is_tag(part):
                    result.append((part, "tag"))
                else:
                    result.append((part, temp_lang))
        return result

    def is_chinese(self, char: str) -> bool:
        if char >= "\u4e00" and char <= "\u9fa5":
            return True
        else:
            return False

    def is_alphabet(self, char: str) -> bool:
        if (char >= "\u0041" and char <= "\u005a") or (
            char >= "\u0061" and char <= "\u007a"
        ):
            return True
        else:
            return False

    def is_pinyin(self, part: str) -> bool:
        if part.startswith("<") and part.endswith(">"):
            return True
        else:
            return False

    def is_tag(self, part: str) -> bool:
        if part.startswith("[") and part.endswith("]"):
            return True
        else:
            return False


class DialogTokenizer(EmiliaTokenizer):
    def __init__(self, token_file: Optional[str] = None, token_type: str = "phone"):
        super().__init__(token_file=token_file, token_type=token_type)
        if token_file:
            self.spk_a_id = self.token2id["[S1]"]
            self.spk_b_id = self.token2id["[S2]"]

    def preprocess_text(self, text: str) -> str:
        # Strip whitespace around speaker-turn tags like [S1] / [S2].
        text = re.sub(r"\s*(\[S[12]\])\s*", r"\1", text)
        text = self.map_punctuations(text)
        return text


class LibriTTSTokenizer(Tokenizer):
    def __init__(self, token_file: Optional[str] = None, token_type: str = "char"):
        """
        Args:
          token_type: the type of tokenizer, e.g., bpe, char, phone.
          token_file: the file that contains information that maps tokens to ids,
            which is a text file with '{token}\t{token_id}' per line if type is
            char or phone, otherwise it is a bpe_model file.
        """
        self.type = token_type
        assert token_type in ("bpe", "char", "phone")
        try:
            import tacotron_cleaner.cleaners
        except Exception as ex:
            raise RuntimeError(f"{ex}\nPlease run\npip install espnet_tts_frontend")
        self.normalize = tacotron_cleaner.cleaners.custom_english_cleaners

        self.has_tokens = False
        if token_file is None:
            logging.debug(
                "Initialize Tokenizer without tokens file, "
                "will fail when mapping to ids."
            )
            return
        if token_type == "bpe":
            import sentencepiece as spm

            self.sp = spm.SentencePieceProcessor()
            self.sp.load(token_file)
            self.pad_id = self.sp.piece_to_id("<pad>")
            self.vocab_size = self.sp.get_piece_size()
        else:
            self.token2id: Dict[str, int] = {}
            with open(token_file, "r", encoding="utf-8") as f:
                for line in f.readlines():
                    info = line.rstrip().split("\t")
                    token, id = info[0], int(info[1])
                    assert token not in self.token2id, token
                    self.token2id[token] = id
            self.pad_id = self.token2id["_"]
            self.vocab_size = len(self.token2id)
        self.has_tokens = True

    def texts_to_token_ids(self, texts: List[str]) -> List[List[int]]:
        if self.type == "bpe":
            for i in range(len(texts)):
                texts[i] = self.normalize(texts[i])
            return self.sp.encode(texts)
        return self.tokens_to_token_ids(self.texts_to_tokens(texts))

    def texts_to_tokens(self, texts: List[str]) -> List[List[str]]:
        for i in range(len(texts)):
            texts[i] = self.normalize(texts[i])
        if self.type == "char":
            tokens_list = [list(texts[i]) for i in range(len(texts))]
        elif self.type == "phone":
            tokens_list = [
                phonemize_espeak(texts[i].lower(), "en-us")
                for i in range(len(texts))
            ]
        elif self.type == "bpe":
            tokens_list = self.sp.encode(texts, out_type=str)
        return tokens_list

    def tokens_to_token_ids(self, tokens_list: List[List[str]]) -> List[List[int]]:
        assert self.has_tokens, "Please initialize Tokenizer with a tokens file."
        assert self.type != "bpe", "BPE tokenizer does not support this function."
        token_ids_list = []
        for tokens in tokens_list:
            token_ids = []
            for t in tokens:
                if t not in self.token2id:
                    logging.debug(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            token_ids_list.append(token_ids)
        return token_ids_list


def add_tokens(cut_set: CutSet, tokenizer: str, lang: str) -> CutSet:
    if tokenizer == "emilia":
        tokenizer = EmiliaTokenizer()
    elif tokenizer == "espeak":
        tokenizer = EspeakTokenizer(lang=lang)
    elif tokenizer == "dialog":
        tokenizer = DialogTokenizer()
    elif tokenizer == "libritts":
        tokenizer = LibriTTSTokenizer()
    elif tokenizer == "simple":
        tokenizer = SimpleTokenizer()
    else:
        raise ValueError(f"Unsupported tokenizer: {tokenizer}")

    def _prepare_cut(cut):
        # Each cut must contain exactly one supervision.
        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
        text = cut.supervisions[0].text
        tokens = tokenizer.texts_to_tokens([text])[0]
        cut.supervisions[0].tokens = tokens
        return cut

    cut_set = cut_set.map(_prepare_cut)
    return cut_set


if __name__ == "__main__":
    text = (
        "我们是5年小米人,是吗? Yes I think so! "
        "mr king, 5 years, from 2019 to 2024."
        "霍...啦啦啦超过90%的人<le5>...?!9204"
    )
    tokenizer = EmiliaTokenizer()
    tokens = tokenizer.texts_to_tokens([text])
    print(f"tokens: {'|'.join(tokens[0])}")
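# A minimal usage sketch for `add_tokens`, assuming a Lhotse cut manifest is
# available on disk; the paths below are hypothetical placeholders. Each cut
# in the manifest must carry exactly one supervision:
#
#     from lhotse import CutSet
#
#     cuts = CutSet.from_file("data/cuts.jsonl.gz")
#     cuts = add_tokens(cuts, tokenizer="espeak", lang="en-us")
#     cuts.to_file("data/cuts_with_tokens.jsonl.gz")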