o
    xiI                     @   s4   d dl mZ d dlZG dd dZG dd dZdS )    )IterableNc                	   @   s   e Zd ZdZddeddfddZededd fd	d
Zddede	de
jfddZ	ddee dedB dee
je
jf fddZdS )ByteTokenizerz/Simple byte-level tokenizer for text-to-speech.   	bos_tokenreturnNc                 C   s$   |dk rt d| t|| _d S )Nr   zbos_token must be >= 0, got )
ValueErrorintr   )selfr    r
   1/home/ubuntu/Irodori-TTS/irodori_tts/tokenizer.py__init__	   s   zByteTokenizer.__init__text_vocab_sizec                 C   s6   |dk rt d| |dkr| ddS | |d dS )Nr   z@text_vocab_size must be >= 256 for byte-level tokenization, got r   )r      )r   )clsr   r
   r
   r   for_vocab_size   s   
zByteTokenizer.for_vocab_sizeTtextadd_bosc                 C   s0   t |d}|r|d| j tj|tjdS )Nzutf-8r   dtype)listencodeinsertr   torchtensorlong)r	   r   r   tokensr
   r
   r   r      s   zByteTokenizer.encodetexts
max_lengthc           	         s    fdd|D }|d u rt dd |D }tjt||ftjd}tjt||ftjd}t|D ]\}}t|| }|d | ||d |f< d||d |f< q2||fS )Nc                       g | ]}  |qS r
   r   .0tr	   r
   r   
<listcomp>$       z.ByteTokenizer.batch_encode.<locals>.<listcomp>c                 s   s    | ]}|  V  qd S N)numelr!   xr
   r
   r   	<genexpr>&   s    z-ByteTokenizer.batch_encode.<locals>.<genexpr>r   T)	maxr   zeroslenr   bool	enumerateminr'   	r	   r   r   encodedbatchmaskiseqnr
   r#   r   batch_encode   s   zByteTokenizer.batch_encode)r   Tr&   )__name__
__module____qualname____doc__r   r   classmethodr   strr.   r   Tensorr   r   tupler8   r
   r
   r
   r   r      s    
	r   c                
   @   s   e Zd ZdZddeddfddZe		dd	eded
edd fddZe	de
fddZe	de
dB fddZe	de
fddZddededB dejfddZ	ddee de
dB deejejf fddZdS )PretrainedTextTokenizerz
    Hugging Face tokenizer wrapper for text conditioning.
    - right-padding for stable positional behavior
    - optional explicit BOS prepend
    Tr   r   Nc                 C   st   || _ t|| _d| j _| j jd u r)| j jd ur%| j jd ur%| j j| j _ntd| jr6| j j	d u r8tdd S d S )NrightzeTokenizer has no pad_token_id (and no eos_token fallback). Set a pad token before training/inference.z/Tokenizer has no bos_token_id but add_bos=True.)
	tokenizerr.   r   padding_sidepad_token_ideos_token_id	eos_token	pad_tokenr   bos_token_id)r	   rD   r   r
   r
   r   r   8   s   
z PretrainedTextTokenizer.__init__Frepo_idlocal_files_onlyc              
   C   sR   zddl m} W n ty } ztd|d }~ww |j|dd|d}| ||dS )Nr   )AutoTokenizerzqtransformers is required for pretrained text tokenization. Install with `pip install transformers sentencepiece`.TF)use_fasttrust_remote_coderL   )rD   r   )transformersrM   ImportErrorRuntimeErrorfrom_pretrained)r   rK   r   rL   rM   excrD   r
   r
   r   rS   J   s"   z'PretrainedTextTokenizer.from_pretrainedc                 C   s   t t| jS r&   )r   r-   rD   r#   r
   r
   r   
vocab_sizea   s   z"PretrainedTextTokenizer.vocab_sizec                 C   s   | j jS r&   )rD   rJ   r#   r
   r
   r   rJ   e   s   z$PretrainedTextTokenizer.bos_token_idc                 C   s    | j j}|d u rtdt|S )Nz"pad_token_id is unexpectedly None.)rD   rF   rR   r   )r	   pad_idr
   r
   r   rF   i   s   z$PretrainedTextTokenizer.pad_token_idr   c                 C   s`   | j j|dd}|d u r| jnt|}|r(| j}|d u r td|dt| tj	|tj
dS )NF)add_special_tokensz<Tokenizer has no bos_token_id but BOS prepend was requested.r   r   )rD   r   r   r.   rJ   r   r   r   r   r   r   )r	   r   r   	token_idsuse_bosbos_idr
   r
   r   r   p   s   zPretrainedTextTokenizer.encoder   r   c           	         s    fdd|D }|d u rt dd |D }|dkr!td| tjt||f jtjd}tjt||ftjd}t	|D ]#\}}t
|| }|dkrb|d | ||d |f< d	||d |f< q?||fS )
Nc                    r   r
   r   r    r#   r
   r   r$      r%   z8PretrainedTextTokenizer.batch_encode.<locals>.<listcomp>c                 s   s    | ]
}t | d V  qdS )r   N)r+   r'   r(   r
   r
   r   r*      s    z7PretrainedTextTokenizer.batch_encode.<locals>.<genexpr>r   zmax_length must be > 0, got )
fill_valuer   r   T)r+   r   r   fullr-   rF   r   r,   r.   r/   r0   r'   r1   r
   r#   r   r8   z   s$   
z$PretrainedTextTokenizer.batch_encoder9   )TFr&   )r:   r;   r<   r=   r.   r   r>   r?   rS   propertyr   rU   rJ   rF   r   r@   r   r   rA   r8   r
   r
   r
   r   rB   1   s<    rB   )collections.abcr   r   r   rB   r
   r
   r
   r   <module>   s    +