o
    	۷i                     @   sP   d Z ddlmZ ddlmZmZ ddlmZ ee	Z
G dd deZdgZdS )	zTokenization class for Dia.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingc                	       s   e Zd ZdZddgZ				d!dee dee d	ee d
ef fddZe	dd Z
dd Zdedee fddZdd Zdd Zdee defddZd"dedee dee fdd Z  ZS )#DiaTokenizera  
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
    	input_idsattention_mask<pad>   r   	pad_token	unk_token
max_lengthoffsetc                    sl   t |tr	t|n|}t |trt|n|}d| _|tdtdd| _|| _t jd|||d| d S )N   z[S1]z[S2])r         )r   r   r    )
isinstancestrr   _utf_vocab_size_added_tokens_decoderr   super__init__)selfr   r   r   r   kwargs	__class__r   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/dia/tokenization_dia.pyr   /   s   	
zDiaTokenizer.__init__c                 C   s   | j S N)r   r   r   r   r   
vocab_sizeE   s   zDiaTokenizer.vocab_sizec                    s.    fddt  j j D }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokens.0ir    r   r   
<dictcomp>J   s    z*DiaTokenizer.get_vocab.<locals>.<dictcomp>)ranger!   r   updateadded_tokens_encoder)r   vocabr   r    r   	get_vocabI   s   zDiaTokenizer.get_vocabtextreturnc                 C   s   dd | dD }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 S   s   g | ]}t |qS r   )chrr#   r   r   r   
<listcomp>P   s    z*DiaTokenizer._tokenize.<locals>.<listcomp>utf-8)encode)r   r,   tokensr   r   r   	_tokenizeN   s   zDiaTokenizer._tokenizec                 C   s&   t |dkr
d}|S t|| j }|S )z0Converts a token (str) in an id using the vocab.r   N)lenordr   )r   tokentoken_idr   r   r   _convert_token_to_idS   s
   z!DiaTokenizer._convert_token_to_idc                 C   s   t || j }|S )z=Converts an index (integer) in a token (str) using the vocab.)r.   r   )r   indexr6   r   r   r   _convert_id_to_token]   s   z!DiaTokenizer._convert_id_to_tokenr2   c                 C   sl   d}|D ](}|| j v r| j | }t|d}n|| jv r#|d}n|d}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.    r0   ignore)errors)added_tokens_decoderr   r1   r)   decode)r   r2   bstringr6   added_token_obj
tok_stringstringr   r   r   convert_tokens_to_stringb   s   




z%DiaTokenizer.convert_tokens_to_stringNsave_directoryfilename_prefixc                 C   s   dS )Nr   r   )r   rE   rF   r   r   r   save_vocabularyr   s   zDiaTokenizer.save_vocabulary)r
   r
   r   r   r   )__name__
__module____qualname____doc__model_input_namesr   r   intr   propertyr!   r+   listr3   r8   r:   rD   tuplerG   __classcell__r   r   r   r   r      s0    

(r   N)rK   typingr   tokenization_utilsr   r   utilsr   
get_loggerrH   loggerr   __all__r   r   r   r   <module>   s   

\