o
    ei                     @   sD   d Z ddlmZmZ ddlmZ eeZG dd deZ	dgZ
dS )zTokenization class for Dia.   )
AddedTokenPreTrainedTokenizer)loggingc                	       s   e Zd ZdZddgZ				ddedB d	edB d
edB def fddZedd Z	dd Z
dedee fddZdd Zdd Zdee defddZ  ZS )DiaTokenizera  
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
    	input_idsattention_mask<pad>       	pad_tokenN	unk_token
max_lengthoffsetc              
      st   t |tr	t|n|}t |trt|n|}d| _|tdtdd| _|| _t jd	||||dddd| d S )
N   z[S1]z[S2])r
         	all_zerosTnone)r   r   r   r   token_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_pattern )
isinstancestrr   _utf_vocab_size_added_tokens_decoderr   super__init__)selfr   r   r   r   kwargs	__class__r   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dia/tokenization_dia.pyr   ,   s    	
zDiaTokenizer.__init__c                 C   s   | j S )N)r   r   r   r   r"   
vocab_sizeF   s   zDiaTokenizer.vocab_sizec                    s.    fddt  j j D }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokens.0ir#   r   r"   
<dictcomp>K   s    z*DiaTokenizer.get_vocab.<locals>.<dictcomp>)ranger$   r   updateadded_tokens_encoder)r   vocabr   r#   r"   	get_vocabJ   s   zDiaTokenizer.get_vocabtextreturnc                 C   s   dd | dD }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 S   s   g | ]}t |qS r   )chrr&   r   r   r"   
<listcomp>Q   s    z*DiaTokenizer._tokenize.<locals>.<listcomp>utf-8)encode)r   r/   tokensr   r   r"   	_tokenizeO   s   zDiaTokenizer._tokenizec                 C   s&   t |dkr
d}|S t|| j }|S )z0Converts a token (str) in an id using the vocab.r   N)lenordr   )r   tokentoken_idr   r   r"   _convert_token_to_idT   s
   z!DiaTokenizer._convert_token_to_idc                 C   s   t || j }|S )z=Converts an index (integer) in a token (str) using the vocab.)r1   r   )r   indexr9   r   r   r"   _convert_id_to_token^   s   z!DiaTokenizer._convert_id_to_tokenr5   c                 C   sl   d}|D ](}|| j v r| j | }t|d}n|| jv r#|d}n|d}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.    r3   ignore)errors)added_tokens_decoderr   r4   r,   decode)r   r5   bstringr9   added_token_obj
tok_stringstringr   r   r"   convert_tokens_to_stringc   s   




z%DiaTokenizer.convert_tokens_to_string)r   r   r	   r
   )__name__
__module____qualname____doc__model_input_namesr   intr   propertyr$   r.   listr6   r;   r=   rG   __classcell__r   r   r    r"   r      s.    

r   N)rK   tokenization_pythonr   r   utilsr   
get_loggerrH   loggerr   __all__r   r   r   r"   <module>   s   

\