o
    ei'                     @   sL   d Z ddlZddlmZmZ ddlmZ eeZ	G dd deZ
dgZdS )z"Tokenization class for model ByT5.    N   )
AddedTokenPreTrainedTokenizer)loggingc                
       s:  e Zd ZdZddgZ					d*	d+ fd
dZedd Zdd Z	d,de	e
 de	e
 dB ded	e	e
 f fddZde	e
 d	e	e
 fddZ	d-de	e
 de	e
 dB d	e	e
 fddZ	d-de	e
 de	e
 dB d	e	e
 fddZded	e	e fddZd d! Zd"d# Zd$d% Zd-d&ed'edB d	ee fd(d)Z  ZS ).ByT5Tokenizera  
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    	input_idsattention_mask</s><unk><pad>}   Nreturnc                    s
  |dkr|d u rdd t |D }n(|dkr:|d ur:t|dkr:tttdd |}||kr:td| d| dt|trFt|d	d	d
n|}t|trTt|d	d	d
n|}t|trbt|d	d	d
n|}|||d| _t| j| _	d| _
t jd|||d|d| d S )Nr   c                 S   s   g | ]}d | dqS )z
<extra_id_> .0ir   r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/byt5/tokenization_byt5.py
<listcomp>G       z*ByT5Tokenizer.__init__.<locals>.<listcomp>c                 S   s   t dt| v S )Nextra_id)boolstr)xr   r   r   <lambda>J   s    z(ByT5Tokenizer.__init__.<locals>.<lambda>zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to ByT5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensT)lstriprstrip)r            )	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr   )rangelensetfilter
ValueError
isinstancer   r   _added_tokens_decoderoffset_utf_vocab_sizesuper__init__)selfr    r!   r"   r#   r$   kwargsextra_tokens	__class__r   r   r/   <   s.   

zByT5Tokenizer.__init__c                 C   s   | j S N)r-   r0   r   r   r   
vocab_sizec   s   zByT5Tokenizer.vocab_sizec                    s.    fddt  j j D }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokensr   r6   r   r   
<dictcomp>h   r   z+ByT5Tokenizer.get_vocab.<locals>.<dictcomp>)r%   r7   r,   updateadded_tokens_encoder)r0   vocabr   r6   r   	get_vocabg   s   zByT5Tokenizer.get_vocabFtoken_ids_0token_ids_1already_has_special_tokensc                    sZ   |rt  j||ddS |du rdgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r>   r?   r@   Nr   r   )r.   get_special_tokens_maskr&   )r0   r>   r?   r@   r3   r   r   rA   l   s   (z%ByT5Tokenizer.get_special_tokens_mask	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r&   eos_token_idwarningswarnr    )r0   rB   r   r   r   _add_eos_if_not_present   s   z%ByT5Tokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )rD   r&   )r0   r>   r?   eosr   r   r   $create_token_type_ids_from_sequences   s   z2ByT5Tokenizer.create_token_type_ids_from_sequencesc                 C   s(   |  |}|du r|S |  |}|| S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rG   )r0   r>   r?   r   r   r    build_inputs_with_special_tokens   s
   

z.ByT5Tokenizer.build_inputs_with_special_tokenstextc                 C   s   dd | dD }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 S   s   g | ]}t |qS r   )chrr   r   r   r   r      s    z+ByT5Tokenizer._tokenize.<locals>.<listcomp>utf-8)encode)r0   rK   tokensr   r   r   	_tokenize   s   zByT5Tokenizer._tokenizec                 C   s&   t |dkr
d}|S t|| j }|S )z0Converts a token (str) in an id using the vocab.r   N)r&   ordr,   )r0   tokentoken_idr   r   r   _convert_token_to_id   s
   z"ByT5Tokenizer._convert_token_to_idc                 C   s   t || j }|S )z=Converts an index (integer) in a token (str) using the vocab.)rL   r,   )r0   indexrR   r   r   r   _convert_id_to_token   s   z"ByT5Tokenizer._convert_id_to_tokenc                 C   sh   d}|D ]&}|| j v r| j | d}n|| jv r|d}ntt|g}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.    rM   ignore)errors)added_tokens_decoderrN   r;   bytesrQ   decode)r0   rO   bstringrR   
tok_stringstringr   r   r   convert_tokens_to_string   s   


z&ByT5Tokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   dS )Nr   r   )r0   ra   rb   r   r   r   save_vocabulary   s   zByT5Tokenizer.save_vocabulary)r	   r
   r   r   N)r   N)NFr5   )__name__
__module____qualname____doc__model_input_namesr/   propertyr7   r=   listintr   rA   rG   rI   rJ   r   rP   rT   rV   r`   tuplerc   __classcell__r   r   r3   r   r      s\     '






(r   )rg   rE   tokenization_pythonr   r   utilsr   
get_loggerrd   loggerr   __all__r   r   r   r   <module>   s   
 
R