o
    }oi                     @   sN   d dl mZmZmZmZ d dlmZ ddgZG dd dZG dd deZ	dS )    )DictListOptionalUnion)TokenizerSpecByteLevelProcessorByteLevelTokenizerc                   @   sN   e Zd ZdZdee defddZdedee fddZdedefd	d
ZdS )r   zf
    A very basic tokenization and detokenization class for use with byte-level
    tokenization.
    tokensreturnc                 C   s
   d |S )z<
        Detokenize a list of tokens into a string.
         )joinselfr	    r   k/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/bytelevel_tokenizers.py
detokenize      
zByteLevelProcessor.detokenizetextc                 C   s   t |S )z:
        Tokenize a string into a list of tokens.
        )listr   r   r   r   r   tokenize"   s   zByteLevelProcessor.tokenizec                 C   s   |S )z%
        Normalize a string.
        r   r   r   r   r   	normalize(   s   zByteLevelProcessor.normalizeN)	__name__
__module____qualname____doc__r   strr   r   r   r   r   r   r   r      s
    c                   @   s   e Zd ZdZ					d%deeeeef ee f  de	de	d	e	d
e	f
ddZ
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zedd Zedd  Zed!d" Zed#d$ ZdS )&r   z
    A byte-level tokenizer that encodes text as UTF-8 bytes with user control over the EOS, BOS, and PAD
        tokens as well as the vocabulary size and a mapping of other special tokens to their IDs.
    N   r      special_tokens
vocab_size_eos_id_pad_id_bos_idc                 C   s   || _ || _|| _| j| j| j| j| j| ji| _|du r|n|t| | _| j| _	|du r/i n|}|D ]}|  j	d8  _	| j	| j|< q3dd | j
 D | _dS )a  A byte-level tokenizer that encodes text as UTF-8 bytes.

        This tokenizer treats each byte as a token, with a default vocabulary size of 512 to accommodate
        UTF-8 byte values (0-255) plus special tokens. It can handle arbitrary text input by encoding
        it into bytes.

        Args:
            special_tokens: Dictionary or list of special tokens to add to the vocabulary.
                These tokens will be assigned IDs at the end of the vocabulary.
                Defaults to None.
            vocab_size: Size of the vocabulary, should be at least 256 to handle all byte values.
                Special tokens will be added after this size.
                Defaults to 512.
            _eos_id: ID to use for the end-of-sequence token.
                Defaults to 0.
            _pad_id: ID to use for the padding token.
                Defaults to 1.
            _bos_id: ID to use for the beginning-of-sequence token.
                Defaults to None.
        Nr   c                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>`   s    z/ByteLevelTokenizer.__init__.<locals>.<dictcomp>)r!   r"   r#   pad_idbos_ideos_idspecial_token_to_idlenr    special_startitemsid_to_special_token)r   r   r    r!   r"   r#   tokr   r   r   __init__5   s   zByteLevelTokenizer.__init__c                 C   
   |  |S )z5
        Convert a text to a list of tokens.
        )text_to_idsr   r   r   r   text_to_tokensc   r   z!ByteLevelTokenizer.text_to_tokensc                 C   r2   )z5
        Convert a list of tokens to a text.
        )ids_to_textr   r   r   r   tokens_to_texti   r   z!ByteLevelTokenizer.tokens_to_textc                 C   s   t |dS )z2
        Convert a text to a list of IDs.
        utf-8)r   encoder   r   r   r   r3   o   s   zByteLevelTokenizer.text_to_idsc                    s(    fdd|D }t |jddd S )z2
        Convert a list of IDs to a text.
        c                    s   g | ]	}| j k r|qS r   )r-   )r$   xr   r   r   
<listcomp>z   s    z2ByteLevelTokenizer.ids_to_text.<locals>.<listcomp>r7   ignore)errors)bytesdecoderstrip)r   idsr   r:   r   r5   u   s   zByteLevelTokenizer.ids_to_textc                 C   2   t |tr|g}g }|D ]
}|| | q|S )z<
        Convert a list of tokens to a list of IDs.
        )
isinstancer   appendtoken_to_id)r   r	   rA   tokenr   r   r   tokens_to_ids}      
z ByteLevelTokenizer.tokens_to_idsc                 C   rB   )z<
        Convert a list of IDs to a list of tokens.
        )rC   intrD   id_to_token)r   rA   r	   idr   r   r   ids_to_tokens   rH   z ByteLevelTokenizer.ids_to_tokensc                 C   s   || j v r
| j | S |S )z:
        Convert a token to its corresponding ID.
        )r+   )r   rF   r   r   r   rE      s   

zByteLevelTokenizer.token_to_idc                 C   s   || j k r|S | j| S )z;
        Convert an ID to its corresponding token.
        )r-   r/   )r   rK   r   r   r   rJ      s   

zByteLevelTokenizer.id_to_tokenc                 C      | j S )z%
        Get the padding ID.
        )r"   r:   r   r   r   r(         zByteLevelTokenizer.pad_idc                 C   rM   )z3
        Get the beginning-of-sequence ID.
        )r#   r:   r   r   r   r)      rN   zByteLevelTokenizer.bos_idc                 C   rM   )z-
        Get the end-of-sequence ID.
        )r!   r:   r   r   r   r*      rN   zByteLevelTokenizer.eos_idc                 C   s   dS )z%
        Get the unknown ID.
        i  r   r:   r   r   r   unk_id   s   zByteLevelTokenizer.unk_id)Nr   r   r   N)r   r   r   r   r   r   r   r   r   rI   r1   r4   r6   r3   r5   rG   rL   rE   rJ   propertyr(   r)   r*   rO   r   r   r   r   r   /   sD    
.		


N)
typingr   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   __all__r   r   r   r   r   r   <module>   s
   