o
    }oi                     @   s    d dl mZ G dd deZdS )    )MegatronTokenizerc                       s   e Zd ZdZ fddZdd Zdd Zdee d	e	d
ee fddZ
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Z  ZS )NullTokenizerz
    Synthetic tokenizer for performance benchmarking and debugging

    Args:
        vocab_size: vocabulary size for embedding
    c                    s&   t  jd |d t|| _| j| _d S )N)
vocab_size)super__init__int_vocab_size_without_eod_eod_id)selfr   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/null_tokenizer.pyr      s   
zNullTokenizer.__init__c                 C   s   dd | dD S )Nc                 S      g | ]}t |qS r   )r   .0xr   r   r   
<listcomp>        z*NullTokenizer.tokenize.<locals>.<listcomp> )split)r
   textr   r   r   tokenize   s   zNullTokenizer.tokenizec                 C   s   dd |D }d |S )Nc                 S   r   r   )strr   r   r   r   r   #   r   z,NullTokenizer.detokenize.<locals>.<listcomp>r   )join)r
   idsr   r   r   r   
detokenize"   s   
zNullTokenizer.detokenizer   r   returnc                 C   s6   g d}}|D ]}| | |dtt| 7 }q|S )Nr      )appendlenr   )r
   r   r   offsets	start_idxid_r   r   r   r!   &   s
   

zNullTokenizer.offsetsc                 C   s
   | j d S )Nr   )r   r
   r   r   r   r   -   s   
zNullTokenizer.vocab_sizec                 C      t NNotImplementedErrorr$   r   r   r   vocab1      zNullTokenizer.vocabc                 C   r%   r&   r'   r$   r   r   r   	inv_vocab5   r*   zNullTokenizer.inv_vocabc                 C      dS Nr   r$   r   r   r   cls9   r*   zNullTokenizer.clsc                 C   r,   r-   r   r$   r   r   r   sep=   r*   zNullTokenizer.sepc                 C   r,   r-   r   r$   r   r   r   maskA   r*   zNullTokenizer.maskc                 C   s   | j S r&   )r	   r$   r   r   r   eodE   s   zNullTokenizer.eodc                 C   s   d S r&   r   r$   r   r   r   additional_special_tokens_idsI   r*   z+NullTokenizer.additional_special_tokens_ids)__name__
__module____qualname____doc__r   r   r   listr   r   r!   propertyr   r)   r+   r/   r0   r1   r2   r3   __classcell__r   r   r   r   r      s,    






r   N))megatron.core.datasets.megatron_tokenizerr   r   r   r   r   r   <module>   s   