o
    }oi                     @   sB   d dl mZmZ d dlmZ d dlmZ dgZG dd deZdS )    )ABCabstractmethod)OrderedDict)ListTokenizerSpecc                   @   s   e Zd ZdZedd Zedd Zedd Zedd	 Zed
d Z	edd Z
dee fddZdd Zedd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zd%S )&r   z:
    Inherit this class to implement a new tokenizer.
    c                 C      dS )z$Converts text into a list of tokens.N selftextr   r   e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/tokenizer_spec.pytext_to_tokens      zTokenizerSpec.text_to_tokensc                 C   r   )z)Converts a list of tokens back into text.Nr   r
   tokensr   r   r   tokens_to_text    r   zTokenizerSpec.tokens_to_textc                 C   r   )z5Converts a list of tokens to their corresponding IDs.Nr   r   r   r   r   tokens_to_ids%   r   zTokenizerSpec.tokens_to_idsc                 C   r   )z,Converts a list of token IDs back to tokens.Nr   r
   idsr   r   r   ids_to_tokens*   r   zTokenizerSpec.ids_to_tokensc                 C   r   )z$Converts text directly to token IDs.Nr   r	   r   r   r   text_to_ids/   r   zTokenizerSpec.text_to_idsc                 C   r   )z Converts token IDs back to text.Nr   r   r   r   r   ids_to_text4   r   zTokenizerSpec.ids_to_textspecial_tokensc                 C      t d)z0Adds special tokens (eos, pad, cls...) to vocab.To be implementedNotImplementedError)r
   r   r   r   r   add_special_tokens9      z TokenizerSpec.add_special_tokensc                 O   r   )z*Appies chat template and tokenizes resultsr   r   )r
   argskwargsr   r   r   apply_chat_template=   r   z!TokenizerSpec.apply_chat_templatec                 C   s
   t | jS )zname of the class)type__name__r
   r   r   r   nameA   s   
zTokenizerSpec.namec                 C   s"   t dt| j dt| j iS )z6Property required for use with megatron-core datasets.class.)r   r"   
__module____qualname__r$   r   r   r   unique_identifiersF   s   "z TokenizerSpec.unique_identifiersc                 C   $   t | dr| jS tt| j d)zGProperty alias to match MegatronTokenizer; returns cls_id if available.cls_idz# has no attribute 'cls' or 'cls_id')hasattrr,   AttributeErrorr"   r#   r$   r   r   r   clsK      
zTokenizerSpec.clsc                 C   r+   )zGProperty alias to match MegatronTokenizer; returns sep_id if available.sep_idz# has no attribute 'sep' or 'sep_id')r-   r1   r.   r"   r#   r$   r   r   r   sepR   r0   zTokenizerSpec.sepc                 C   r+   )zGProperty alias to match MegatronTokenizer; returns pad_id if available.pad_idz# has no attribute 'pad' or 'pad_id')r-   r3   r.   r"   r#   r$   r   r   r   padY   r0   zTokenizerSpec.padc                 C   s4   t | dr| jS t | dr| jS tt| j d)zGProperty alias to match MegatronTokenizer; returns eod_id if available.eod_ideos_idz5 has no attribute 'eod', 'eod_id', 'eos', or 'eos_id')r-   r5   r6   r.   r"   r#   r$   r   r   r   eod`   s
   

zTokenizerSpec.eodc                 C   r+   )zGProperty alias to match MegatronTokenizer; returns bos_id if available.bos_idz# has no attribute 'bos' or 'bos_id')r-   r8   r.   r"   r#   r$   r   r   r   bosj   r0   zTokenizerSpec.bosc                 C   r+   )zGProperty alias to match MegatronTokenizer; returns eos_id if available.r6   z# has no attribute 'eos' or 'eos_id')r-   r6   r.   r"   r#   r$   r   r   r   eosq   r0   zTokenizerSpec.eosc                 C   r+   )zHProperty alias to match MegatronTokenizer; returns mask_id if available.mask_idz% has no attribute 'mask' or 'mask_id')r-   r;   r.   r"   r#   r$   r   r   r   maskx   r0   zTokenizerSpec.maskN)r#   r(   r)   __doc__r   r   r   r   r   r   r   r   strr   r!   propertyr%   r*   r/   r2   r4   r7   r9   r:   r<   r   r   r   r   r      sD    











	

N)	abcr   r   collectionsr   typingr   __all__r   r   r   r   r   <module>   s
   