o
    }oi	                     @   s2   d dl mZ d dlmZ dgZG dd deZdS )    )Optional)CharTokenizerWordTokenizerc                       s~   e Zd ZdZ							ddedee dee dee dee dee d	ee d
ee f fddZdd Zdd Z  Z	S )r   zTokenizes at word boundaryN
vocab_file
mask_token	bos_token	eos_token	pad_token	sep_token	cls_token	unk_tokenc	           	   
      s    t  j||||||||d dS )a  
        Args:
            vocab_file: path to file with vocabulary which consists
                of characters separated by 

            mask_token: mask token 
            bos_token: the beginning of sequence token
            eos_token: the end of sequence token. Usually equal to sep_token
            pad_token: token to use for padding
            sep_token: token used for separating sequences
            cls_token: class token. Usually equal to bos_token
            unk_token: token to use for unknown tokens
        )r   r   r   r   r	   r   r
   r   N)super__init__)	selfr   r   r   r   r	   r
   r   r   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/word_tokenizer.pyr      s   
zWordTokenizer.__init__c                 C   s@   |   }g }|D ]}|| jv r|| q
|| j q
|S )N)stripsplitvocabappendr   )r   texttoken_candidatestokenstokenr   r   r   text_to_tokens<   s   
zWordTokenizer.text_to_tokensc                    s"    fdd|D }d  |S )Nc                    s   g | ]	}| j vr|qS r   )special_tokens).0id_r   r   r   
<listcomp>G   s    z-WordTokenizer.ids_to_text.<locals>.<listcomp> )joinids_to_tokens)r   idsids_r   r    r   ids_to_textF   s   zWordTokenizer.ids_to_text)NNNNNNN)
__name__
__module____qualname____doc__strr   r   r   r'   __classcell__r   r   r   r   r      s8    	#
N)typingr   1nemo.collections.common.tokenizers.char_tokenizerr   __all__r   r   r   r   r   <module>   s   