import os
import re
from typing import Optional

import pandas as pd

from nemo.collections.common.tokenizers.char_tokenizer import TokenizerSpec
from nemo.utils import logging

__all__ = ['RegExTokenizer']

DEFAULT_MASK_TOKEN = '<MASK>'
DEFAULT_BOS_TOKEN = '^'
DEFAULT_EOS_TOKEN = '&'
DEFAULT_PAD_TOKEN = '<PAD>'
DEFAULT_SEP_TOKEN = '<SEP>'
DEFAULT_UNK_TOKEN = '?'


class RegExTokenizer(TokenizerSpec):
    """
    A regular-expression-based tokenizer that splits text at word boundaries.
    By default this tokenizer supports MegaMolBART:
    https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara/models/megamolbart
    """

    def __init__(
        self,
        regex: Optional[str] = "",
        mask_token: Optional[str] = DEFAULT_MASK_TOKEN,
        bos_token: Optional[str] = DEFAULT_BOS_TOKEN,
        eos_token: Optional[str] = DEFAULT_EOS_TOKEN,
        pad_token: Optional[str] = DEFAULT_PAD_TOKEN,
        sep_token: Optional[str] = DEFAULT_SEP_TOKEN,
        unk_token: Optional[str] = DEFAULT_UNK_TOKEN,
    ):
        """
        Args:
            regex: regular expression that defines the tokenization rules
            mask_token: mask token
            bos_token: the beginning-of-sequence token
            eos_token: the end-of-sequence token. Usually equal to sep_token
            pad_token: token to use for padding
            sep_token: token used for separating sequences
            unk_token: token to use for unknown tokens
        """
        self.regex = regex
        self.mask_token = mask_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.sep_token = sep_token
        self.unk_token = unk_token

        # Names of the regex/vocab files, set when the tokenizer is saved or loaded.
        self.regex_file = None
        self.vocab_file = None

        # Initialize the vocabulary with the special tokens; the ids match the
        # *_id properties defined below.
        self.vocab = {
            self.pad_token: 0,
            self.unk_token: 1,
            self.bos_token: 2,
            self.eos_token: 3,
            self.mask_token: 4,
            self.sep_token: 5,
        }
        self._update_cache()
        self._compile_regex()

    def _update_cache(self):
        # Cache attributes required for tokenization/detokenization.
        self._unk_id = self.vocab.get(self.unk_token, DEFAULT_UNK_TOKEN)
        self._decode_vocab = {i: t for t, i in self.vocab.items()}

    def _compile_regex(self):
        # Fall back to matching any single character when the user regex does not match.
        regex_string = r"("
        regex_string += self.regex + r"|"
        regex_string += r".)"
        self._compiled_regex = re.compile(regex_string)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def text_to_tokens(self, text):
        tokens = self._compiled_regex.findall(text)
        return tokens

    def tokens_to_text(self, tokens):
        # Expects a batch: a list of token lists, one per sample.
        tokens_list = []
        for token in tokens:
            # Remove the begin token.
            if token[0] == self.bos_token:
                token = token[1:]

            # Remove the end token and everything after it.
            if self.eos_token in token:
                eos_idx = token.index(self.eos_token)
                token = token[:eos_idx]

            tokens_list.append(token)

        text = ["".join(tokens) for tokens in tokens_list]
        return text

    def token_to_ids(self, tokens):
        ids_list = []
        for token in tokens:
            ids_list.append(self.vocab.get(token, self._unk_id))
        return ids_list

    def tokens_to_ids(self, token_data):
        if isinstance(token_data, str):
            token_data = [token_data]

        ids_list = []
        for tokens in token_data:
            ids = self.token_to_ids(tokens)
            ids_list.append(ids)
        return ids_list

    def ids_to_tokens(self, ids_list):
        # Accept either a single list of ids or a batch (list of lists).
        if len(ids_list) and not isinstance(ids_list[0], list):
            ids_list = [ids_list]
            added_list = True
        else:
            added_list = False

        tokens_list = []
        for ids in ids_list:
            tokens = []
            for token_id in ids:
                token = self._decode_vocab.get(token_id)
                if token is None:
                    raise ValueError(f"Token id {token_id} is not recognised")
                tokens.append(token)

            tokens_list.append(tokens)

        if added_list:
            return tokens_list[0]
        else:
            return tokens_list

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        tokens = [tokens]
        return self.tokens_to_ids(tokens)[0]

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        return self.tokens_to_text(tokens)

    @property
    def pad_id(self):
        return 0

    @property
    def unk_id(self):
        return 1

    @property
    def bos_id(self):
        return 2

    @property
    def eos_id(self):
        return 3

    @property
    def mask_id(self):
        return 4

    @property
    def sep_id(self):
        return 5

    def _get_regex_vocab_files(self, regex_file=None, vocab_file=None):
        """
        Infers the regex/vocab file names, or updates them if given.
        """
        regex_file = regex_file or self.regex_file
        if not regex_file:
            raise ValueError("regex_file must be specified")

        vocab_file = vocab_file or self.vocab_file
        # Infer vocab_file from regex_file when not given.
        if not vocab_file:
            vocab_file = os.path.splitext(regex_file)[0] + '.vocab'

        self.regex_file = regex_file
        self.vocab_file = vocab_file

        return regex_file, vocab_file

    def save_tokenizer(self, regex_file=None, vocab_file=None):
        """
        Saves the tokenizer's regex and vocab files.
        """
        regex_file, vocab_file = self._get_regex_vocab_files(regex_file=regex_file, vocab_file=vocab_file)

        logging.info(f"Saving vocabulary to file = {vocab_file}")
        with open(vocab_file, 'w') as fp:
            # One token per line, in id order (insertion order of the vocab dict).
            for token in self.vocab:
                fp.write(f"{token}\n")

        logging.info(f"Saving regex to file = {regex_file}")
        with open(regex_file, 'w') as f:
            f.write(self.regex)

    def load_tokenizer(self, regex_file=None, vocab_file=None):
        """
        Loads the tokenizer's regex and vocab files.
        """
        regex_file, vocab_file = self._get_regex_vocab_files(regex_file=regex_file, vocab_file=vocab_file)

        # Load the vocabulary: one token per line, id = line number.
        logging.info(f"Loading vocabulary from file = {vocab_file}")
        if os.path.exists(vocab_file):
            vocab = {}
            with open(vocab_file, "r") as fp:
                for line in fp:
                    line = line.strip()
                    if line:
                        vocab[line] = len(vocab)
            self.vocab = vocab
        else:
            raise RuntimeError(f"Missing vocab_file = {vocab_file}")

        # Load the regex.
        if os.path.exists(regex_file):
            logging.info(f"Loading regex from file = {regex_file}")
            self.regex = open(regex_file, encoding="utf-8").read().strip()
        else:
            raise RuntimeError(f"Missing regex_file = {regex_file}")

        self._update_cache()
        self._compile_regex()

        return self

    def build_vocab_from_csv(self, data_csv_file, col="smiles"):
        """
        Learns the vocabulary from a CSV file. Can be called multiple times to update the vocabulary.
        """
        logging.debug(f"Building vocabulary from CSV col = {col} file = {data_csv_file}")

        # NOTE this has to be run on each CSV file.
        if not os.path.exists(data_csv_file):
            raise ValueError(f"Data file: {data_csv_file} is missing")

        df = pd.read_csv(data_csv_file)

        vocab = self.vocab
        for d in df[col]:
            tokens = self.text_to_tokens(d)
            logging.debug(f"Text: {d}, Tokens: {tokens}")
            for token in tokens:
                if token not in vocab:
                    vocab[token] = len(vocab)

        sorted_vocab = sorted(vocab.items(), key=lambda k_v: k_v[1])
        logging.debug(f"Vocab: {sorted_vocab}")

        self.vocab = vocab
        self._update_cache()

    def build_vocab_from_text(self, data_text_file):
        """
        Learns the vocabulary from a text file. Can be called multiple times to update the vocabulary.
        """
        logging.debug(f"Building vocabulary from TEXT file = {data_text_file}")

        # NOTE this has to be run on each text file.
        if not os.path.exists(data_text_file):
            raise ValueError(f"Data file: {data_text_file} is missing")

        vocab = self.vocab
        with open(data_text_file, encoding="utf-8") as fp:
            for d in fp.readlines():
                d = d.rstrip("\n")
                tokens = self.text_to_tokens(d)
                logging.debug(f"Text: {d}, Tokens: {tokens}")
                for token in tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)

        sorted_vocab = sorted(vocab.items(), key=lambda k_v: k_v[1])
        logging.debug(f"Vocab: {sorted_vocab}")

        self.vocab = vocab
        self._update_cache()
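

# ---------------------------------------------------------------------------
# Usage sketch (editorial addition, not part of the NeMo module). The regex,
# the sample SMILES-like strings, and the file names below are illustrative
# assumptions, not values shipped with MegaMolBART.
if __name__ == "__main__":
    import tempfile

    # Keep two-character "Cl"/"Br" tokens; split everything else one character
    # at a time (the compiled pattern falls back to "." for unmatched input).
    tokenizer = RegExTokenizer(regex=r"Cl|Br|[A-Za-z0-9=#()]")

    # Learn a vocabulary from a small text file, one sample per line.
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write("CC(=O)Cl\nBrCC#N\n")
    tokenizer.build_vocab_from_text(tmp.name)
    os.remove(tmp.name)

    text = "CC(=O)Cl"
    tokens = tokenizer.text_to_tokens(text)  # ['C', 'C', '(', '=', 'O', ')', 'Cl']
    ids = tokenizer.text_to_ids(text)  # one id per token

    # Decoding is batch-oriented: ids_to_text expects a list of id lists.
    assert tokenizer.ids_to_text([ids]) == [text]

    # Save and reload; the vocab path is inferred from the regex path.
    tokenizer.save_tokenizer(regex_file="demo.model")  # also writes demo.vocab
    restored = RegExTokenizer().load_tokenizer(regex_file="demo.model")
    assert restored.text_to_ids(text) == ids
    os.remove("demo.model")
    os.remove("demo.vocab")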