o
    }oi                     @   s^   d dl Z d dlmZ d dlZd dlmZ d dlmZ dgZdZ	dZ
dd	 ZG d
d deZdS )    N)List)ColumnCodes)TokenizerSpecTabularTokenizerz<|endoftext|>
c                 C   s*   d}z|  |}W |S  ty   Y |S w )N)index
ValueError)
list_inputitemoutput r   h/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/tabular_tokenizer.pyfind_index_of   s   r   c                   @   s   e Zd ZeegdfddZdd Zedd Zdd	 Z	d
d Z
edd Zedd Zdd Zdd Zdee fddZd!ddZdd Zdd Zdd Zd S )"r   ,c                 C   s   t |tr	|| _nt|d}t|| _W d    n1 sw   Y  t| jj| _i | _	i | _
| | || _| j	t | _| j| _| j| _d S )Nrb)
isinstancer   code_columnopenpickleloadlencolumnsnum_columnsspecial_tokensspecial_tokens_decoderadd_special_tokens	delimiterEND_OF_TEXTeod_ideos_idbos_id)selfcoderr   r   handler   r   r   __init__'   s   

zTabularTokenizer.__init__c                 C      | j S N)
vocab_sizer"   r   r   r   __len__6   s   zTabularTokenizer.__len__c                 C   s   t | j d S )N   )maxr   keysr)   r   r   r   r(   9   s   zTabularTokenizer.vocab_sizec                 C   
   |  |S r'   )encoder"   textr   r   r   text_to_ids=      
zTabularTokenizer.text_to_idsc                 C   r.   r'   )decode)r"   	token_idsr   r   r   ids_to_text@   r3   zTabularTokenizer.ids_to_textc                 C   r&   r'   )r   r)   r   r   r   eodC   s   zTabularTokenizer.eodc                 C   s
   | j t S r'   )r   NEW_LINEr)   r   r   r   eorG   s   
zTabularTokenizer.eorc                    sT   |s
i  _ i  _dS t fddt|D } j | dd  j  D  _dS )z Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last
            index of the
            current vocabulary in the order of the `special_tokens` list.
        Nc                 3   s.    | ]\}}| j vr| jj| fV  qd S r'   )r   r   r(   ).0itokr)   r   r   	<genexpr>U   s    
z6TabularTokenizer.add_special_tokens.<locals>.<genexpr>c                 S   s   i | ]\}}||qS r   r   )r:   kvr   r   r   
<dictcomp>[   s    z7TabularTokenizer.add_special_tokens.<locals>.<dictcomp>)r   r   dict	enumerateupdateitems)r"   r   newr   r)   r   r   K   s   z#TabularTokenizer.add_special_tokensc           
      C   s   g }| t}t|}t|D ]a}|| }|dkrq| | j}|D ]B}| t}	t|	dkr7||  q"t|	dkra|	d dkrL||	d   |t |	d dkr`||	d   q"td||d krp|t q|S )z Tokenize a string.  r+      r   zdelimiter error)	splitr8   r   ranger   r   appendstripr	   )
r"   r1   tokensrowsnum_rowsrow_idrowfieldsfsplitsr   r   r   text_to_tokens]   s0   



zTabularTokenizer.text_to_tokensrL   c                 C   s   g }d}t |v r|t }| j| | j }|D ])}|| jv r'|| j|  q|| j }| jj| }|| j|| |d7 }q|S )z9 Converts a sequence of tokens into ids using the vocab. r   r+   )	r8   r   r   r   rJ   r   r   extendr/   )r"   rL   idscindexiddtokenr   columnr   r   r   tokens_to_idsw   s   



zTabularTokenizer.tokens_to_idsFc                 C   sN  g }| j j}t|}d}t|| j}t|| j}|dkr,|dkr,t||}	||	 | }n!|dkr=|dk r=|}	||	 | }n|dkrM|dk rM|}	||	 | }t|}
d}g }|D ]L}|| j	v rj|si|
| j	|  qX|| }t||
k d d }| j j| }||kr|g}|}n|
| t||| kr|
| j || |d7 }qX|S )z=Converts a sequence of ids in Tabular tokens using the vocab.r   r   r+   )r   sizessumr   r9   r7   minnumpycumsumr   rJ   wherer   r   r4   )r"   rV   skip_special_tokensrL   r\   ids_sizerW   eor_poseod_posrX   	cum_sizesold_column_indexr5   r;   r   column_indexrZ   r   r   r   ids_to_tokens   sD   




zTabularTokenizer.ids_to_tokensc                 C   s   |  | |S r'   )r[   rT   r0   r   r   r   r/      s   zTabularTokenizer.encodec                 C   s   | j |dd}| |S )NF)rb   )ri   tokens_to_text)r"   r5   rL   r   r   r   r4      s   
zTabularTokenizer.decodec                 C   s   g }g }|D ](}|t ks|tkr)t|dkr!| j|}|| || g }q|| qt|dkr@| j|}|| d|}|S )Nr   rF   )r   r8   r   r   joinrJ   )r"   rL   	all_lineslinerY   	line_textr1   r   r   r   rj      s   



zTabularTokenizer.tokens_to_textN)F)__name__
__module____qualname__r   r8   r%   r*   propertyr(   r2   r6   r7   r9   r   rT   r   strr[   ri   r/   r4   rj   r   r   r   r   r   &   s$    



&)r   typingr   r_   /nemo.collections.common.tokenizers.column_coderr   1nemo.collections.common.tokenizers.tokenizer_specr   __all__r   r8   r   r   r   r   r   r   <module>   s   	