from typing import Dict, List, Union

import numpy as np
import torch

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.utils import logging

__all__ = ['AggregateTokenizer', 'TokenizerWrapper']


# Minimal stand-in tokenizer: only exposes the joint vocabulary across all
# sub-tokenizers, for components that expect a `tokenizer` attribute.
class DummyTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.vocab_size = len(vocab)

    def get_vocab(self):
        return self.vocab


class AggregateTokenizer(TokenizerSpec):
    '''
    AggregateTokenizer, allowing one to combine multiple regular monolingual tokenizers into a single tokenizer.
    The intuition is that we can use existing tokenizers "as is", without retraining, and associate each tokenizer
    with a language id during text processing (the language id is used to route an incoming text sample to the
    right tokenizer), as well as with a token id range for detokenization (e.g. [0..127] for tokenizer A,
    [128..255] for tokenizer B), so that the original text can be reconstructed. Note that we assume the incoming
    dict of langs / tokenizers is ordered, e.g. the first tokenizer is assigned the lowest interval of token ids.

    Args:
        tokenizers: dict of tokenizers; keys are lang ids, values are the actual tokenizers
    '''

    def __init__(self, tokenizers: Dict):
        self.tokenizers_dict = tokenizers
        self.vocabulary = []

        # token id offset of each sub-tokenizer, keyed by lang id
        self.token_id_offset = {}
        # the same offsets, keyed by the ordinal number of the tokenizer
        self.token_id_offset_by_tokenizer_num = {}

        offset = 0
        i = 0
        for lang, tokenizer in self.tokenizers_dict.items():
            self.token_id_offset[lang] = offset
            self.token_id_offset_by_tokenizer_num[i] = offset
            offset += len(tokenizer.vocab)
            i += 1

        for tokenizer in self.tokenizers_dict.values():
            self.vocabulary.extend(tokenizer.vocab)

        self.vocab_size = len(self.vocabulary)
        logging.info(f'Aggregate vocab size: {self.vocab_size}')

        # for compatibility only -- exposes the joint vocab across all tokenizers
        self.tokenizer = DummyTokenizer(self.vocabulary)

        # per-token-id lookup tables that make detokenization an O(1) lookup
        offset_token_ids_by_token_id, tokenizers_by_token_id, langs_by_token_id = self._calculate_offsets()

        self.offset_token_ids_by_token_id = offset_token_ids_by_token_id
        self.tokenizers_by_token_id = tokenizers_by_token_id
        self.langs_by_token_id = langs_by_token_id

    def _calculate_offsets(self):
        offsets = {}
        tokenizers = {}
        langs = {}
        cur_num = 0
        tot = len(self.tokenizers_dict)
        for id in range(len(self.vocabulary)):
            # local (sub-tokenizer) id of this aggregate id
            off_id = id - list(self.token_id_offset.values())[cur_num]
            if cur_num + 1 < tot:
                # crossed into the next tokenizer's id range
                if id >= list(self.token_id_offset.values())[cur_num + 1]:
                    cur_num += 1
                    off_id = id - list(self.token_id_offset.values())[cur_num]
            offsets[id] = off_id
            tokenizers[id] = list(self.tokenizers_dict.values())[cur_num]
            langs[id] = list(self.tokenizers_dict.keys())[cur_num]

        return offsets, tokenizers, langs
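
    # A worked example of the lookup tables built above (illustrative only; the
    # lang ids, vocab sizes, and tokenizers A/B are hypothetical). With two
    # sub-tokenizers {'en': A, 'es': B}, each with a 128-token vocab:
    #   token_id_offset              -> {'en': 0, 'es': 128}
    #   offset_token_ids_by_token_id -> {0: 0, 1: 1, ..., 127: 127, 128: 0, ..., 255: 127}
    #   tokenizers_by_token_id       -> {0: A, 1: A, ..., 128: B, ..., 255: B}
    #   langs_by_token_id            -> {0: 'en', ..., 128: 'es', ..., 255: 'es'}
    # so any aggregate id maps back to (sub-tokenizer, local id, lang) in O(1).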

    def text_to_tokens(self, text, lang_id):
        tokenizer = self.tokenizers_dict[lang_id]
        return tokenizer.text_to_tokens(text)

    def text_to_ids(self, text, lang_id):
        tokenizer = self.tokenizers_dict[lang_id]
        token_ids = tokenizer.text_to_ids(text)
        # shift the sub-tokenizer ids into this tokenizer's aggregate id range
        token_ids[:] = [t + self.token_id_offset[lang_id] for t in token_ids]

        return token_ids
zAggregateTokenizer.text_to_idsc                 C   s(   t |tjr
| }| j| }||S r
   )
isinstancenpndarraytolistr   decode_pieces)r   tokensr5   r"   r   r   r   tokens_to_textx   s   


    def ids_to_text(self, ids):
        if isinstance(ids, (np.ndarray, torch.Tensor)):
            ids = ids.tolist()

        tokens = []
        for id in ids:
            offset_id = self.offset_token_ids_by_token_id[id]
            tokenizer = self.tokenizers_by_token_id[id]
            tokens.extend(tokenizer.ids_to_tokens([offset_id]))
        text = ''.join(tokens).replace('▁', ' ')

        return text
zAggregateTokenizer.ids_to_textc                 C   s   | j | }||| j|  S r
   )r   token_to_idr   )r   tokenr5   r"   r   r   r   rN      s   
zAggregateTokenizer.token_to_idc                 C   s@   g }|D ]}| j | }| j| }||gd }|| q|S )Nr   )r$   r%   rH   append)r   rK   rA   r1   rL   r"   rO   r   r   r   rH      s   


    def ids_to_text_and_langs(self, ids):
        text_and_langs = []

        for id in ids:
            offset_id = self.offset_token_ids_by_token_id[id]
            tokenizer = self.tokenizers_by_token_id[id]
            token = tokenizer.ids_to_tokens([offset_id])[0]
            text = token.replace('▁', ' ')
            text = text.strip()  # strip for display purposes
            lang = self.langs_by_token_id[id]
            text_and_langs.append({'char': text, 'lang': lang})

        return text_and_langs

    def ids_to_words_and_langs(self, ids):
        words_and_langs = []

        word_ids = []  # token ids belonging to the current word
        for id in ids:
            offset_id = self.offset_token_ids_by_token_id[id]
            tokenizer = self.tokenizers_by_token_id[id]
            token = tokenizer.ids_to_tokens([offset_id])[0]
            if token.startswith('▁'):
                if len(word_ids) > 0:
                    # emit the word accumulated so far
                    word = self.ids_to_text(word_ids)
                    word = word.strip()  # strip for display purposes
                    lang = self.ids_to_lang(word_ids)
                    wl = {'word': word, 'lang': lang}
                    words_and_langs.append(wl)
                    word_ids = []  # start a new word
            word_ids.append(id)

        if len(word_ids) > 0:  # add the last word
            word = self.ids_to_text(word_ids)
            word = word.strip()
            lang = self.ids_to_lang(word_ids)
            wl = {'word': word, 'lang': lang}
            words_and_langs.append(wl)

        return words_and_langs
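
    # Illustration of the word-grouping above (hypothetical tokens): for the
    # token stream ['▁hel', 'lo', '▁mun', 'do'], a '▁' prefix marks a word
    # start, so the ids split into two words; each word's lang is the majority
    # vote over its token ids via ids_to_lang, yielding e.g.
    # [{'word': 'hello', 'lang': 'en'}, {'word': 'mundo', 'lang': 'es'}].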

    def ids_to_lang(self, ids):
        lang_cnts = {}

        for id in ids:
            lang = self.langs_by_token_id[id]
            lang_cnt = lang_cnts.get(lang)
            if lang_cnt is not None:
                lang_cnts[lang] = lang_cnt + 1
            else:
                lang_cnts[lang] = 1

        # return the most frequent lang among the given ids
        max_lang = ''
        max_lang_cnt = -1
        for lang, lang_cnt in lang_cnts.items():
            if lang_cnt > max_lang_cnt:
                max_lang = lang
                max_lang_cnt = lang_cnt

        return max_lang
zAggregateTokenizer.ids_to_langrA   r.   returnc                 C   sT   t |tr|g}t |tr|g}g }t|D ]\}}|| }|| || q|S r
   )r<   str	enumeraterP   rN   )r   rA   r.   rK   r(   rO   r5   r   r   r   tokens_to_ids   s   


    def get_bos(self, lang_id: str) -> int:
        return self.tokenizers_dict[lang_id].bos + self.token_id_offset[lang_id]

    def get_eos(self, lang_id: str) -> int:
        return self.tokenizers_dict[lang_id].eos + self.token_id_offset[lang_id]

    @property
    def vocab(self):
        return self.vocabulary

    @property
    def langs(self):
        return list(self.tokenizers_dict.keys())
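

# A minimal usage sketch (illustrative; assumes `en_tokenizer` and `es_tokenizer`
# are already-trained monolingual tokenizers implementing the TokenizerSpec
# interface -- the names and lang ids here are hypothetical):
#
#   agg = AggregateTokenizer({'en': en_tokenizer, 'es': es_tokenizer})
#   ids = agg.text_to_ids('hello world', 'en')   # ids fall in the 'en' id range
#   text = agg.ids_to_text(ids)                  # lang is inferred per id, no lang arg
#
# Because the dict is ordered, 'en' ids start at 0 and 'es' ids start at
# len(en_tokenizer.vocab).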


class TokenizerWrapper:
    """
    Provide a unified interface for NeMo Tokenizer, AggregateTokenizer, and (char) Parser.
    """

    def __init__(self, tokenizer):
        self._tokenizer = tokenizer
        # pick the dispatch implementation once, based on the wrapped object's type
        if isinstance(tokenizer, AggregateTokenizer):
            self._impl = self._call_agg_tokenizer
        elif isinstance(tokenizer, TokenizerSpec):
            self._impl = self._call_tokenizer
        else:
            self._impl = self._call_parser

    def __call__(self, text: str, lang: str | None = None):
        return self._impl(text, lang)

    def _call_agg_tokenizer(self, text: str, lang: str | None = None):
        assert lang is not None, "Expected 'lang' to be set for AggregateTokenizer."
        return self._tokenizer.text_to_ids(text, lang)

    def _call_tokenizer(self, text: str, lang: str | None = None):
        return self._tokenizer.text_to_ids(text)

    def _call_parser(self, text: str, lang: str | None = None):
        return self._tokenizer(text)
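
# Usage sketch for TokenizerWrapper (illustrative; `agg` is the hypothetical
# AggregateTokenizer from the example above):
#
#   tok = TokenizerWrapper(agg)
#   ids = tok('hello world', lang='en')   # dispatches to text_to_ids(text, lang)
#
# Wrapping a plain TokenizerSpec calls text_to_ids(text) and ignores `lang`;
# wrapping a char Parser simply calls the parser on the text.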