o
    	۷i&                     @   s   d Z ddlZddlZddlmZ ddlmZmZ e rddlZddl	m
Z
 ddlmZ eeZdd	iZd
d ZG dd dZG dd de
ZdgZdS )z Tokenization classes for CPMAnt.    N)Optional)is_rjieba_availablerequires_backends   )PreTrainedTokenizer)logging
vocab_filez	vocab.txtc                 C   sf   t  }t| ddd}| }W d   n1 sw   Y  t|D ]\}}|d}|||< q#|S )z*Loads a vocabulary file into a dictionary.rutf-8encodingN
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextoken r   d/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/cpmant/tokenization_cpmant.py
load_vocab$   s   


r   c                   @   s   e Zd ZdddZdd ZdS )	WordpieceTokenizer<unk>   c                 C   s   || _ || _|| _d S N)r   	unk_tokenmax_input_chars_per_word)selfr   r    r!   r   r   r   __init__0   s   
zWordpieceTokenizer.__init__c                 C   s   t |}t|| jkr| jgS d}g }|t|k rXt|}d }||k r<d||| }|| jv r4|}n|d8 }||k s#|d u rK|| j |d7 }n|| |}|t|k s|S )Nr       )listlenr!   r    joinr   append)r"   r   charsstart
sub_tokensend
cur_substrsubstrr   r   r   tokenize5   s,   


zWordpieceTokenizer.tokenizeN)r   r   )__name__
__module____qualname__r#   r0   r   r   r   r   r   /   s    
r   c                
       s@  e Zd ZdZeZddgZdZ							
			d4 fdd	Ze	dd Z
e	dd Ze	dd Ze	defddZdd Zdd Z fddZdd  Zd!ee defd"d#Zd$d% Zd&d' Zd5d)ed*ee dee fd+d,Z	(d5d-ee d.eee  dee fd/d0Z	d6d-ee d.eee  d1edee f fd2d3Z  ZS )7CpmAntTokenizera  
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
    	input_idsattention_maskF<d></d><s></s><pad>r   </n></_>leftc                    s   t | dg || _|| _t|| _| j|	 | jd< | j| | jd< | j|	= | j|= tt| j dd d| _dd | j D | _	t
| j|d	| _t jd||||||||	|
d
	| d S )Nrjieba r   c                 S      | d S Nr%   r   xr   r   r   <lambda>       z*CpmAntTokenizer.__init__.<locals>.<lambda>keyc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>   s    z,CpmAntTokenizer.__init__.<locals>.<dictcomp>)r   r    )		bod_token	eod_token	bos_token	eos_token	pad_tokenr    
line_tokenspace_tokenpadding_sider   )r   rM   rN   r   encoderr   r   sorteditemsdecoderr   wordpiece_tokenizersuperr#   )r"   r   rM   rN   rO   rP   rQ   r    rR   rS   rT   kwargs	__class__r   r   r#   l   s0   


zCpmAntTokenizer.__init__c                 C      | j | j S r   )rU   rM   r"   r   r   r   bod_token_id      zCpmAntTokenizer.bod_token_idc                 C   r^   r   )rU   rN   r_   r   r   r   eod_token_id   ra   zCpmAntTokenizer.eod_token_idc                 C   s
   | j d S )Nr   rU   r_   r   r   r   
newline_id      
zCpmAntTokenizer.newline_idreturnc                 C   s
   t | jS r   )r'   rU   r_   r   r   r   
vocab_size   re   zCpmAntTokenizer.vocab_sizec                 C   s   t | jfi | jS r   )dictrU   added_tokens_encoderr_   r   r   r   	get_vocab   s   zCpmAntTokenizer.get_vocabc                 C   s,   g }t |dD ]}|| j| q|S )zTokenize a string.F)r?   cutextendrY   r0   )r"   textoutput_tokensrD   r   r   r   	_tokenize   s   zCpmAntTokenizer._tokenizec                    s4   dd |D } fdd|D }t  j|fi |S )zDecode ids into a string.c                 S   s   g | ]}|d kr|qS )r   r   )rI   ir   r   r   
<listcomp>   s    z+CpmAntTokenizer._decode.<locals>.<listcomp>c                    s.   g | ]}| j kr| jkr| jkr|qS r   )pad_token_ideos_token_idbos_token_id)rI   rD   r_   r   r   rq      s    ()rZ   _decode)r"   	token_idsr[   r\   r_   r   ru      s
   
zCpmAntTokenizer._decodec                 C   s
   || j v S r   rc   r"   r   r   r   r   check      
zCpmAntTokenizer.checkr   c                 C   s
   d |S )Nr$   )r(   )r"   r   r   r   r   convert_tokens_to_string   ry   z(CpmAntTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)rU   getr    rw   r   r   r   _convert_token_to_id   s   z$CpmAntTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)rX   r{   r    )r"   r   r   r   r   _convert_id_to_token   s   z$CpmAntTokenizer._convert_id_to_tokenNsave_directoryfilename_prefixc                 C   s*  t j|rt j||r|d ndtd  }n
|r|d nd| }d}d| jv r5| jd | jd< | jd= d| jv rF| jd | jd< | jd= tt| j	 d	d
 d| _t
|ddd.}| j	 D ]\}}||krutd| d |}||d  |d7 }qbW d    |fS 1 sw   Y  |fS )N-r$   r   r   r@   r=   r   r<   c                 S   rA   rB   r   rC   r   r   r   rE      rF   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>rG   wr
   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r%   )ospathisdirr(   VOCAB_FILES_NAMESrU   r   r   rV   rW   r   loggerwarningwrite)r"   r~   r   r   r   writerr   token_indexr   r   r   save_vocabulary   s6   






zCpmAntTokenizer.save_vocabularytoken_ids_0token_ids_1c                 C   s,   |du r
| j g| S | j g| | j g | S )a1  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`list[int]`): The first tokenized sequence that special tokens will be added.
            token_ids_1 (`list[int]`): The optional second tokenized sequence that special tokens will be added.

        Returns:
            `list[int]`: The model input with special tokens.
        N)rt   )r"   r   r   r   r   r    build_inputs_with_special_tokens   s   z0CpmAntTokenizer.build_inputs_with_special_tokensalready_has_special_tokensc                    sZ   |rt  j||ddS |dur#dgdgt|  dg dgt|  S dgdgt|  S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`): List of IDs.
            token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r   r   r   Nr%   r   )rZ   get_special_tokens_maskr'   )r"   r   r   r   r\   r   r   r      s   (z'CpmAntTokenizer.get_special_tokens_mask)	r7   r8   r9   r:   r;   r   r<   r=   r>   r   )NF)r1   r2   r3   __doc__r   vocab_files_namesmodel_input_namesadd_prefix_spacer#   propertyr`   rb   rd   intrg   rj   ro   ru   rx   r&   strrz   r|   r}   r   tupler   r   boolr   __classcell__r   r   r\   r   r4   O   sb    *


 


r4   )r   r   r   typingr   transformers.utilsr   r   r?   tokenization_utilsr   utilsr   
get_loggerr1   r   r   r   r   r4   __all__r   r   r   r   <module>   s    
  
B