o
    ei9                     @   s   d Z ddlZddlZddlmZmZ e rddlZddlmZ ddl	m
Z
 e
eZddiZd	d
 ZG dd dZG dd deZdgZdS )z Tokenization classes for CPMAnt.    N)is_rjieba_availablerequires_backends   )PreTrainedTokenizer)logging
vocab_filez	vocab.txtc                 C   sf   t  }t| ddd}| }W d   n1 sw   Y  t|D ]\}}|d}|||< q#|S )z*Loads a vocabulary file into a dictionary.rutf-8encodingN
)collectionsOrderedDictopen	readlines	enumeraterstrip)r   vocabreadertokensindextoken r   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/cpmant/tokenization_cpmant.py
load_vocab"   s   


r   c                   @   s   e Zd ZdddZdd ZdS )	WordpieceTokenizer<unk>   c                 C   s   || _ || _|| _d S N)r   	unk_tokenmax_input_chars_per_word)selfr   r   r    r   r   r   __init__.   s   
zWordpieceTokenizer.__init__c                 C   s   t |}t|| jkr| jgS d}g }|t|k rXt|}d }||k r<d||| }|| jv r4|}n|d8 }||k s#|d u rK|| j |d7 }n|| |}|t|k s|S )Nr       )listlenr    r   joinr   append)r!   r   charsstart
sub_tokensend
cur_substrsubstrr   r   r   tokenize3   s,   


zWordpieceTokenizer.tokenizeN)r   r   )__name__
__module____qualname__r"   r/   r   r   r   r   r   -   s    
r   c                       s   e Zd ZdZeZddgZdZ							
			d- fdd	Ze	dd Z
e	dd Ze	dd Ze	defddZdd Zdd Z fddZdd  Zd!ee defd"d#Zd$d% Zd&d' Zd.d)ed*ed(B dee fd+d,Z  ZS )/CpmAntTokenizera  
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
    	input_idsattention_maskF<d></d><s></s><pad>r   </n></_>leftc                    s   t | dg || _|| _t|| _| j|	 | jd< | j| | jd< | j|	= | j|= tt| j dd d| _dd | j D | _	t
| j|d	| _t jd||||||||	|
d
ddd| |	|fD ]}| j|d }|d ury| j|d  qe|   d S )Nrjieba r   c                 S      | d S Nr$   r   xr   r   r   <lambda>       z*CpmAntTokenizer.__init__.<locals>.<lambda>keyc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>   s    z,CpmAntTokenizer.__init__.<locals>.<dictcomp>)r   r   	all_zerosTbos)	bod_token	eod_token	bos_token	eos_token	pad_tokenr   
line_tokenspace_tokenpadding_sidetoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_patternr   )r   rN   rO   r   encoderr   r   sorteditemsdecoderr   wordpiece_tokenizersuperr"   added_tokens_encoderpop_added_tokens_decoder_update_total_vocab_size)r!   r   rN   rO   rP   rQ   rR   r   rS   rT   rU   kwargsspecial_tokentoken_id	__class__r   r   r"   j   sB   
zCpmAntTokenizer.__init__c                 C      | j | j S r   )rY   rN   r!   r   r   r   bod_token_id      zCpmAntTokenizer.bod_token_idc                 C   rh   r   )rY   rO   ri   r   r   r   eod_token_id   rk   zCpmAntTokenizer.eod_token_idc                 C   s
   | j d S )Nr   rY   ri   r   r   r   
newline_id      
zCpmAntTokenizer.newline_idreturnc                 C   s
   t | jS r   )r&   rY   ri   r   r   r   
vocab_size   ro   zCpmAntTokenizer.vocab_sizec                 C   s   t | jfi | jS r   )dictrY   r_   ri   r   r   r   	get_vocab   s   zCpmAntTokenizer.get_vocabc                 C   s,   g }t |dD ]}|| j| q|S )zTokenize a string.F)r>   cutextendr]   r/   )r!   textoutput_tokensrC   r   r   r   	_tokenize   s   zCpmAntTokenizer._tokenizec                    s4   dd |D } fdd|D }t  j|fi |S )zDecode ids into a string.c                 S   s   g | ]}|d kr|qS )r   r   )rH   ir   r   r   
<listcomp>   s    z+CpmAntTokenizer._decode.<locals>.<listcomp>c                    s.   g | ]}| j kr| jkr| jkr|qS r   )pad_token_ideos_token_idbos_token_id)rH   rC   ri   r   r   rz      s    ()r^   _decode)r!   	token_idsrc   rf   ri   r   r~      s
   
zCpmAntTokenizer._decodec                 C   s
   || j v S r   rm   r!   r   r   r   r   check      
zCpmAntTokenizer.checkr   c                 C   s
   d |S )Nr#   )r'   )r!   r   r   r   r   convert_tokens_to_string   r   z(CpmAntTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)rY   getr   r   r   r   r   _convert_token_to_id   s   z$CpmAntTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r\   r   r   )r!   r   r   r   r   _convert_id_to_token   s   z$CpmAntTokenizer._convert_id_to_tokenNsave_directoryfilename_prefixc                 C   s*  t j|rt j||r|d ndtd  }n
|r|d nd| }d}d| jv r5| jd | jd< | jd= d| jv rF| jd | jd< | jd= tt| j	 d	d
 d| _t
|ddd.}| j	 D ]\}}||krutd| d |}||d  |d7 }qbW d    |fS 1 sw   Y  |fS )N-r#   r   r   r?   r<   r   r;   c                 S   r@   rA   r   rB   r   r   r   rD      rE   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>rF   wr	   r
   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r$   )ospathisdirr'   VOCAB_FILES_NAMESrY   r   r   rZ   r[   r   loggerwarningwrite)r!   r   r   r   r   writerr   token_indexr   r   r   save_vocabulary   s6   






zCpmAntTokenizer.save_vocabulary)	r6   r7   r8   r9   r:   r   r;   r<   r=   r   )r0   r1   r2   __doc__r   vocab_files_namesmodel_input_namesadd_prefix_spacer"   propertyrj   rl   rn   intrq   rs   rx   r~   r   r%   strr   r   r   tupler   __classcell__r   r   rf   r   r3   M   s>    2


(r3   )r   r   r   transformers.utilsr   r   r>   tokenization_pythonr   utilsr   
get_loggerr0   r   r   r   r   r3   __all__r   r   r   r   <module>   s   
  
