o
    	۷i                     @   s  d Z ddlZddlZddlmZ ddlZddlmZ ddl	m
Z
 e
eZddd	Zi d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+i d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMi dNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtZdudv ZG dwdx dxeZdxgZdS )yz)Tokenization classes for Salesforce CTRL.    N)Optional   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchar rF   `/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairs^   s   rH   c                       s   e Zd ZdZeZeZd fdd	Ze	dd Z
dd Zd	d
 Zdd Zdd Zdd Zdd Zddedee dee fddZ  ZS )CTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    <unk>c                    s   t |dd}t|| _W d    n1 sw   Y  dd | j D | _t |dd}| ddd }W d    n1 sCw   Y  dd	 |D }tt	|t
t|| _i | _t jdd
|i| d S )Nutf-8encodingc                 S   s   i | ]\}}||qS rF   rF   ).0kvrF   rF   rG   
<dictcomp>   s    z*CTRLTokenizer.__init__.<locals>.<dictcomp>
r?   c                 S   s   g | ]}t | qS rF   )tuplesplit)rN   mergerF   rF   rG   
<listcomp>   s    z*CTRLTokenizer.__init__.<locals>.<listcomp>	unk_tokenrF   )openjsonloadencoderitemsdecoderreadrU   dictziprangelen	bpe_rankscachesuper__init__)selfr   r   rX   kwargsvocab_handlemerges_handlemerges	__class__rF   rG   rg      s   zCTRLTokenizer.__init__c                 C   s
   t | jS N)rc   r\   rh   rF   rF   rG   
vocab_size   s   
zCTRLTokenizer.vocab_sizec                 C   s   t | jfi | jS ro   )r`   r\   added_tokens_encoderrp   rF   rF   rG   	get_vocab   s   zCTRLTokenizer.get_vocabc           
         s  | j v r
 j | S t|}tt|d d |d d g }t|}|s'|S 	 t| fddd}| jvr8ny|\}}g }d}|t|k rz|||}	W n ty`   |	||d   Y n?w |	|||	  |	}|| |kr|t|d k r||d  |kr|
||  |d	7 }n|
||  |d7 }|t|k sFt|}|}t|dkrnt|}q(d
|}|d d }| j |< |S )NrS   z</w>Tc                    s    j | tdS )Ninf)rd   getfloat)pairrp   rF   rG   <lambda>   s    z#CTRLTokenizer.bpe.<locals>.<lambda>keyr   r?      @@ )re   rT   listrH   minrd   rc   index
ValueErrorextendappendjoin)
rh   tokenrB   rC   bigramfirstsecondnew_wordijrF   rp   rG   bpe   sN   

"
,


zCTRLTokenizer.bpec                 C   s8   g }t d|}|D ]}|t| |d q
|S )zTokenize a string.z\S+\n? )refindallr   r~   r   rU   )rh   textsplit_tokenswordsr   rF   rF   rG   	_tokenize   s
   zCTRLTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r\   ru   rX   )rh   r   rF   rF   rG   _convert_token_to_id   s   z"CTRLTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r^   ru   rX   )rh   r   rF   rF   rG   _convert_id_to_token   s   z"CTRLTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.r   r|    )r   replacestrip)rh   tokens
out_stringrF   rF   rG   convert_tokens_to_string   s   z&CTRLTokenizer.convert_tokens_to_stringNsave_directoryfilename_prefixreturnc           
   	   C   sV  t j|std| d d S t j||r|d ndtd  }t j||r,|d ndtd  }t|ddd	}|t	j
| jd
dddd  W d    n1 sTw   Y  d}t|ddd	=}|d t| j dd dD ]!\}}	||	krtd| d |	}|d|d  |d7 }qsW d    ||fS 1 sw   Y  ||fS )NzVocabulary path (z) should be a directory-r   r   r   wrK   rL   r{   TF)indent	sort_keysensure_asciirR   r   z#version: 0.2
c                 S   s   | d S )Nr?   rF   )kvrF   rF   rG   rx      s    z/CTRLTokenizer.save_vocabulary.<locals>.<lambda>ry   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r   r?   )ospathisdirloggererrorr   VOCAB_FILES_NAMESrY   writerZ   dumpsr\   sortedrd   r]   warning)
rh   r   r   r   
merge_filefr   writer
bpe_tokenstoken_indexrF   rF   rG   save_vocabulary   s8    



zCTRLTokenizer.save_vocabulary)rJ   ro   )__name__
__module____qualname____doc__r   vocab_files_namesCONTROL_CODEScontrol_codesrg   propertyrq   rs   r   r   r   r   r   strr   rT   r   __classcell__rF   rF   rm   rG   rI   n   s    
,
(rI   )r   rZ   r   typingr   regexr   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   rH   rI   __all__rF   rF   rF   rG   <module>   s   
	
 !"#$%&'()*+,-./01234; 
