o
    ei                     @   s  d Z ddlZddlZddlmZ ddlmZ ee	Z
dddZi d	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*i d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLi dMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsZdtdu ZG dvdw dweZdwgZdS )xz)Tokenization classes for Salesforce CTRL.    N   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchar rE   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairs[   s   rG   c                       sf   e Zd ZdZeZeZd fdd	Ze	dd Z
dd Zd	d
 Zdd Zdd Zdd Zdd Z  ZS )CTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    <unk>c                    s   t |dd}t|| _W d    n1 sw   Y  dd | j D | _t |dd}| ddd }W d    n1 sCw   Y  dd	 |D }tt	|t
t|| _i | _d
| _t jd|dd
dd| d S )Nzutf-8)encodingc                 S   s   i | ]\}}||qS rE   rE   ).0kvrE   rE   rF   
<dictcomp>   s    z*CTRLTokenizer.__init__.<locals>.<dictcomp>
r>   c                 S   s   g | ]}t | qS rE   )tuplesplit)rK   mergerE   rE   rF   
<listcomp>   s    z*CTRLTokenizer.__init__.<locals>.<listcomp>T	all_zerosnone)	unk_tokentoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_patternrE   )openjsonloadencoderitemsdecoderreadrR   dictziprangelen	bpe_rankscacheadd_bpe_version_headersuper__init__)selfr   r   rW   kwargsvocab_handlemerges_handlemerges	__class__rE   rF   rj      s&   
zCTRLTokenizer.__init__c                 C   s
   t | jS N)re   r^   rk   rE   rE   rF   
vocab_size   s   
zCTRLTokenizer.vocab_sizec                 C   s   t | jfi | jS rr   )rb   r^   added_tokens_encoderrs   rE   rE   rF   	get_vocab   s   zCTRLTokenizer.get_vocabc           
         s  | j v r
 j | S t|}tt|d d |d d g }t|}|s'|S 	 t| fddd}| jvr8ny|\}}g }d}|t|k rz|||}	W n ty`   |	||d   Y n?w |	|||	  |	}|| |kr|t|d k r||d  |kr|
||  |d	7 }n|
||  |d7 }|t|k sFt|}|}t|dkrnt|}q(d
|}|d d }| j |< |S )NrP   z</w>Tc                    s    j | tdS )Ninf)rf   getfloat)pairrs   rE   rF   <lambda>   s    z#CTRLTokenizer.bpe.<locals>.<lambda>)keyr   r>      @@ )rg   rQ   listrG   minrf   re   index
ValueErrorextendappendjoin)
rk   tokenrA   rB   bigramfirstsecondnew_wordijrE   rs   rF   bpe   sN   

"
,


zCTRLTokenizer.bpec                 C   s8   g }t d|}|D ]}|t| |d q
|S )zTokenize a string.z\S+\n? )refindallr   r   r   rR   )rk   textsplit_tokenswordsr   rE   rE   rF   	_tokenize   s
   zCTRLTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r^   rx   rW   )rk   r   rE   rE   rF   _convert_token_to_id   s   z"CTRLTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r`   rx   rW   )rk   r   rE   rE   rF   _convert_id_to_token   s   z"CTRLTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.r   r~    )r   replacestrip)rk   tokens
out_stringrE   rE   rF   convert_tokens_to_string   s   z&CTRLTokenizer.convert_tokens_to_string)rI   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesCONTROL_CODEScontrol_codesrj   propertyrt   rv   r   r   r   r   r   __classcell__rE   rE   rp   rF   rH   k   s    
,
rH   )r   r\   regexr   tokenization_pythonr   utilsr   
get_loggerr   loggerr   r   rG   rH   __all__rE   rE   rE   rF   <module>   s   
	
 !"#$%&'()*+,-./01234;
w