o
    	۷i                     @   sd   d Z ddlZddlmZ ddlmZ ddlmZ ee	Z
ddiZd	d
 ZG dd deZdgZdS )zTokenization classes for ESM.    N)Optional   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                 C   sH   t | d}|  }dd |D W  d    S 1 sw   Y  d S )Nrc                 S   s   g | ]}|  qS  )strip).0lr	   r	   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/esm/tokenization_esm.py
<listcomp>    s    z#load_vocab_file.<locals>.<listcomp>)openread
splitlines)r   flinesr	   r	   r   load_vocab_file   s   $r   c                
       s   e Zd ZdZeZddgZ					d' fd	d
	Zdede	fddZ
de	defddZdd Zdd Zde	defddZdede	fddZ	d(dee deee  dee fddZ	d)dedee d edee fd!d"Zd#d$ Zedefd%d&Z  ZS )*EsmTokenizerz&
    Constructs an ESM tokenizer.
    	input_idsattention_mask<unk><cls><pad><mask><eos>c                    sf   t || _tt| j| _dd t| jD | _t jd|||||d| | j| _| 	| j d S )Nc                 S   s   i | ]\}}||qS r	   r	   )r   indtokr	   r	   r   
<dictcomp>7   s    z)EsmTokenizer.__init__.<locals>.<dictcomp>)	unk_token	cls_token	pad_token
mask_token	eos_tokenr	   )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r    r!   r"   r#   r$   kwargs	__class__r	   r   r+   +   s   

zEsmTokenizer.__init__indexreturnc                 C      | j || jS Nr(   getr    r.   r2   r	   r	   r   _convert_id_to_tokenG      z!EsmTokenizer._convert_id_to_tokentokenc                 C      | j || j | jS r5   r)   r7   r    r.   r;   r	   r	   r   _convert_token_to_idJ      z!EsmTokenizer._convert_token_to_idc                 K   s   |  S r5   )split)r.   textr/   r	   r	   r   	_tokenizeM   s   zEsmTokenizer._tokenizec                 C   s   | j  }|| j |S r5   )r)   copyupdateadded_tokens_encoder)r.   
base_vocabr	   r	   r   	get_vocabP   s   
zEsmTokenizer.get_vocabc                 C   r<   r5   r=   r>   r	   r	   r   token_to_idU   r@   zEsmTokenizer.token_to_idc                 C   r4   r5   r6   r8   r	   r	   r   id_to_tokenX   r:   zEsmTokenizer.id_to_tokenNtoken_ids_0token_ids_1c                 C   s\   | j g}| jg}|d u r| jd u r|| S || | S | jd u r$td|| | | | S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r.   rK   rL   clssepr	   r	   r    build_inputs_with_special_tokens[   s   

z-EsmTokenizer.build_inputs_with_special_tokensFalready_has_special_tokensc                    sd   |r|dur
t d fdd|D S dgdgt|  dg }|dur0|dgt| dg 7 }|S )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        NzYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.c                    s   g | ]}| j v rd ndqS )   r   )all_special_ids)r   r;   r.   r	   r   r      s    z8EsmTokenizer.get_special_tokens_mask.<locals>.<listcomp>rT   r   )rO   len)r.   rK   rL   rS   maskr	   rV   r   get_special_tokens_maski   s   z$EsmTokenizer.get_special_tokens_maskc                 C   sd   t j||r
|d ndd }t|d}|d| j W d    |fS 1 s*w   Y  |fS )N- r   w
)ospathjoinr   writer%   )r.   save_directoryfilename_prefixr   r   r	   r	   r   save_vocabulary   s   
zEsmTokenizer.save_vocabularyc                 C   s
   t | jS r5   )rW   r%   rV   r	   r	   r   
vocab_size   s   
zEsmTokenizer.vocab_size)r   r   r   r   r   r5   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr+   intstrr9   r?   rC   rH   rI   rJ   listr   rR   boolrY   rd   propertyre   __classcell__r	   r	   r0   r   r   #   sJ    


r   )ri   r^   typingr   tokenization_utilsr   utilsr   
get_loggerrf   loggerrj   r   r   __all__r	   r	   r	   r   <module>   s   

p