o
    ei                     @   sX   d Z ddlZddlmZ ddlmZ eeZddiZ	dd	 Z
G d
d deZdgZdS )zTokenization classes for ESM.    N   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                 C   sH   t | d}|  }dd |D W  d    S 1 sw   Y  d S )Nrc                 S   s   g | ]}|  qS  )strip).0lr   r   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/esm/tokenization_esm.py
<listcomp>   s    z#load_vocab_file.<locals>.<listcomp>)openread
splitlines)r   flinesr   r   r   load_vocab_file   s   $r   c                
       s   e Zd ZdZeZddgZ					d' fd	d
	Zdede	fddZ
de	defddZdd Zdd Zde	defddZdede	fddZ	d(dee dee dB dee fddZ	d)dededB d edee fd!d"Zd#d$ Zedefd%d&Z  ZS )*EsmTokenizerz&
    Constructs an ESM tokenizer.
    	input_idsattention_mask<unk><cls><pad><mask><eos>c                    sf   t || _tt| j| _dd t| jD | _t jd|||||d| | j| _| 	| j d S )Nc                 S   s   i | ]\}}||qS r   r   )r
   indtokr   r   r   
<dictcomp>5   s    z)EsmTokenizer.__init__.<locals>.<dictcomp>)	unk_token	cls_token	pad_token
mask_token	eos_tokenr   )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r   r    r!   r"   r#   kwargs	__class__r   r   r*   )   s   

zEsmTokenizer.__init__indexreturnc                 C      | j || jS Nr'   getr   r-   r1   r   r   r   _convert_id_to_tokenE      z!EsmTokenizer._convert_id_to_tokentokenc                 C      | j || j | jS r4   r(   r6   r   r-   r:   r   r   r   _convert_token_to_idH      z!EsmTokenizer._convert_token_to_idc                 K   s   |  S r4   )split)r-   textr.   r   r   r   	_tokenizeK   s   zEsmTokenizer._tokenizec                 C   s   | j  }|| j |S r4   )r(   copyupdateadded_tokens_encoder)r-   
base_vocabr   r   r   	get_vocabN   s   
zEsmTokenizer.get_vocabc                 C   r;   r4   r<   r=   r   r   r   token_to_idS   r?   zEsmTokenizer.token_to_idc                 C   r3   r4   r5   r7   r   r   r   id_to_tokenV   r9   zEsmTokenizer.id_to_tokenNtoken_ids_0token_ids_1c                 C   s\   | j g}| jg}|d u r| jd u r|| S || | S | jd u r$td|| | | | S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r-   rJ   rK   clssepr   r   r    build_inputs_with_special_tokensY   s   

z-EsmTokenizer.build_inputs_with_special_tokensFalready_has_special_tokensc                    sd   |r|dur
t d fdd|D S dgdgt|  dg }|dur0|dgt| dg 7 }|S )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        NzYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.c                    s   g | ]}| j v rd ndqS )   r   )all_special_ids)r
   r:   r-   r   r   r      s    z8EsmTokenizer.get_special_tokens_mask.<locals>.<listcomp>rS   r   )rN   len)r-   rJ   rK   rR   maskr   rU   r   get_special_tokens_maskg   s   z$EsmTokenizer.get_special_tokens_maskc                 C   sd   t j||r
|d ndd }t|d}|d| j W d    |fS 1 s*w   Y  |fS )N- r   w
)ospathjoinr   writer$   )r-   save_directoryfilename_prefixr   r   r   r   r   save_vocabulary   s   
zEsmTokenizer.save_vocabularyc                 C   s
   t | jS r4   )rV   r$   rU   r   r   r   
vocab_size   s   
zEsmTokenizer.vocab_size)r   r   r   r   r   r4   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr*   intstrr8   r>   rB   rG   rH   rI   listrQ   boolrX   rc   propertyrd   __classcell__r   r   r/   r   r   !   sJ    


r   )rh   r]   tokenization_pythonr   utilsr   
get_loggerre   loggerri   r   r   __all__r   r   r   r   <module>   s   

p