o
    i}/                     @   sn   d Z ddlZddlZddlmZ ddlmZ ddlmZ e	e
Zddd	Zd
d ZG dd deZdgZdS )z Tokenization classes for BioGPT.    N)Optional   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_filec                 C   s6   t  }| d }| dd D ]}|||f |}q|S )z
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    r      N)setadd)wordpairs	prev_charchar r   b/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/biogpt/tokenization_biogpt.py	get_pairs!   s   r   c                
       s  e Zd ZdZeZddgZ					d. fdd		Zed
d Z	dd Z
dd Zdd Zdd Zd/ddZdd Zdd Zdd Z	d0dee deee  d ee fd!d"Z	d1dee deee  d#ed ee f fd$d%Zd0d&ed'ee d ee fd(d)Zd*d+ Zd,d- Z  ZS )2BioGptTokenizera:  
    Construct an FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    	input_idsattention_mask<unk><s></s><pad>c                    s  zdd l }	W n ty   tdw d| _|	| _i | _i | _	 t|dd}
t|
| _	W d    n1 s6w   Y  dd | j	
 D | _t|dd}| dd d	 }W d    n1 sbw   Y  d
d |D }tt|tt|| _i | _t jd|||||d| d S )Nr   zqYou need to install sacremoses to use BioGptTokenizer. See https://pypi.org/project/sacremoses/ for installation.enutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>x   s    z,BioGptTokenizer.__init__.<locals>.<dictcomp>
c                 S   s    g | ]}t | d d qS )N   )tuplesplit)r   merger   r   r   
<listcomp>{        z,BioGptTokenizer.__init__.<locals>.<listcomp>)	bos_token	eos_token	sep_token	unk_token	pad_tokenr   )
sacremosesImportErrorlangsmcache_moses_tokenizercache_moses_detokenizeropenjsonloadencoderitemsdecoderreadr%   dictziprangelen	bpe_rankscachesuper__init__)selfr   r   r,   r)   r*   r+   r-   kwargsr.   vocab_handlemerges_handlemerges	__class__r   r   rB   \   s>   
zBioGptTokenizer.__init__c                 C   s
   t | jS )zReturns vocab size)r>   r7   rC   r   r   r   
vocab_size   s   
zBioGptTokenizer.vocab_sizec                 C   s   t | jfi | jS N)r;   r7   added_tokens_encoderrJ   r   r   r   	get_vocab   s   zBioGptTokenizer.get_vocabc                 C   s:   || j vr| jj|d}|| j |< | j | j|ddddS )Nr0   TF)aggressive_dash_splits
return_strescape)r2   r1   MosesTokenizertokenize)rC   textr0   moses_tokenizerr   r   r   moses_tokenize   s   


zBioGptTokenizer.moses_tokenizec                 C   s2   || j vr| jj|d}|| j |< | j | |S )NrO   )r3   r1   MosesDetokenizer
detokenize)rC   tokensr0   moses_detokenizerr   r   r   moses_detokenize   s   

z BioGptTokenizer.moses_detokenizec           
         s~  t |d d |d d f }| jv r j| S t|}|s#|d S 	 t| fddd}| jvr4ny|\}}g }d}|t|k rz|||}	W n ty\   |||d   Y n?w ||||	  |	}|| |kr|t|d k r||d  |kr|	||  |d	7 }n|	||  |d7 }|t|k sBt |}|}t|dkrnt|}q$d

|}|dkrd}| j|< |S )Nr"   </w>Tc                    s    j | tdS )Ninf)r?   getfloat)pairrJ   r   r   <lambda>   s    z%BioGptTokenizer.bpe.<locals>.<lambda>keyr   r   r#    z
  </w>z
</w>)r$   r@   r   minr?   r>   index
ValueErrorextendappendjoin)
rC   tokenr   r   bigramfirstsecondnew_wordijr   rJ   r   bpe   sN   


,


zBioGptTokenizer.bpeFc                 C   sL   |r|  }n| || j}g }|D ]}|r#|t| | d q|S )zReturns a tokenized string.re   )r%   rW   r0   ri   listrs   )rC   rU   bypass_tokenizersplit_tokensrl   r   r   r   	_tokenize   s   
zBioGptTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r7   r_   r,   )rC   rl   r   r   r   _convert_token_to_id   s   z$BioGptTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r9   r_   r,   )rC   rg   r   r   r   _convert_id_to_token   s   z$BioGptTokenizer._convert_id_to_tokenc                 C   s.   dd |D }d | }| || j}|S )z:Converts a sequence of tokens (string) in a single string.c                 S   s    g | ]}| d d dd qS )re    r]   )replace)r   tr   r   r   r'      r(   z<BioGptTokenizer.convert_tokens_to_string.<locals>.<listcomp>rz   )rk   r%   r\   r0   )rC   rZ   rU   r   r   r   convert_tokens_to_string   s   z(BioGptTokenizer.convert_tokens_to_stringNtoken_ids_0token_ids_1returnc                 C   s,   |du r
| j g| S | j g}|| | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BioGPT sequence has the following format:

        - single sequence: `</s> X `
        - pair of sequences: `</s> A </s> B `

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)sep_token_id)rC   r~   r   sepr   r   r    build_inputs_with_special_tokens   s   z0BioGptTokenizer.build_inputs_with_special_tokensalready_has_special_tokensc                    sZ   |rt  j||ddS |dur#dgdgt|  dg dgt|  S dgdgt|  S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r~   r   r   Nr   r   )rA   get_special_tokens_maskr>   )rC   r~   r   r   rH   r   r   r     s   (z'BioGptTokenizer.get_special_tokens_masksave_directoryfilename_prefixc           
   	   C   sL  t j|std| d d S t j||r|d ndtd  }t j||r,|d ndtd  }t|ddd	}|t	j
| jd
dddd  W d    n1 sTw   Y  d}t|ddd	8}t| j dd dD ]!\}}	||	krtd| d |	}|d|d  |d7 }qnW d    ||fS 1 sw   Y  ||fS )NzVocabulary path (z) should be a directory-rz   r   r   wr   r   r#   TF)indent	sort_keysensure_asciir!   r   c                 S   s   | d S )Nr   r   )kvr   r   r   rb   ,  s    z1BioGptTokenizer.save_vocabulary.<locals>.<lambda>rc   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!re   r   )ospathisdirloggererrorrk   VOCAB_FILES_NAMESr4   writer5   dumpsr7   sortedr?   r8   warning)
rC   r   r   r   
merge_filefrg   writer
bpe_tokenstoken_indexr   r   r   save_vocabulary  s6    


zBioGptTokenizer.save_vocabularyc                 C   s   | j  }d |d< |S )Nr1   )__dict__copy)rC   stater   r   r   __getstate__8  s   
zBioGptTokenizer.__getstate__c                 C   s4   || _ zdd l}W n ty   tdw || _d S )Nr   znYou need to install sacremoses to use XLMTokenizer. See https://pypi.org/project/sacremoses/ for installation.)r   r.   r/   r1   )rC   dr.   r   r   r   __setstate__=  s   
zBioGptTokenizer.__setstate__)r   r   r   r   r   )FrL   )NF)__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesrB   propertyrK   rN   rW   r\   rs   rw   rx   ry   r}   rt   intr   r   boolr   strr$   r   r   r   __classcell__r   r   rH   r   r   .   sR    *,

,



 r   )r   r5   r   typingr   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   r   __all__r   r   r   r   <module>   s   
  
