o
    Gi04                     @   sn   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZ G dd dZG dd	 d	eZdS )
    N)SentencePieceProcessor)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategyc                
   @   s   e Zd ZdefddZddefddZddeded	ed
ee fddZ	dee d
efddZ
dee d
efddZdd Zdd ZdS )SPTokenizer
model_pathc                 C   s   t j|s
J |t|d| _| j | _| j | _| j | _| j	 | _
| j | j ks4J g d}g d| }i | _i | _|D ]}| j| j|< || j| j< |  jd7  _qFddd |D | _d S )N)
model_file)z
<|system|>z<|user|><|assistant|>z<|observation|>)z[MASK][gMASK]z[sMASK]sopeop   |c                 S   s   g | ]}t |qS  )reescape).0tokenr   r   X/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/kolors/tokenizer.py
<listcomp>.   s    z(SPTokenizer.__init__.<locals>.<listcomp>)ospathisfiler   sp_model
vocab_sizen_wordsbos_ideos_idunk_idpad_idget_piece_sizespecial_tokensindex_special_tokensjoinrole_special_token_expression)selfr   role_special_tokensr"   r   r   r   r   __init__   s    zSPTokenizer.__init__Fsc              	   C   s   |rKd}g }t | j|D ](}|| k r$|| j|||   ||| |   | }q|t	|k rI|| j||d   |S | j|S )Nr   )
r   finditerr%   startextendr   EncodeAsPiecesappendendlen)r&   r)   encode_special_tokens
last_indextmatchr   r   r   tokenize0   s   
zSPTokenizer.tokenizeboseosreturnc                 C   s>   t |tsJ | j|}|r| jg| }|r|| jg }|S N)
isinstancestrr   encoder   r   )r&   r)   r6   r7   r3   r   r   r   r<   ?   s   zSPTokenizer.encoder3   c                 C   sh   dg }}|D ] }|| j v r"|r|| j|7 }g }|| j | 7 }q|| q|r2|| j|7 }|S )N )r#   r   decoder.   )r&   r3   textbufferr   r   r   r   r>   H   s   

zSPTokenizer.decodetokensc                 C   s   | j |}|S r9   )r   DecodePieces)r&   rA   r?   r   r   r   decode_tokensV   s   zSPTokenizer.decode_tokensc                 C   s    || j v r
| j | S | j|S z0Converts a token (str) in an id using the vocab.)r"   r   	PieceToIdr&   r   r   r   r   convert_token_to_idZ   s   

zSPTokenizer.convert_token_to_idc                 C   s@   || j v r
| j | S || j| j| jfv s|dk rdS | j|S )=Converts an index (integer) in a token (str) using the vocab.r   r=   )r#   r   r   r    r   	IdToPiecer&   indexr   r   r   convert_id_to_token`   s
   

zSPTokenizer.convert_id_to_tokenN)F)FF)__name__
__module____qualname__r;   r(   r5   boollistintr<   r>   rC   rG   rL   r   r   r   r   r      s     	r   c                       s  e Zd ZddiZg dZ			d< fdd	Zdd	 Zed
efddZ	e	j
defddZ	ed
efddZej
defddZedd Zed
efddZej
defddZedd Zedd Zdd Zdd Zdd  Zd!d" Zd#ee d
efd$d%Zd=d'd(Zd)d* Zd+d, Zd>d.d/Z	&d=d0ee d1ee d&B d
ee fd2d3Zd&ejd&d&d&fd4eeef e B d5ed&B d6ed7ed&B d8e!d&B d9e!d&B d
efd:d;Z"  Z#S )?ChatGLMTokenizer
vocab_fileztokenizer.model)	input_idsattention_maskposition_idsleftFc                    sT   d| _ || _t|| _| jj| jj| jjd| _|| _t	 j
d|||d| d S )NGLMTokenizer)z<bos><eos><pad>)padding_sideclean_up_tokenization_spacesr1   r   )namerT   r   	tokenizerr   r   r    r"   r1   superr(   )r&   rT   r\   r]   r1   kwargs	__class__r   r   r(   n   s   

zChatGLMTokenizer.__init__c                 C   s@   || j v r
| j | S || jj v sJ | d| j | jj | S )Nz is not a special token for )r"   r_   r^   rF   r   r   r   get_command   s   

 zChatGLMTokenizer.get_commandr8   c                 C      dS Nz<unk>r   r&   r   r   r   	unk_token      zChatGLMTokenizer.unk_tokenvaluec                 C   
   || _ d S r9   )
_unk_tokenr&   rj   r   r   r   rh         
c                 C   re   rf   r   rg   r   r   r   	pad_token   ri   zChatGLMTokenizer.pad_tokenc                 C   rk   r9   )
_pad_tokenrm   r   r   r   ro      rn   c                 C   
   |  dS )Nr[   rd   rg   r   r   r   pad_token_id   rn   zChatGLMTokenizer.pad_token_idc                 C   re   )Nz</s>r   rg   r   r   r   	eos_token   ri   zChatGLMTokenizer.eos_tokenc                 C   rk   r9   )
_eos_tokenrm   r   r   r   rt      rn   c                 C   rq   )NrZ   rr   rg   r   r   r   eos_token_id   rn   zChatGLMTokenizer.eos_token_idc                 C   s   | j jS r9   )r_   r   rg   r   r   r   r      s   zChatGLMTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )zReturns vocab as a dictc                    s   i | ]}  ||qS r   )_convert_id_to_token)r   irg   r   r   
<dictcomp>   s    z.ChatGLMTokenizer.get_vocab.<locals>.<dictcomp>)ranger   updateadded_tokens_encoder)r&   vocabr   rg   r   	get_vocab   s   zChatGLMTokenizer.get_vocabc                 K   s   | j j|| jdS )N)r1   )r_   r5   r1   )r&   r?   ra   r   r   r   	_tokenize   s   zChatGLMTokenizer._tokenizec                 C      | j |S rD   )r_   rG   rF   r   r   r   _convert_token_to_id      z%ChatGLMTokenizer._convert_token_to_idc                 C   r   )rH   )r_   rL   rJ   r   r   r   rw      r   z%ChatGLMTokenizer._convert_id_to_tokenrA   c                 C   r   r9   )r_   rC   )r&   rA   r   r   r   convert_tokens_to_string   s   z)ChatGLMTokenizer.convert_tokens_to_stringNc                 C   s   t j|rt j|| jd }n|}t| jd}| }W d   n1 s(w   Y  t|d}|| W d   |fS 1 sDw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `tuple(str)`: Paths to the files saved.
        rT   rbNwb)	r   r   isdirr$   vocab_files_namesopenrT   readwrite)r&   save_directoryfilename_prefixrT   fin	proto_strwriterr   r   r   save_vocabulary   s   

z ChatGLMTokenizer.save_vocabularyc                 C   s   |  d|  dg}|S )Nr   r   rr   )r&   prefix_tokensr   r   r   get_prefix_tokens   s   z"ChatGLMTokenizer.get_prefix_tokensc                 C   sN   |dv sJ ||  d| dg| j| d }| j|}|| }|S )N)systemuser	assistantobservationz<|z|>
)rd   r_   r<   )r&   rolemetadatamessagerole_tokensmessage_tokensrA   r   r   r   build_single_message   s
   &z%ChatGLMTokenizer.build_single_messager   c              
   C   s   |d u rg }g }|D ].}|d }|d dkr(d|v r(|d t j|d ddd }|| |d |d	d
| q
|| |d
| || dg | j|gdddS )Ncontentr   r   toolsr      F)indentensure_asciir   r=   r
   ptT)return_tensorsis_split_into_words)jsondumpsr,   r   getrd   batch_encode_plus)r&   queryhistoryr   rU   itemr   r   r   r   build_chat_input   s   "z!ChatGLMTokenizer.build_chat_inputtoken_ids_0token_ids_1c                 C   s0   |   }|| }|dur|| | dg }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`list[int]`):
                list of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        NrZ   )r   rd   )r&   r   r   r   r   r   r    build_inputs_with_special_tokens   s
   z1ChatGLMTokenizer.build_inputs_with_special_tokensencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskr\   c                 C   s  | j dksJ || jd  }t|}|tjkrt|}|dur1|dur1|| dkr1|| d | }|tjko;t||k}	d|vrGdg| |d< d|vrStt||d< |	r|t| }
d|v rjdg|
 |d  |d< d|v rydg|
 |d  |d< | jg|
 | || jd < |S )a7  
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        rX   r   Nr   rV   rW   )	r\   model_input_namesr0   r   LONGEST
DO_NOT_PADrQ   rz   rs   )r&   r   r   r   r   r   r\   required_input
seq_lengthneeds_to_be_padded
differencer   r   r   _pad  s(   $
zChatGLMTokenizer._pad)rX   FFr9   )Nr   )$rM   rN   rO   r   r   r(   rd   propertyr;   rh   setterro   rs   rt   rv   r   r~   r   r   rw   rQ   r   r   r   r   r   rR   r   r   r   dictr   r   rP   r   __classcell__r   r   rb   r   rS   i   s~    






rS   )r   r   r   sentencepiecer   transformersr   $transformers.tokenization_utils_baser   r   transformers.utilsr   r   rS   r   r   r   r   <module>   s   P