o
    ߥi'                     @   sv   d dl Z d dlmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ G dd dZG d	d
 d
e	ZdS )    N)DictListOptionalUnion)SentencePieceProcessor)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategyc                
   @   s   e Zd ZdefddZdefddZ		ddeded	ed
ee fddZ	dee d
efddZ
dee d
efddZdd Zdd ZdS )SPTokenizer
model_pathc                 C   s   t j|s
J |t|d| _| j | _| j | _| j | _| j	 | _
| j | j ks4J g d}i | _i | _|D ]}| j| j|< || j| j< |  jd7  _q@d S )N)
model_file)z[MASK][gMASK]z[sMASK]sopeop   )ospathisfiler   sp_model
vocab_sizen_wordsbos_ideos_idunk_idpad_idget_piece_sizespecial_tokensindex_special_tokens)selfr   r   token r!   _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/chatglm2/tokenization.py__init__   s   zSPTokenizer.__init__sc                 C      | j |S N)r   EncodeAsPieces)r   r$   r!   r!   r"   tokenize       zSPTokenizer.tokenizeFboseosreturnc                 C   s@   t |tu sJ | j|}|r| jg| }|r|| jg }|S r&   )typestrr   encoder   r   )r   r$   r*   r+   tr!   r!   r"   r/   #   s   zSPTokenizer.encoder0   c                 C   r%   r&   )r   decode)r   r0   r!   r!   r"   r1   /   r)   zSPTokenizer.decodetokensc                 C   s   | j |}|S r&   )r   DecodePieces)r   r2   textr!   r!   r"   decode_tokens2   s   zSPTokenizer.decode_tokensc                 C   s    || j v r
| j | S | j|S z2 Converts a token (str) in an id using the vocab. )r   r   	PieceToIdr   r    r!   r!   r"   convert_token_to_id6   s   

zSPTokenizer.convert_token_to_idc                 C   s6   || j v s|| j| j| jfv s|dk rdS | j|S )=Converts an index (integer) in a token (str) using the vocab.r    )r   r   r   r   r   	IdToPiecer   indexr!   r!   r"   convert_id_to_token<   s   zSPTokenizer.convert_id_to_tokenN)FF)__name__
__module____qualname__r.   r#   r(   boolr   intr/   r1   r5   r9   r?   r!   r!   r!   r"   r   
   s$    
r   c                       sD  e Zd ZddiZg dZd1 fdd	Zdd Zed	efd
dZ	edd Z
ed	efddZedd Zedd Zdd Zdd Zdd Zdd Zdee d	efddZd2d d!Zd"d# Zd2d$d%Z	d2d&ee d'eee  d	ee fd(d)Zdejddfd*eeeef ef d+ee d,ed-ee d.ee  d	e!fd/d0Z"  Z#S )3ChatGLM2Tokenizer
vocab_fileztokenizer.model)	input_idsattention_maskposition_idsleftc                    sJ   d| _ || _t|| _| jj| jj| jjd| _t j	dd|i| d S )NGLMTokenizer)z<bos><eos><pad>padding_sider!   )
namerF   r   	tokenizerr   r   r   r   superr#   )r   rF   rN   kwargs	__class__r!   r"   r#   J   s   
zChatGLM2Tokenizer.__init__c                 C   s@   || j v r
| j | S || jj v sJ | d| j | jj | S )Nz is not a special token for )r   rP   rO   r8   r!   r!   r"   get_commandV   s   

 zChatGLM2Tokenizer.get_commandr,   c                 C      dS )Nz<unk>r!   r   r!   r!   r"   	pad_token\      zChatGLM2Tokenizer.pad_tokenc                 C   
   |  dS )NrM   rU   rW   r!   r!   r"   pad_token_id`      
zChatGLM2Tokenizer.pad_token_idc                 C   rV   )Nz</s>r!   rW   r!   r!   r"   	eos_tokend   rY   zChatGLM2Tokenizer.eos_tokenc                 C   rZ   )NrL   r[   rW   r!   r!   r"   eos_token_idh   r]   zChatGLM2Tokenizer.eos_token_idc                 C   s   | j jS r&   )rP   r   rW   r!   r!   r"   r   l   s   zChatGLM2Tokenizer.vocab_sizec                    s(    fddt  jD }| j |S )z Returns vocab as a dict c                    s   i | ]}  ||qS r!   )_convert_id_to_token).0irW   r!   r"   
<dictcomp>r   s    
z/ChatGLM2Tokenizer.get_vocab.<locals>.<dictcomp>)ranger   updateadded_tokens_encoder)r   vocabr!   rW   r"   	get_vocabp   s
   
zChatGLM2Tokenizer.get_vocabc                 K   r%   r&   )rP   r(   )r   r4   rR   r!   r!   r"   	_tokenizey   r)   zChatGLM2Tokenizer._tokenizec                 C   r%   r6   )rP   r9   r8   r!   r!   r"   _convert_token_to_id|      z&ChatGLM2Tokenizer._convert_token_to_idc                 C   r%   )r:   )rP   r?   r=   r!   r!   r"   r`      rk   z&ChatGLM2Tokenizer._convert_id_to_tokenr2   c                 C   r%   r&   )rP   r5   )r   r2   r!   r!   r"   convert_tokens_to_string   r)   z*ChatGLM2Tokenizer.convert_tokens_to_stringNc                 C   s   t j|rt j|| jd }n|}t| jd}| }W d   n1 s(w   Y  t|d}|| W d   |fS 1 sDw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        rF   rbNwb)	r   r   isdirjoinvocab_files_namesopenrF   readwrite)r   save_directoryfilename_prefixrF   fin	proto_strwriterr!   r!   r"   save_vocabulary   s   

z!ChatGLM2Tokenizer.save_vocabularyc                 C   s   |  d|  dg}|S )Nr   r   r[   )r   prefix_tokensr!   r!   r"   get_prefix_tokens   s   z#ChatGLM2Tokenizer.get_prefix_tokensc                 C   sX   |d u rg }d}t |D ]\}\}}|d|d ||7 }q|dt|d |7 }|S )Nr;   u    [Round {}]

问：{}

答：{}

r   u   [Round {}]

问：{}

答：)	enumerateformatlen)r   queryhistorypromptrb   	old_queryresponser!   r!   r"   build_prompt   s   
zChatGLM2Tokenizer.build_prompttoken_ids_0token_ids_1c                 C   s0   |   }|| }|dur|| | dg }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        NrL   )r|   rU   )r   r   r   r{   r!   r!   r"    build_inputs_with_special_tokens   s   z2ChatGLM2Tokenizer.build_inputs_with_special_tokensencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskc           
      C   s  | j dksJ || jd  }t|}|tjkrt|}|dur1|dur1|| dkr1|| d | }|tjko;t||k}d|vrGdg| |d< d|vrStt||d< |r|t| }	d|v rjdg|	 |d  |d< d|v rydg|	 |d  |d< | jg|	 | || jd < |S )a?  
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        rJ   r   Nr   rH   rI   )	rN   model_input_namesr   r
   LONGEST
DO_NOT_PADlistrd   r\   )
r   r   r   r   r   r   required_input
seq_lengthneeds_to_be_padded
differencer!   r!   r"   _pad   sN    
zChatGLM2Tokenizer._pad)rJ   r&   )$r@   rA   rB   rq   r   r#   rU   propertyr.   rX   r\   r^   r_   r   rh   ri   rj   r`   r   rl   rz   r|   r   rD   r   r   r
   r   r   r   r	   r   rC   dictr   __classcell__r!   r!   rS   r"   rE   E   s`    


	



rE   )r   typingr   r   r   r   sentencepiecer   transformersr   $transformers.tokenization_utils_baser   r	   transformers.utilsr
   r   rE   r!   r!   r!   r"   <module>   s    ;