o
    ߥiD                     @   s   d Z ddlZddlmZmZmZmZ ddlZddl	Z
ddlmZ ddlmZmZ ddlmZ ddlmZ e Zdd	iZG d
d dZG dd dZG dd deZdS )z!Tokenization classes for ChatGLM.    N)DictListOptionalUnion)PreTrainedTokenizer)BatchEncodingEncodedInput)PaddingStrategy)loggerzTHUDM/chatglm-6bi   c                   @   sV   e Zd Zdd Zdd Zdee fddZdd	 Zd
d Z	dd Z
dd Zdd ZdS )TextTokenizerc                 C   s&   t  | _| j| | j | _d S N)spmSentencePieceProcessorspLoad
vocab_size
num_tokens)self
model_path r   ^/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/chatglm/tokenization.py__init__   s   
zTextTokenizer.__init__c                 C      | j |S r   )r   EncodeAsIdsr   textr   r   r   encode      zTextTokenizer.encodeidsc                 C   r   r   )r   	DecodeIds)r   r   r   r   r   decode   r   zTextTokenizer.decodec                 C   r   r   )r   EncodeAsPiecesr   r   r   r   tokenize!   r   zTextTokenizer.tokenizec                    s    fdd|D S )Nc                    s   g | ]} j |qS r   r   	PieceToId).0tokenr   r   r   
<listcomp>%       z7TextTokenizer.convert_tokens_to_ids.<locals>.<listcomp>r   )r   tokensr   r'   r   convert_tokens_to_ids$   s   z#TextTokenizer.convert_tokens_to_idsc                 C   r   r   r#   r   r&   r   r   r   convert_token_to_id'   r   z!TextTokenizer.convert_token_to_idc                 C   r   r   )r   	IdToPiece)r   idxr   r   r   convert_id_to_token*   r   z!TextTokenizer.convert_id_to_tokenc                 C      | j S r   )r   r'   r   r   r   __len__-      zTextTokenizer.__len__N)__name__
__module____qualname__r   r   r   intr    r"   r+   r-   r0   r2   r   r   r   r   r      s    r   c                   @   s   e Zd Z			d#ddZdd Zedefd	d
Zedd Ze	dd Z
e	dd Zed$dedefddZd%defddZ			d&dedee fddZdee defddZ			d&dedee fddZdeeef fd d!Zd"S )'SPTokenizer N  P   Tc                 C   s<   |d usJ || _ || _g d| _|| _|| _t|| _d S )N)[MASK][gMASK]z[sMASK]z
<unused_0><sop><eop>z<ENC>z<dBLOCK>)
vocab_filenum_image_tokensspecial_tokensmax_blank_lengthbyte_fallbackr   text_tokenizer)r   r?   r@   rB   rC   r   r   r   r   3   s   
zSPTokenizer.__init__c                 C   r1   r   )rD   r'   r   r   r   _get_text_tokenizerE   r3   zSPTokenizer._get_text_tokenizerlengthc                 C   s   | dksJ d|  dS )N   z<|blank_z|>r   )rF   r   r   r   get_blank_tokenH   s   zSPTokenizer.get_blank_tokenc                   C   s   dS )Nz<|tab|>r   r   r   r   r   get_tab_tokenM   s   zSPTokenizer.get_tab_tokenc                 C      | j jS r   )rD   r   r'   r   r   r   num_text_tokensQ   s   zSPTokenizer.num_text_tokensc                 C   s   | j | j S r   )r@   rK   r'   r   r   r   r   U   s   zSPTokenizer.num_tokensr   max_lenc                 C   s<   |  dt } t|ddD ]}|  d| t|} q| S )N	    )replacer8   rI   rangerH   )r   rL   ir   r   r   _encode_whitespacesY   s   zSPTokenizer._encode_whitespacesc                 C   s(   |r| dd}|r| j|| jd}|S )N
<n>)rL   )rQ   rT   rB   )r   r   	linebreakwhitespacesr   r   r   _preprocess`   s   zSPTokenizer._preprocessreturnc                    sN     |||}|sd| }  |} fdd|D }|r!|S |dd S )  
        @param text: Text to encode.
        @param linebreak: Whether to encode newline (
) in text.
        @param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
        @param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
        @param add_dummy_prefix: Whether to add dummy blank space in the beginning.
        rV   c                    s   g | ]}| j  qS r   r@   )r%   xr'   r   r   r(   x   s    z&SPTokenizer.encode.<locals>.<listcomp>rG   N)rY   rE   r   )r   r   rW   rX   add_dummy_prefixtmpr*   r   r'   r   r   h   s   zSPTokenizer.encodetext_idsc                    sz    fdd|D }dd |D }   |}|dd}|t d}td jd D ]}| |d	| }q-|S )
Nc                    s   g | ]	}t | j qS r   )r7   r@   r%   _idr'   r   r   r(   |   s    z&SPTokenizer.decode.<locals>.<listcomp>c                 S   s   g | ]}|d kr|qS )r   r   ra   r   r   r   r(   }   r)   rV   rU   rM   rG   rN   rP   )rE   r    rQ   r8   rI   rR   rB   rH   )r   r`   r   r   rS   r   r'   r   r    {   s   zSPTokenizer.decodec                 C   s<   |  |||}|sd| }|  |}|r|S |dd S )r[   rV   rG   N)rY   rE   r"   )r   r   rW   rX   r^   r*   r   r   r   r"      s
   zSPTokenizer.tokenizer]   c                 C   s   t |tr|| jk rd|S | j|| j S t |tr@|dr7|dr7|dd 	 r7t|dd S | j
|| j S td)Nz
<image_{}>z<image_>   rO   zThe key should be str or int.)
isinstancer7   r@   formatrD   r0   str
startswithendswithisdigitr-   
ValueError)r   r]   r   r   r   __getitem__   s&   



zSPTokenizer.__getitem__N)r9   r:   T)r:   )TT)TTT)r4   r5   r6   r   rE   staticmethodr7   rH   rI   propertyrK   r   rg   rT   rY   r   r   r    r"   r   rl   r   r   r   r   r8   1   sF    






r8   c                       sV  e Zd ZdZddiZeZg dZ								
		d4	d5 fddZe	de
e fddZe	de
e fddZe	dd Zdd Zdd Zdd Z		d6deeee f ded edefd!d"Zd#d$ Zd%d& Zd7d'd(Z	d7d)ee d*e
ee  dee fd+d,Zdejddfd-eeeef ef d.e
e d/ed0e
e d1e
e de fd2d3Z!  Z"S )8ChatGLMTokenizera  
    Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file: Path to the vocabulary file.
        do_lower_case: Use lower case letters.
        remove_space: Remove spaces.
        bos_token: The bos token
        eos_token: The Eos Token
        end_token: The end token
        mask_token: The mask token
        gmask_token: The gmask token
        padding_side: The padding side
        num_image_tokens: The `num_image_tokens` in `SPTokenizer`
    r?   zice_text.model)	input_idsattention_maskposition_idsFr=   r>   </s>r;   r<   leftr9   rZ   Nc                    sh   t ||
d| _t jd|||	||||||
d	| || _|| _|| _|| _|| _|| _	|| _
|| _d S )Nr\   )	do_lower_caseremove_spacepadding_side	bos_token	eos_token	end_token
mask_tokengmask_tokenr@   r   )r8   sp_tokenizersuperr   ru   rv   r?   rx   ry   rz   r{   r|   )r   r?   ru   rv   rx   ry   rz   r{   r|   rw   r@   kwargs	__class__r   r   r      s2   
zChatGLMTokenizer.__init__c                 C   s   | j d u rd S | | j S r   )r|   r+   r'   r   r   r   gmask_token_id   s   
zChatGLMTokenizer.gmask_token_idc                 C   s   | j du rdS | | j S )z
        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
        set.
        N)rz   r+   r'   r   r   r   end_token_id   s   
zChatGLMTokenizer.end_token_idc                 C   rJ   )z Returns vocab size )r}   r   r'   r   r   r   r      s   zChatGLMTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )z Returns vocab as a dict c                    s   i | ]}  ||qS r   )_convert_id_to_token)r%   rS   r'   r   r   
<dictcomp>   s    
z.ChatGLMTokenizer.get_vocab.<locals>.<dictcomp>)rR   r   updateadded_tokens_encoder)r   vocabr   r'   r   	get_vocab   s
   
zChatGLMTokenizer.get_vocabc                 C   s0   | j rd|  }n|}| jr| }|S )NrP   )rv   joinstripsplitru   lower)r   inputsoutputsr   r   r   preprocess_text  s   z ChatGLMTokenizer.preprocess_textc                 K   s   |  |}| j|}|S )z Returns a tokenized string. )r   r}   r"   )r   r   r   seqr   r   r   	_tokenize  s   
zChatGLMTokenizer._tokenizeT	token_idsskip_special_tokensclean_up_tokenization_spacesc                 K   sH   t |tr|g}t|dkrdS | j|v rtt| jj|}| j|S )Nr    )	re   r7   lenpad_token_idlistfilter__ne__r}   r    )r   r   r   r   r   r   r   r   _decode  s   

zChatGLMTokenizer._decodec                 C   
   | j | S )z2 Converts a token (str) in an id using the vocab. r}   r,   r   r   r   _convert_token_to_id#     
z%ChatGLMTokenizer._convert_token_to_idc                 C   r   )z=Converts an index (integer) in a token (str) using the vocab.r   )r   indexr   r   r   r   '  r   z%ChatGLMTokenizer._convert_id_to_tokenc                 C   s   t j|rt j|| jd }n|}t| jd}| }W d   n1 s(w   Y  t|d}|| W d   |fS 1 sDw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        r?   rbNwb)	ospathisdirr   vocab_files_namesopenr?   readwrite)r   save_directoryfilename_prefixr?   fin	proto_strwriterr   r   r   save_vocabulary+  s   

z ChatGLMTokenizer.save_vocabularytoken_ids_0token_ids_1c                 C   s   | j | j }| j | j }| j | j }||vr||vr||g7 }|d |kr4|d |kr4|| j | j g7 }|| j | j g7 }|durR|rI|d |krN||g7 }||7 }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        rO   N)r}   r{   r|   ry   rz   rx   )r   r   r   mask_ids	gmask_idseos_idr   r   r    build_inputs_with_special_tokensF  s   

z1ChatGLMTokenizer.build_inputs_with_special_tokensencoded_inputs
max_lengthpadding_strategypad_to_multiple_ofreturn_attention_maskc                 C   sb  | j | j }| j | j }| j | j }| jdksJ || jd  }	t|	}
|tjkr-t|	}|durC|durC|| dkrC|| d | }|tj	koMt|	|k}|durd|vr||	v r`|	
|}n|
}td|
|
f}t|}d|ddddd|f< t|dk }||d< d|vrtj|
tjd}||	v r|n|}||	v r|	
|}|||d< ttj|tjdtjd|
| d tjdg}tj||gdd	|d< |r/|t|	 }d|v rtj|d d
|df|dfgddd|d< d|v r| jg| |d  |d< d|v rdg| |d  |d< d|v r"tj|d d
|dfgd|d< | jg| |	 || jd < |S )a?  
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        rt   r   NrN   rq   g      ?rr   )dtype)axis)r   r   constantT)	pad_widthmodeconstant_valuestoken_type_idsspecial_tokens_mask)r   )r}   rx   r{   r|   rw   model_input_namesr   r	   LONGEST
DO_NOT_PADr   nponestrilbool_arangeint64concatenatezerosstackpadpad_token_type_idr   )r   r   r   r   r   r   bos_token_idmask_token_idr   required_input
seq_lengthneeds_to_be_paddedcontext_lengthrq   rr   r{   mask_positionblock_position_ids
differencer   r   r   _padl  s    








zChatGLMTokenizer._pad)	FFr=   r>   rs   r;   r<   rt   r9   )rZ   N)FTr   )#r4   r5   r6   __doc__r   &PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesr   r   rn   r   r7   r   r   r   r   r   r   r   r   boolrg   r   r   r   r   r   r	   r   r   r   r   dictr   __classcell__r   r   r   r   ro      s    '	
	




)ro   )r   r   typingr   r   r   r   numpyr   sentencepiecer   transformers.tokenization_utilsr   $transformers.tokenization_utils_baser   r   transformers.utilsr	   modelscope.utilsr
   logging
get_loggerr   r   r8   ro   r   r   r   r   <module>   s    x