o
    ߥi8                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlZddlZ	ddl
ZddlZddlmZ ddlmZ ddlmZ ddlmZmZ 								d,d
dZG dd deZ	 dZeddZefddZG dd deZg dZeeZ	 eddZefddZG dd deZ g dZ!ee!Z!G dd deZ"G dd deZ#G dd de#Z$d Z%d-d"d#Z&G d$d% d%e#Z'G d&d' d'e"Z(G d(d) d)e"Z)G d*d+ d+e"Z*dS ).zLUtilities for using and training tokenizers (char, wordpiece, sentencepiece)    N)
namedtuple)tokenize   )sp_tokenizer)GPT2Tokenizer)PRETRAINED_VOCAB_ARCHIVE_MAPBertTokenizer      ?c	                 K   s   | }
t |
trt|
}
|
tu rt|fi |	S |
tu r)|du r!d}t|fi |	S |
tu r5t|fi |	S |
||||||d}t|||S )zZ
    Helper function to instantiate a tokenizer given common combinations of options.
    Ngpt2)corpus
vocab_size
model_path
model_type	pad_tokencharacter_coverage)
isinstancestrevalBertWordPieceTokenizerGPT2BPETokenizerChineseSPTokenizer	Tokenizer)tokenizer_typer   r   r   r   r   r   command_tokenstype_tokenskwargstokenizer_classtext_tokenizer r   f/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/data_utils/tokenization.pymake_tokenizer    s(   
r    c                   @   sZ   e Zd ZdZ				dddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd ZdS )Tokenizationa  
    Tokenization object to hold tokenization, (processed text),and original
    text. Can hold tokenization as Ids or tokens.

    It also holds command tokens (pad, unk, etc.) for the tokenization.
    This allows functions to pad/operate on tokenizations without having
    access to the full tokenizer, just the tokenization.

    Several standard array operations are implemented (insert, append, extend).
    NTc                 C   sN   || _ || _| jd u r| j | _|| _| jd u r| j| _|| _|| _|   d S N)tokenizationtextoriginal_textr   asIdsparse_command_tokens)selfr#   r$   r%   r   r&   r   r   r   __init__N   s   

zTokenization.__init__c                 C   s   || _ |  S r"   )r   r'   )r(   r   r   r   r   set_command_tokens_   s   zTokenization.set_command_tokensc                 C   sF   | j d u rd S | j D ]}| jrt| |j|j q
t| |j|j q
d S r"   )r   r&   setattrnameIdtoken)r(   command_tokenr   r   r   r'   c   s   

z!Tokenization.parse_command_tokensc                 C   
   | j | S r"   )r#   )r(   indexr   r   r   __getitem__l      
zTokenization.__getitem__c                 C   s
   t | jS r"   )lenr#   r(   r   r   r   __len__o   r3   zTokenization.__len__c                 C   s   t |ttfr@| j||j |dkr#|j| j | _|j| j | _d S |t	| jd kr>|  j|j7  _|  j|j7  _d S d S t |t
rY| jd | |j | j|d   | _d S | jd | |j | j|d   | _d S Nr   r   )r   CommandToken	TypeTokenr#   insertr-   r.   r$   r%   r4   r!   )r(   idxotherr   r   r   r:   r   s8   


zTokenization.insertc                 C   s   t |ttfr | j|j |  j|j7  _|  j|j7  _| S t |t	r>| j
|j |  j|j7  _|  j|j7  _| S | j| | S r"   )r   r8   r9   r#   appendr-   r$   r.   r%   r!   extendr(   r<   r   r   r   r=      s   
zTokenization.appendc                 C   s   t |ttfr | j|j |  j|j7  _|  j|j7  _| S t |t	rSt |d ttfrS| j
dd |D  |  jdd |D 7  _|  jdd |D 7  _| S t |trq| j
|j |  j|j7  _|  j|j7  _| S | j
| | S )Nr   c                 S      g | ]}|j qS r   r-   .0or   r   r   
<listcomp>       z'Tokenization.extend.<locals>.<listcomp>c                 S   r@   r   r.   rB   r   r   r   rE      rF   c                 S   r@   r   rG   rB   r   r   r   rE      rF   )r   r8   r9   r#   r=   r-   r$   r.   r%   listr>   r!   r?   r   r   r   r>      s&   
zTokenization.extend)NNNT)__name__
__module____qualname____doc__r)   r*   r'   r2   r6   r:   r=   r>   r   r   r   r   r!   B   s    
	r!   z<{0}>r8   r,   r.   r-   c                        fdd| D S )Nc                    *   g | ]}t |d   |d  |d qS r   r   )r8   formatrC   toktoken_formatr   r   rE          z'prep_command_tokens.<locals>.<listcomp>r   	tokenlistrU   r   rT   r   prep_command_tokens      
rY   c                   @   s   e Zd ZdddZdd ZdS )r8   Fc                 C   s"   || _ || _|| _|| _|| _d S r"   )r,   r.   r-   lstriprstrip)r(   r,   r.   r-   r[   r\   r   r   r   r)      s
   
zCommandToken.__init__c                 C      t t| j| j| jS r"   )r   COMMAND_TUPLEr,   r.   r-   r5   r   r   r   __str__      zCommandToken.__str__N)FFrI   rJ   rK   r)   r_   r   r   r   r   r8      s    
))padr   )eosr   )bos   )unk   )sep   )L2R   )ENC   )MASK   r9   c                    rN   )Nc                    rO   rP   )r9   rQ   rR   rT   r   r   rE      rV   z$prep_type_tokens.<locals>.<listcomp>r   rW   r   rT   r   prep_type_tokens   rZ   rp   c                   @   s   e Zd Zdd Zdd ZdS )r9   c                 C   s   || _ || _|| _d S r"   rM   )r(   r,   r.   r-   r   r   r   r)      s   
zTypeToken.__init__c                 C   r]   r"   )r   
TYPE_TUPLEr,   r.   r-   r5   r   r   r   r_      r`   zTypeToken.__str__Nra   r   r   r   r   r9      s    ))functionr   )commandr   )str0re   )str1rg   )str2ri   )
embedding0rk   )
embedding1rm   )
embedding2ro   )arg0   )arg1	   )arg2
   c                   @   s   e Zd ZdZd,ddZd-ddZdd Zd	d
 Zdd Ze	dd Z
e	dd Ze	dd Ze	dd Ze	dd Ze	dd Ze	dd Ze	dd Zd-ddZdd  Zd-d!d"Zd.d$d%Zd.d&d'Zd.d(d)Zd.d*d+ZdS )/r   a  
    Tokenizer object that handles text tokenization, command tokens, and type tokens.

    Command tokens and text tokens are stored together in one mapping of size
    `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first
    `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`.

    Token types are stored in a separate mapping of size `len(type_tokens)`.
    Nc                    s  | _ t dst j  _|d u rt}| _dd  jD  _dd  jD  _dd  jD  _t ds=t j _	t dsI j	 j  _
|d u rOt}| _dd  jD  _d	d  jD  _d
d  jD  _t dsxt j _t j t j j  _dd  j D  _ j fdd j j D  t j j _ fdd j j D  _t j  _dd  j D  _t j  _dd  j D  _d S )Nnum_text_tokensc                 S      i | ]}|j |qS r   r,   rR   r   r   r   
<dictcomp>      z&Tokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   rG   rR   r   r   r   r         c                 S   r   r   rA   rR   r   r   r   r   
  r   num_command_tokens
num_tokensc                 S   r   r   r   rR   r   r   r   r     r   c                 S   r   r   rG   rR   r   r   r   r     r   c                 S   r   r   rA   rR   r   r   r   r     r   num_type_tokensc                 S      i | ]\}}||qS r   r   rC   r-   tr   r   r   r         c                       i | ]
\}}|| j  qS r   r   rC   r   r-   r5   r   r   r         
c                    r   r   r   r   r5   r   r   r   $  r   c                 S   r   r   r   r   r   r   r   r   *      c                 S   r   r   r   r   r   r   r   r   0  r   )r   hasattrr4   r   DEFAULT_COMMAND_TOKENS_command_tokenscommand_name_mapcommand_token_mapcommand_id_mapr   r   DEFAULT_TYPE_TOKENSr   type_name_maptype_token_maptype_id_mapr   rH   keystokens_tokensitems_vocabupdatevocab_text_tokens_text_token_vocab_command_token_tokens_command_token_vocab_token_types_token_type_vocab)r(   r   r   r   r   r5   r   r)      sP   






zTokenizer.__init__c                 C   s   | j ||dS )z(run preprocessing and encode text as Ids
process_fnEncodeAsIdsr(   r$   r   r   r   r   __call__2  s   zTokenizer.__call__c                 C      | j S )ztotal number of tokens)r   r5   r   r   r   r6   6     zTokenizer.__len__c                 C   r0   )z)get command token corresponding to `name`)r   r(   r,   r   r   r   get_command:     
zTokenizer.get_commandc                 C   r0   )z&get type token corresponding to `name`)r   r   r   r   r   get_type>  r   zTokenizer.get_typec                 C   r   )z.list (or iterable) of all tokens for tokenizerr   r5   r   r   r   r   B     zTokenizer.tokensc                 C   r   )z.dictionary mapping tokens to ids for tokenizerr   r5   r   r   r   r   G  r   zTokenizer.vocabc                 C   r   )z3list (or iterable) of all token types for tokenizer)r   r5   r   r   r   token_typesL  r   zTokenizer.token_typesc                 C   r   )z3dictionary mapping token types to ids for tokenizer)r   r5   r   r   r   token_type_vocabQ  r   zTokenizer.token_type_vocabc                 C   r   )z6list (or iterable) of all command tokens for tokenizer)r   r5   r   r   r   r   V  r   zTokenizer.command_tokensc                 C   r   )z6dictionary mapping command tokens to ids for tokenizer)r   r5   r   r   r   command_token_vocab[  r   zTokenizer.command_token_vocabc                 C   r   )4list (or iterable) of text tokens for text tokenizer)r   r5   r   r   r   text_tokens`  r   zTokenizer.text_tokensc                 C   r   )z8dictionary mapping text tokens to ids for text tokenizer)r   r5   r   r   r   text_token_vocabe  r   zTokenizer.text_token_vocabc                    s\   |}|dur
||}dt fdd fdd} j}|||}t|||}| j |S )zY
        encode text using text tokenizer and shift Id values for command tokens
        Ntok_extendedc                 S      g }| j }||}t|D ]H\}}| jr|dkr| }| jr,|t|d k r,| }|dkr8|s8|| q|t|d krJ|rH|| q	 q|rQ|| || q|S r7   r.   split	enumerater\   r[   r4   r=   r   r$   resultrS   
split_textisub_textr   r   r   split_on_tokenr  $   

z-Tokenizer.EncodeAsIds.<locals>.split_on_tokenc                       |  sg S | s j|S g }|g}| D ]}g }|D ]}| jvr+||| q|| q|}qttj	 fdd|D S )Nc                 3   s2    | ]}| j vr |n j| jgV  qd S r"   )r   _encoder   r-   rC   r.   r5   r   r   	<genexpr>  s    


zATokenizer.EncodeAsIds.<locals>.split_on_tokens.<locals>.<genexpr>
stripr   encoder   r>   r=   rH   	itertoolschainfrom_iterabletok_listr$   tokenized_text	text_listrS   r   r(   r   r   r   split_on_tokens  (   

z.Tokenizer.EncodeAsIds.<locals>.split_on_tokensr8   r   r!   r*   r(   r$   r   processed_textr   no_split_tokensIdsr#   r   r   r   r   j  s   
zTokenizer.EncodeAsIdsc                 C   s   t r"   NotImplementedErrorr(   r$   r   r   r   r        zTokenizer._encodec                 C   s    | j j||d}|| j |S )z<
        encode text as tokens using text tokenizer
        r   )r   EncodeAsTokensr*   r   )r(   r$   r   r#   r   r   r   r     s
   zTokenizer.EncodeAsTokensFc                 C   sL   t |ttfr
|jS |r| j| jS || jk r| j| jS | j|| j S )z:convert Id to token accounting for command and type tokens)	r   r9   r8   r.   r   r   r   r   	IdToTokenr(   r-   
type_tokenr   r   r   r        
zTokenizer.IdToTokenc                 C   sL   t |ttfr
|jS |r| j| jS || jv r| j| jS | j|| j S )z:convert token to Id accounting for command and type tokens)	r   r9   r8   r-   r   r   r   	TokenToIdr   r(   r.   r   r   r   r   r     r   zTokenizer.TokenToIdc                    s   |rd  fdd|D S g }g }t|tr|j}|D ];}t|tr5| j| g }||j q| j	k rO| j| g }| j
| j q|| j	  q|g kre| j| d |S )z
        convert Ids to tokens accounting for command and type tokens, tokens
        are joined and returned as a string.
         c                 3   ,    | ]}t |tr|jn j| jV  qd S r"   r   r9   r.   r   rC   r-   r5   r   r   r         
z&Tokenizer.DecodeIds.<locals>.<genexpr>)joinr   r!   r#   r8   r=   r   	DecodeIdsr.   r   r   )r(   r   r   rtn_strscurrent_strr-   r   r5   r   r     s*   



zTokenizer.DecodeIdsc                 C   s   |rd dd |D S g }g }t|tr|j}|D ]4}t|tr3|| j| g }||j q|| j	v rI|| j| g }|| q|| q|g kr\|| j| d |S )zT
        convert tokens to a string accounting for command and type tokens.
        r   c                 s   $    | ]}t |tr|jn|V  qd S r"   r   r9   r.   rC   r   r   r   r   r         
z)Tokenizer.DecodeTokens.<locals>.<genexpr>)
r   r   r!   r#   r8   r=   r   DecodeTokensr.   r   )r(   Tokensr   r   r   r   r   r   r   r     s*   




zTokenizer.DecodeTokens)NNr"   F)rI   rJ   rK   rL   r)   r   r6   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      s<    


7








E

	



r   c                   @   s   e Zd ZdZdd ZdddZdd Zed	d
 Zedd Z	e
dd Zdd ZdddZdddZdd Zdd Zdd Zdd ZdS )TextTokenizerz&
    Interface for text tokenizer
    c                 C   s*   t | dsd| _t | ds| j| _d S d S )Nr   r   r   )r   r   r   r5   r   r   r   r)     s
   

zTextTokenizer.__init__Nc                 C   s   |  ||S r"   r   r   r   r   r   r        zTextTokenizer.__call__c                 C   r   r"   r   r5   r   r   r   r6        zTextTokenizer.__len__c                 C      t d)r   z-TextTokenizer tokens property not implementedr   r5   r   r   r   r        zTextTokenizer.tokensc                 C   r   )z dictionary mapping tokens to idsz,TextTokenizer vocab property not implementedr   r5   r   r   r   r     r  zTextTokenizer.vocabc                 C   r   )z1check if the filepath for a text tokenizer existsz+TextTokenizer exists method not implementedr   r   r   r   r   exists#  r  zTextTokenizer.existsc                 C   r   )z@train a tokenizer on a data corpus and save model for future usez#TextTokenizer Train not implementedr   r(   r   r   r   r   Train)     zTextTokenizer.Trainc                 C   r   )z
        Preprocess text and encode as ids. Return a tokenization object with
        original text, processed text, and id tokenization.
        z)TextTokenizer EncodeAsIds not implementedr   r   r   r   r   r   -  s   zTextTokenizer.EncodeAsIdsc                 C   r   )z
        Preprocess text and encode as tokens. Return a tokenization object with
        original text, processed text, and token tokenization.
        z,TextTokenizer EncodeAsTokens not implementedr   r   r   r   r   r   4  s   zTextTokenizer.EncodeAsTokensc                 C   r   )z4Convert an Id to Token. Reverse lookup of self.vocabz'TextTokenizer IdToToken not implementedr   r(   r-   r   r   r   r   <  r  zTextTokenizer.IdToTokenc                 C   r   )z+Convert a Token to Id. Lookup of self.vocabz'TextTokenizer TokenToId not implementedr   r(   r.   r   r   r   r   @  r  zTextTokenizer.TokenToIdc                 C   r   )z=Convert a list or tokenization object of Ids to a text stringz'TextTokenizer DecodeIds not implementedr   r(   r   r   r   r   r   D  r  zTextTokenizer.DecodeIdsc                 C   r   )z@Convert a list or tokenization object of tokens to a text stringz*TextTokenizer DecodeTokens not implementedr   r(   r   r   r   r   r   H  r  zTextTokenizer.DecodeTokensr"   )rI   rJ   rK   rL   r)   r   r6   r   r   r   staticmethodr  r  r   r   r   r   r   r   r   r   r   r   r     s$    





r   c                       s   e Zd ZdZ fddZdd Zedd Zdd	 Ze	d
d Z
e	dd ZdddZdddZdd Zdd Zdd Zdd Z  ZS )CharacterLevelTokenizerzD
    Text tokenizer for ASCII-256 Character Level Tokenization.
    c                    sH   d _ tt    fddt j D  _dd t jD  _d S )N   c                       g | ]}  |qS r   r   r   r5   r   r   rE   U  s    
z4CharacterLevelTokenizer.__init__.<locals>.<listcomp>c                 S   r   r   r   rC   r   r   r   r   r   r   X  r   z4CharacterLevelTokenizer.__init__.<locals>.<dictcomp>)r   superr  r)   ranger   r   r   )r(   r   	__class__r5   r   r)   R  s   
z CharacterLevelTokenizer.__init__c                 C      dS )Nr  r   r5   r   r   r   r6   Z  r   zCharacterLevelTokenizer.__len__c                 C   r  )NTr   r  r   r   r   r  ]  s   zCharacterLevelTokenizer.existsc                 C   s   d S r"   r   r  r   r   r   r  a  r   zCharacterLevelTokenizer.Trainc                 C   r   r"   r   r5   r   r   r   r   d  r   zCharacterLevelTokenizer.tokensc                 C   r   r"   r   r5   r   r   r   r   h  r   zCharacterLevelTokenizer.vocabNc                    s:   |}|dur||}t |} fdd|D }t|||S )zconvert text to ascii 256 IdsNc                    r  r   r   rC   cr5   r   r   rE   r  r   z7CharacterLevelTokenizer.EncodeAsIds.<locals>.<listcomp>r   r!   r(   r$   r   r   r   r   r5   r   r   l  s   z#CharacterLevelTokenizer.EncodeAsIdsc                 C   s:   |}|dur
||}t |}dd |D }t|||ddS )z$convert text to ascii 256 charactersNc                 S   s   g | ]}|qS r   r   r  r   r   r   rE   {  s    z:CharacterLevelTokenizer.EncodeAsTokens.<locals>.<listcomp>Fr&   r  r  r   r   r   r   u  s   z&CharacterLevelTokenizer.EncodeAsTokensc                 C      t |S )zascii index to character)chrr  r   r   r   r   ~  r  z!CharacterLevelTokenizer.IdToTokenc                 C   r  )zascii character to index)ordr  r   r   r   r     r  z!CharacterLevelTokenizer.TokenToIdc                    s(   t |tr|j}d fdd|D S )z:converts ascii ids to tokens before joining them into text c                    r  r   r  rR   r5   r   r   rE     r   z5CharacterLevelTokenizer.DecodeIds.<locals>.<listcomp>r   r!   r#   r   r	  r   r5   r   r     s   
z!CharacterLevelTokenizer.DecodeIdsc                 C   s   t |tr|j}d|S )z(just concatenates ascii tokens into textr  r   r
  r   r   r   r     s   

z$CharacterLevelTokenizer.DecodeTokensr"   )rI   rJ   rK   rL   r)   r6   r  r  r  r   r   r   r   r   r   r   r   r   __classcell__r   r   r  r   r  M  s"    




		r  i tsvc                 C   s  t jddd |dkrd}nd}tddd	 d
}d
}i }| D ]E}t|tr)|d }| d}|D ]0}	t|	}
|t	|
7 }|
D ] }t
t	|	|}| D ]}||vrXd
||< ||  d7  < qNqAq2qtdtt	| dd	 tdt| dd	 i }d
}t| dd ddD ]\}}|tkr n	|d7 }|||< qtdtt	| dd	 t|d&}tj||d}| D ]\}}|t|t|g qW d   ||fS 1 sw   Y  ||fS )z
    Take corpus, split it into sentences, and extract word frequencies.
    Write frequencies to `filepath` as a tsv. Only write the first
    MAX_SENTENCEPIECE_SENTENCES most common words to the file.
    punktz./nltk)download_dirr"  	,zcompute corpus frequency
Tflushr   r$   
r   z"length of freqs before truncating zfile path for freq c                 S   s   | d S )Nr   r   )xr   r   r   <lambda>  s    z!get_corpus_freq.<locals>.<lambda>)keyreversez!length of freqs after trancating w)	delimiterN)nltkdownloadprintr   dictr   r   nltk_tokenizesent_tokenizer4   maxr   sortedr   MAX_SENTENCEPIECE_SENTENCESopencsvwriterwriterow)datasetfilepathfiletyper/  total_sentence_countmaxlenfreqsentrylinesline	sentencessentencewordfreqs_sortedcountercountfr;  kvr   r   r   get_corpus_freq  s\   




rO  c                       s   e Zd ZdZ					d fdd	Zdd Zed	d
 Zedd Ze	dd Z
dd Zdd Zd ddZd ddZdd Zdd Zdd Zdd Z  ZS )!SentencePieceTokenizerz3Trains and uses sentencepiece for text tokenizationbpeNr	   c                    sx   || _ | | _|| _|| _t| j }|r)|d ur | jd us"J | || j g | _i | _	| 
  tt|   d S r"   )r   lowerr   	spm_modelr   rP  r  r  r   r   load_spm_modelr  r)   )r(   r   r   r   r   r   r   
make_trainr  r   r   r)     s   
zSentencePieceTokenizer.__init__c                 C   r   r"   r   r5   r   r   r   r6     r   zSentencePieceTokenizer.__len__c                 C   r   r"   r   r5   r   r   r   r     r   zSentencePieceTokenizer.tokensc                 C   r   r"   r   r5   r   r   r   r     r   zSentencePieceTokenizer.vocabc                 C   s@   | d u rdS t j|  }|r| dst j| d  }| S )NF.model)ospathr  endswith)r   dner   r   r   r    s   zSentencePieceTokenizer.existsc                    s   t j js jds jd  _t  _ j j t	 j  _
 _ fddt j
D  _dd t jD  _dS )z(load sentencepiece model and parse vocabrV  c                    r  r   r  r   r5   r   r   rE     r   z9SentencePieceTokenizer.load_spm_model.<locals>.<listcomp>c                 S   r   r   r   r  r   r   r   r     r   z9SentencePieceTokenizer.load_spm_model.<locals>.<dictcomp>N)rW  rX  r  rS  rY  spmSentencePieceProcessorspLoadr4   r   r   r  r   r   r   r5   r   r5   r   rT    s   

z%SentencePieceTokenizer.load_spm_modelc           	      C   s   || _ | j}ttdd}|du r|}|dr"|d|d }|d | }t||\}}t|t	}t
d|dd t
d	dd d
}|j|||| j| jt|d}t
d| dd tj| t| |d | _t
d| j dd dS )z:train sentencepiece model on corpus using word frequenciesr   iNrV  z.tsv.z'line count used as input_sentence_size Tr'  ztraining sentencepiece modelz--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type} --character_coverage={character_coverage} --input_sentence_size={input_sentence_size} --input_format=tsv)	file_pathmodel_prefixr   r   r   input_sentence_sizez*calling spm.SentencePieceTrainer.Train(%s)zsentencepiece model written to )r   rS  r   randomrandintrY  rfindrO  minr8  r2  rQ   r   r   intr[  SentencePieceTrainerr  rW  remove)	r(   r   r   use_model_pathrandom_hash
input_path
line_count
maxlenlinetrain_stringr   r   r   r    s>   



zSentencePieceTokenizer.Trainc                 C   s,   |}|dur
||}| j |}t|||S )z!convert text to sentencepiece IdsN)r]  r   r!   r  r   r   r   r   &  s
   z"SentencePieceTokenizer.EncodeAsIdsc                 C   0   |}|dur
||}| j |}t|||ddS )z$convert text to sentencepiece tokensNFr  )r]  r   r!   r  r   r   r   r   .  
   z%SentencePieceTokenizer.EncodeAsTokensc                 C      | j |S z convert Id to sentencpiece token)r]  	IdToPiecer  r   r   r   r   6     z SentencePieceTokenizer.IdToTokenc                 C   rq  z convert sentencpiece token to Id)r]  	PieceToIdr  r   r   r   r   :  rt  z SentencePieceTokenizer.TokenToIdc                 C      t |tr|j}| j|S )zconverts ids to a text string)r   r!   r#   r]  r   r	  r   r   r   r   >     
z SentencePieceTokenizer.DecodeIdsc                 C   rw  )z.converts sentencepiece tokens to a text string)r   r!   r#   r]  r   r
  r   r   r   r   D  rx  z#SentencePieceTokenizer.DecodeTokens)rQ  NNNr	   r"   )rI   rJ   rK   rL   r)   r6   r   r   r   r  r  rT  r  r   r   r   r   r   r   r!  r   r   r  r   rP    s.    




"
rP  c                   @   s`   e Zd ZdZ						dddZdd Zdd	d
ZdddZdddZdddZ	dddZ
dS )r   z
    Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization
    in BERT training. Default to bert-large-uncased tokenizer.
    NFr   c           
   
   K   s  |t vrd}tj rtj dkrtd|d| d|v p d|v  }tj|||d| _tj r7tj dkr<td| t	d	| j_
d
| _t| jj| _| jd | _d| _tdd| jjd tdd| jjd tdd| jjd tdd| jjd tdd| jjd tdd| jjd g| _|r| jtdd| jtdd| jd g |  jd7  _|  jd7  _|r| jtdd| jtdd | jd g |  jd7  _|  jd7  _|r| jtd!d"| jg |  jd7  _|  jd7  _|dkr/td|D ]2}	| jtd|	 d#|	 d$| jtd|	 d%|	 d&| jd g |  jd7  _|  jd7  _qd'd( | jD | _d)d( | jD | _d*d( | jD | _td+d,dtd-d.dg| _d/d( | jD | _d0d( | jD | _d1d( | jD | _t| jj | _d2d( | jj  D | _!t| j| _"d3d( | jj  D | _#t| j | _$d4d( | j  D | _%t| j | _&d5d( | j  D | _'d S )6Nzbert-large-uncasedr   z loading BertWordPieceTokenizer (z) from cache_dir z-casedchinese)do_lower_case	cache_dirloaded   mBrm   rk   re   rb   z[PAD]rl   [CLS]rn   [MASK]rf   [UNK]rh   [SEP]rc   sop<|startofpiece|>eop<|endofpiece|>r   gMASK[gMASK]sMASK[sMASK]dBLOCK[dBLOCK]z[MASK]z<|startofpiecez|>c                 S   r   r   r   rR   r   r   r   r     r   z3BertWordPieceTokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   rG   rR   r   r   r   r     r   c                 S   r   r   rA   rR   r   r   r   r     r   rt   <str0>ru   <str1>c                 S   r   r   r   rR   r   r   r   r     r   c                 S   r   r   rG   rR   r   r   r   r     r   c                 S   r   r   rA   rR   r   r   r   r     r   c                 S      i | ]\}}||qS r   r   rC   rM  rN  r   r   r   r     r   c                 S   r  r   r   r  r   r   r   r     r   c                 S   r   r   r   r   r   r   r   r     r   c                 S   r   r   r   r   r   r   r   r     r   )(r   torchdistributedis_initializedget_rankr2  r   from_pretrainedr   rf  max_lenr   r4   r   r   r   r   r8   r   r>   r  r   r   r   r9   r   r   r   r   rH   r   r   r   r   r   r   r   r   r   r   )
r(   tokenizer_model_typer{  add_block_symbolsadd_sentinel_tokenadd_task_maskadd_decoder_maskr   rz  r   r   r   r   r)   Q  s   	



	



zBertWordPieceTokenizer.__init__c                 C   s   | j |}| j |}|S r"   )r   r   convert_tokens_to_ids)r(   r$   r   idsr   r   r   r     s   zBertWordPieceTokenizer._encodec                 C   ro  )zconvert wordpiece token to IdNFr  )r   r   r!   r  r   r   r   r     rp  z%BertWordPieceTokenizer.EncodeAsTokensc                 C   F   t |ttfr
|jS |r| j| jS || jv r| j| jS | jj| S rr  )r   r9   r8   r.   r   r   r   ids_to_tokensr   r   r   r   r     s   
z BertWordPieceTokenizer.IdToTokenc                 C   0   t |ttfr
|jS |r| j| jS | jj| S ru  )r   r9   r8   r-   r   r   r   r   r   r   r   r     s
   z BertWordPieceTokenizer.TokenToIdc                    s   |rd  fdd|D S t|tr|j}g }|D ] }| jv r+| j| j q| jjv r:| jj|  qg }|D ]}|	drYt
|dkrY|d  |dd 7  < q?|| q?d |S )	z@converts ids to wordpiece tokens and joins them as a text stringr   c                 3   r   r"   r   r   r5   r   r   r     r   z3BertWordPieceTokenizer.DecodeIds.<locals>.<genexpr>z##r   re   N)r   r   r!   r#   r   r=   r.   r   r  
startswithr4   )r(   r   r   r   r-   
new_tokensr.   r   r5   r   r     s&   


z BertWordPieceTokenizer.DecodeIdsc                 C   s2   |rd dd |D S t|tr|j}d |S )z*converts wordpiece tokens to a text stringr   c                 s   r   r"   r   r   r   r   r   r     r   z6BertWordPieceTokenizer.DecodeTokens.<locals>.<genexpr>)r   r   r!   r#   r(   r   r   r   r   r   r     s   


z#BertWordPieceTokenizer.DecodeTokens)NNFr   FFr"   r   )rI   rJ   rK   rL   r)   r   r   r   r   r   r   r   r   r   r   r   K  s    
j




r   c                   @   sj   e Zd Z				dddZdddZdd Zdd	d
Zdd ZdddZdddZ	dddZ
dddZdS )r   NFc           	      K   s  t j||d| _td| j_t| jj| _d| _|	drd| _
| jd | _tdd| jjd	 td
d| jjd	 tdd| jjd	 tdd| jjd tdd| jjd ddtdd| jjd g| _|r| jtdd| jtdd| jd g |  jd7  _|  j
d7  _
ndd| _
| jd | _tdd| jjd td
d| jjd g| _|r| jtdd| jtdd| jd tdd| jd tdd| jd ddtdd| jd tdd| jd g |  jd7  _|  j
d7  _
|r3|r| jtdd | jddtd!d"| jd ddg |  jd7  _|  j
d7  _
|r3| jtd#d$| jg |  jd7  _|  j
d7  _
d%d& | jD | _d'd& | jD | _d(d& | jD | _td)d*d+td,d-dg| _d.d& | jD | _d/d& | jD | _d0d& | jD | _t| jj | _d1d& | jj D | _t| j| _d2d& | jj D | _t| j | _d3d& | j D | _t| j | _ d4d& | j D | _!| j D ]\}}|j"| jj#|< qd S )5N)r{  r}  re   robertarm   rg   rb   <|endoftext|>z</s>rc   rh   r  rl   r~  z<s>rn   r  z<mask>Tr[   rf   r  z<unk>r  r  r  r  r   ri   rk   r  r  r  r  r  r  c                 S   r   r   r   rR   r   r   r   r   I  r   z-GPT2BPETokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   rG   rR   r   r   r   r   J  r   c                 S   r   r   rA   rR   r   r   r   r   N  r   rt   r  r   ru   r  c                 S   r   r   r   rR   r   r   r   r   T  r   c                 S   r   r   rG   rR   r   r   r   r   U  r   c                 S   r   r   rA   rR   r   r   r   r   V  r   c                 S   r  r   r   r  r   r   r   r   Y  r   c                 S   r  r   r   r  r   r   r   r   \  r   c                 S   r   r   r   r   r   r   r   r   b  r   c                 S   r   r   r   r   r   r   r   r   h  r   )$r   r  r   rf  r  r4   encoderr   r   r  r   r   r8   r   r>   r   r   r   r9   r   r   r   r   rH   r   r   r   r   r   r   r   r   r   r   r.   decoder)	r(   model_type_or_pathr{  r  r  r  r   r;   rS   r   r   r   r)     s   








	



zGPT2BPETokenizer.__init__c                    s\   |}|d ur
||}dt fdd fdd} j}|||}t|||}| j |S )Nr   c                 S   r   r7   r   r   r   r   r   r   r  r   z4GPT2BPETokenizer.EncodeAsIds.<locals>.split_on_tokenc                    r   )Nc                 3   s4    | ]}| j vr j|n j| jgV  qd S r"   )r   r   r   r   r-   r   r5   r   r   r     s    

zHGPT2BPETokenizer.EncodeAsIds.<locals>.split_on_tokens.<locals>.<genexpr>r   r   r   r   r   r     r   z5GPT2BPETokenizer.EncodeAsIds.<locals>.split_on_tokensr   r   r   r   r   r   m  s   
zGPT2BPETokenizer.EncodeAsIdsc                 C   rq  r"   r   r   r   r   r   r   r     r   zGPT2BPETokenizer._encodec                    s   |}|d ur
||}g }t  jj|D ]"}d fdd|dD }|dd  j|dD  qt	|||dd}|
 j |S )	Nr  c                 3   s    | ]	} j j| V  qd S r"   )r   bye_encoder)rC   br5   r   r   r     s    z2GPT2BPETokenizer.EncodeAsTokens.<locals>.<genexpr>zutf-8c                 s   s    | ]}|V  qd S r"   r   )rC   	bpe_tokenr   r   r   r     s
    
r   Fr  )refindallr   patr   r   r>   rQ  r   r!   r*   r   )r(   r$   r   r   r   r.   r#   r   r5   r   r     s   

zGPT2BPETokenizer.EncodeAsTokensc                    s    fdd|D S )Nc                    r  r   r  )rC   r*  r5   r   r   rE     r   z3GPT2BPETokenizer.DecodeAsTokens.<locals>.<listcomp>r   r	  r   r5   r   DecodeAsTokens  s   zGPT2BPETokenizer.DecodeAsTokensc                 C   r  r"   )r   r9   r8   r.   r   r   r   r  r   r   r   r   r     s   
zGPT2BPETokenizer.IdToTokenc                 C   r  r"   )r   r9   r8   r-   r   r   r  r   r   r   r   r     
   zGPT2BPETokenizer.TokenToIdc                    s8   |rd  fdd|D S t|tr|j} j|S )Nr   c                 3   r   r"   r   r   r5   r   r   r     r   z-GPT2BPETokenizer.DecodeIds.<locals>.<genexpr>r   r   r!   r#   r   decode)r(   r   r   r   r5   r   r     s   
zGPT2BPETokenizer.DecodeIdsc                    B   |rd dd |D S t|tr|j} j fdd|D S )Nr   c                 s   r   r"   r   r   r   r   r   r     r   z0GPT2BPETokenizer.DecodeTokens.<locals>.<genexpr>c                    r  r   r  rR   r5   r   r   rE     r   z1GPT2BPETokenizer.DecodeTokens.<locals>.<listcomp>r  r  r   r5   r   r        

zGPT2BPETokenizer.DecodeTokens)NFFFr"   r   )rI   rJ   rK   r)   r   r   r   r  r   r   r   r   r   r   r   r   r     s    

qB


	
	r   c                   @   sV   e Zd Z			dddZdd ZdddZdd	d
ZdddZdddZdddZ	dS )r   Fc              
   K   sV  t || _d| _| jj | _| j| _d| _t	dd| jt	dd| jt	dd| jd t	d	d
| jd t	dd| jd ddt	dd| jd g| _
|  jd7  _|  jd7  _|r| j
t	dd| jd t	dd| jd g |  jd7  _|  jd7  _|r| j
t	dd| jddt	dd| jd ddg |  jd7  _|  jd7  _|r| j
t	dd| jg |  jd7  _|  jd7  _dd  | j
D | _d!d  | j
D | _d"d  | j
D | _td#d$dtd%d&dg| _d'd  | jD | _d(d  | jD | _d)d  | jD | _t| j | _d*d  | j D | _t| j | _d+d  | j D | _d S ),Nr   re   rb   r  rc   rh   r  r   rl   r~  rn   r  rg   Tr  rf   r  ri   rk   rm   r  r  r  r  r  r  r  r  r  r  c                 S   r   r   r   rR   r   r   r   r     r   z/ChineseSPTokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   rG   rR   r   r   r   r     r   c                 S   r   r   rA   rR   r   r   r   r     r   rt   r  ru   r  c                 S   r   r   r   rR   r   r   r   r   !  r   c                 S   r   r   rG   rR   r   r   r   r   "  r   c                 S   r   r   rA   rR   r   r   r   r   #  r   c                 S   r   r   r   r   r   r   r   r   ,  r   c                 S   r   r   r   r   r   r   r   r   2  r   )r   r  r   r   r]  r   r   r   r   r8   r   r>   r   r   r   r9   r   r   r   r   rH   r   r   r   r   r   r   )r(   r   r  r  r  r   r   r   r   r)     sr   	


zChineseSPTokenizer.__init__c                 C   s   | j |}|S r"   r  )r(   r$   r  r   r   r   r   4  s   zChineseSPTokenizer._encodeNc                 C   s@   |}|d ur
||}| j |}t|||dd}|| j |S )NFr  )r   r   r!   r*   r   )r(   r$   r   r   r   r#   r   r   r   r   8  s   z!ChineseSPTokenizer.EncodeAsTokensc                 C   s`   t |ttfr
|jS |r| j| jS || jv r| j| jS || jv r(| j| jS | jt|S r"   )	r   r9   r8   r.   r   r   r   convert_id_to_tokenrf  r   r   r   r   r   B  s   

zChineseSPTokenizer.IdToTokenc                 C   s0   t |ttfr
|jS |r| j| jS | j|S r"   )r   r9   r8   r-   r   r   convert_token_to_idr   r   r   r   r   N  r  zChineseSPTokenizer.TokenToIdc           	         s   |rd  fdd|D S t|tr|j}ttt|}g }d}t|D ]\}}| jv r@|	|||  |	| |d }q%|	||d   d}|D ]}t|tr^| j| j
7 }qN|rh| j|7 }qN|S )Nr   c                 3   r   r"   r   r   r5   r   r   r   W  r   z/ChineseSPTokenizer.DecodeIds.<locals>.<genexpr>r   r   r  )r   r   r!   r#   rH   maprf  r   r   r=   r.   r   r  )	r(   r   r   pieceslastr   token_idr$   piecer   r5   r   r   U  s0   



zChineseSPTokenizer.DecodeIdsc                    r  )Nr   c                 s   r   r"   r   r   r   r   r   r   o  r   z2ChineseSPTokenizer.DecodeTokens.<locals>.<genexpr>c                    r  r   r  rR   r5   r   r   rE   t  r   z3ChineseSPTokenizer.DecodeTokens.<locals>.<listcomp>r  r  r   r5   r   r   m  r  zChineseSPTokenizer.DecodeTokens)FFFr"   r   )
rI   rJ   rK   r)   r   r   r   r   r   r   r   r   r   r   r     s    
K




r   )NNNr   r	   NN)r"  )+rL   r:  r   rW  rb  collectionsr   r0  regexr  sentencepiecer[  r  r   r4  r  r   tokenization_gpt2r   	wordpiecer   r   r    objectr!   rU   r^   rY   r8   r   rq   rp   r9   r   r   r   r  r8  rO  rP  r   r   r   r   r   r   r   <module>   sb   
"b


  GF
6 0 n