o
    ॵi`                     @   s$  d dl mZmZmZmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZdd ZG dd deZ	 eeZdd Zd	d
 ZG dd deZG dd deZG dd deZdd Zdd Zdd Z	 zd dlmZ W n ey|   dd ZY nw e dd Zdd ZG dd deZ dS )     )absolute_importdivisionprint_functionunicode_literalsNc              	   C   s8   ddddddddd	}|  D ]
\}}| ||} q| S )
N-'zn'tz'mz don'tz'sz'vez're)z - z ' z n'tz 'mz do notz 'sz 'vez 're)itemsreplace)string
replace_mpkv r   `/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/nlp/space/tokenizer.pyclean_string   s   
r   c                   @   s>   e Zd Zg dfddZdd Zdd Zdd	 Zg fd
dZdS )	TokenizerBertc                    sj  | _ |dkrsddd _|D ]}| jvr&|dvr&dt j d j|< qdd	  j D  _ fd
d|D }d _  jt fdd|D 7  _t| jd _ jD ]}| jj	v siJ d| dqYt jj	 _
d S |dkrddi _dd	  j D  _ fdd|D }tj|d}tj|d}t|||d _t| _t j _
d S t)Nr   z	[unused0]z	[unused1])z[BOS]z[EOS])[PAD][UNK]z[unused]c                 S      i | ]\}}||qS r   r   .0r   r   r   r   r   
<dictcomp>.       z&Tokenizer.__init__.<locals>.<dictcomp>c                       g | ]	} j ||qS r   spec_convert_dictgetr   tokselfr   r   
<listcomp>2   s    z&Tokenizer.__init__.<locals>.<listcomp>r   z[SEP]r   z[CLS]z[MASK]c                 3   s    | ]
}| j vr|V  qd S Nspecial_tokensr   xr!   r   r   	<genexpr>7   s    
z%Tokenizer.__init__.<locals>.<genexpr>)never_splitzspecial token 'z' is not in the vocabularyGPT2r   z<unk>c                 S   r   r   r   r   r   r   r   r   A   r   c                    s   g | ]	}| j vr|qS r   )r   r   r!   r   r   r#   E   s
    
z
vocab.jsonz
merges.txtr&   )tokenizer_typer   lenr   spec_revert_dictr'   tupleBertTokenizer
_tokenizervocab
vocab_sizeospathjoinGPT2Tokenizernum_specials
ValueError)r"   
vocab_pathr'   r-   tokenr    
vocab_filemerges_filer   r!   r   __init__!   sT   
 



zTokenizer.__init__c                 C   s   | j |S r%   )r2   tokenizer"   textr   r   r   r@   R   s   zTokenizer.tokenizec                    s`    j dkr fdd|D } j|}|S  fdd|D } j|} fdd|D }|S )Nr   c                    r   r   r   r   r!   r   r   r#   W       z3Tokenizer.convert_tokens_to_ids.<locals>.<listcomp>c                    r   r   r   r   r!   r   r   r#   [   rC   c                    s   g | ]
}| j   j qS r   r9   r4   r   ir!   r   r   r#   ]       )r-   r2   convert_tokens_to_ids)r"   tokensidsr   r!   r   rH   U   s   
zTokenizer.convert_tokens_to_idsc                    s`    j dkr j|} fdd|D }|S  fdd|D } j|} fdd|D }|S )Nr   c                    r   r   r/   r   r   r!   r   r   r#   c   rC   z3Tokenizer.convert_ids_to_tokens.<locals>.<listcomp>c                    s   g | ]
}| j   j qS r   rD   rE   r!   r   r   r#   f   rG   c                    r   r   rK   r   r!   r   r   r#   h   rC   )r-   r2   convert_ids_to_tokens)r"   rJ   rI   r   r!   r   rL   `   s   
zTokenizer.convert_ids_to_tokensc                    s    |}t dkrt   fdd|D }jdkr'd|dd}nd|}tfdd|D d	}t|}|S )
Nr   c                    s   g | ]}| vr|qS r   r   r   )ignore_tokensr   r   r#   o       z$Tokenizer.decode.<locals>.<listcomp>r    z ## c                    s   g | ]} j j| qS r   )r2   byte_decoderr   cr!   r   r   r#   t       utf-8)	rL   r.   setr-   r7   r	   	bytearraydecoder   )r"   rJ   rM   rI   r
   r   )rM   r"   r   rX   k   s   


zTokenizer.decodeN)__name__
__module____qualname__r?   r@   rH   rL   rX   r   r   r   r   r      s    1r   c                 C   sj   t  }d}t| ddd}	 | }|sn| }|||< |d7 }qW d   |S 1 s.w   Y  |S )z*Loads a vocabulary file into a dictionary.r   rrU   encodingT   N)collectionsOrderedDictopenreadlinestrip)r=   r3   indexreaderr<   r   r   r   
load_vocab   s    
rg   c                 C   s   |   } | sg S |  }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)rd   split)rB   rI   r   r   r   whitespace_tokenize   s
   ri   c                   @   s:   e Zd ZdZ				dddZdd Zd	d
 Zdd ZdS )r1   z?Runs end-to-end tokenization: punctuation splitting + wordpieceTNr$   c                 C   s   t j|std|t|| _tdd | j	 D | _
|| _|r,t||d| _t| jd| _|dur<|| _dS td| _dS )a  Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_lower_case: Whether to lower case the input
                         Only has an effect when do_wordpiece_only=False
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_wordpiece_only=False
        zCan't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`c                 S   s   g | ]\}}||fqS r   r   )r   r    rJ   r   r   r   r#      rT   z*BertTokenizer.__init__.<locals>.<listcomp>do_lower_caser+   )r3   N   mB)r5   r6   isfiler:   formatrg   r3   r`   ra   r   ids_to_tokensdo_basic_tokenizeBasicTokenizerbasic_tokenizerWordpieceTokenizerwordpiece_tokenizerintmax_len)r"   r=   rk   rv   rp   r+   r   r   r   r?      s    


 zBertTokenizer.__init__c                 C   sL   g }| j r| j|D ]}| j|D ]}|| qq|S | j|}|S r%   )rp   rr   r@   rt   append)r"   rB   split_tokensr<   	sub_tokenr   r   r   r@      s   zBertTokenizer.tokenizec                 C   sH   g }|D ]
}| | j|  qt|| jkr"tdt|| j |S )z7Converts a sequence of tokens into ids using the vocab.zToken indices sequence length is longer than the specified maximum  sequence length for this BERT model ({} > {}). Running this sequence through BERT will result in indexing errors)rw   r3   r.   rv   loggerwarningrn   )r"   rI   rJ   r<   r   r   r   rH      s   
z#BertTokenizer.convert_tokens_to_idsc                 C   s"   g }|D ]
}| | j|  q|S )z?Converts a sequence of ids in wordpiece tokens using the vocab.)rw   ro   )r"   rJ   rI   rF   r   r   r   rL      s   z#BertTokenizer.convert_ids_to_tokens)TNTr$   )rY   rZ   r[   __doc__r?   r@   rH   rL   r   r   r   r   r1      s    
$
r1   c                   @   sN   e Zd ZdZ		dddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dS )rq   zDRuns basic tokenization (punctuation splitting, lower casing, etc.).Tr$   c                 C   s   || _ || _dS )znConstructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        Nrj   )r"   rk   r+   r   r   r   r?      s   
zBasicTokenizer.__init__c                 C   sn   |  |}| |}t|}g }|D ]}| jr%|| jvr%| }| |}|| | qtd	|}|S )zTokenizes a piece of text.rO   )
_clean_text_tokenize_chinese_charsri   rk   r+   lower_run_strip_accentsextend_run_split_on_puncr7   )r"   rB   orig_tokensrx   r<   output_tokensr   r   r   r@      s   


zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]}t |}|dkrq
|| q
d|S )z$Strips accents from a piece of text.NFDMnrP   )unicodedata	normalizecategoryrw   r7   )r"   rB   outputcharcatr   r   r   r     s   

z!BasicTokenizer._run_strip_accentsc                 C   s   || j v r|gS t|}d}d}g }|t|k rC|| }t|r)||g d}n|r0|g  d}|d | |d7 }|t|k sdd |D S )z&Splits punctuation on a piece of text.r   TFr_   c                 S   s   g | ]}d  |qS )rP   )r7   r(   r   r   r   r#   -      z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)r+   listr.   _is_punctuationrw   )r"   rB   charsrF   start_new_wordr   r   r   r   r   r     s$   

z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ] }t |}| |r|d || |d q|| qd|S )z)Adds whitespace around any CJK character.rO   rP   )ord_is_chinese_charrw   r7   r"   rB   r   r   cpr   r   r   r~   /  s   



z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dko|dk}|p|dko|dk}|p|dko|dk}|p%|dko%|dk}|p/|d	ko/|d
k}|p9|dko9|dk}|pC|dkoC|dk}|pM|dkoM|dk}|rRdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )r"   r   tmpr   r   r   r   <  s   
zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]"}t |}|dks|dkst|rqt|r!|d q|| qd|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  rO   rP   )r   _is_control_is_whitespacerw   r7   r   r   r   r   r}   S  s   
zBasicTokenizer._clean_textN)Tr$   )rY   rZ   r[   r|   r?   r@   r   r   r~   r   r}   r   r   r   r   rq      s    
rq   c                   @   s"   e Zd ZdZd	ddZdd ZdS )
rs   zRuns WordPiece tokenization.r   d   c                 C   s   || _ || _|| _d S r%   )r3   	unk_tokenmax_input_chars_per_word)r"   r3   r   r   r   r   r   r?   d  s   
zWordpieceTokenizer.__init__c                 C   s   g }t |D ]m}t|}t|| jkr|| j qd}d}g }|t|k ret|}d}	||k rQd||| }
|dkrAd|
 }
|
| jv rI|
}	n|d8 }||k s0|	du rXd}n||	 |}|t|k s&|rn|| j q|| q|S )a  Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          >>> input = "unaffable"
          >>> output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        Fr   NrP   z##r_   T)	ri   r   r.   r   rw   r   r7   r3   r   )r"   rB   r   r<   r   is_badstart
sub_tokensend
cur_substrsubstrr   r   r   r@   i  s>   

zWordpieceTokenizer.tokenizeN)r   r   )rY   rZ   r[   r|   r?   r@   r   r   r   r   rs   a  s    
rs   c                 C   s>   | dks| dks| dks| dkrdS t | }|dkrdS dS )z1Checks whether `chars` is a whitespace character.rO   	
TZsF)r   r   r   r   r   r   r   r     s    
r   c                 C   s8   | dks| dks| dkrdS t | }|drdS dS )z.Checks whether `chars` is a control character.r   r   r   FCT)r   r   
startswithr   r   r   r   r     s   

r   c                 C   sx   t | }|dko|dk}|p|dko|dk}|p|dko|dk}|p)|dko)|dk}|r.d	S t| }|d
r:d	S dS )z2Checks whether `chars` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)r   r   r   r   )r   r   r   r   r   r   r   r     s   

r   )	lru_cachec                   C   s   dd S )Nc                 S   s   | S r%   r   )funcr   r   r   <lambda>  s    zlru_cache.<locals>.<lambda>r   r   r   r   r   r     s   r   c                     s   t jd dkr	tnt tttdtdd tttdtdd  tttdtd	d  } | d
d
 }d}tdD ]}|| vrT| | |d|  |d7 }q> fdd|D }tt	| |S )a9  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    r      !~r_      ¡   ¬   ®   ÿN   c                    s   g | ]} |qS r   r   )r   n_chrr   r   r#     s    z$bytes_to_unicode.<locals>.<listcomp>)
sysversion_infounichrchrr   ranger   rw   dictzip)bscsr   br   r   r   bytes_to_unicode  s.   





r   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r_   N)rV   add)wordpairs	prev_charr   r   r   r   	get_pairs  s   r   c                   @   sb   e Zd ZdZ			dddZdd Zdd	 Zd
d Zdd Zdd Z	dddZ
dd Zdd ZdS )r8   zF
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    r	   Nc                 C   s   |d ur|nt d| _tt|dd| _dd | j D | _|| _t	 | _
dd | j
 D | _t|dd ddd	 }d
d |D }tt|tt|| _i | _td| _i | _i | _| | d S )Nrl   rU   r]   c                 S   r   r   r   r   r   r   r   r     r   z*GPT2Tokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   r   r   r   r   r   r     r   r   r_   r   c                 S   s   g | ]}t | qS r   )r0   rh   )r   merger   r   r   r#     rN   z*GPT2Tokenizer.__init__.<locals>.<listcomp>zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)ru   rv   jsonloadrb   encoderr   decodererrorsr   byte_encoderrQ   readrh   r   r   r   r.   	bpe_rankscacherecompilepatr'   special_tokens_decoderset_special_tokens)r"   r=   r>   r   r'   rv   bpe_data
bpe_mergesr   r   r   r?     s    zGPT2Tokenizer.__init__c                 C   s   t | jt | j S r%   )r.   r   r'   r!   r   r   r   __len__)  s   zGPT2Tokenizer.__len__c                    s\   |s
i  _ i  _dS t fddt|D  _ dd  j  D  _td j  dS )z Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        Nc                 3   s&    | ]\}}|t  j| fV  qd S r%   )r.   r   )r   rF   r    r!   r   r   r*   5  s    z3GPT2Tokenizer.set_special_tokens.<locals>.<genexpr>c                 S   r   r   r   r   r   r   r   r   7  r   z4GPT2Tokenizer.set_special_tokens.<locals>.<dictcomp>zSpecial tokens {})r'   r   r   	enumerater   rz   inforn   )r"   r'   r   r!   r   r   ,  s   
z GPT2Tokenizer.set_special_tokensc           
         sX  | j v r
 j | S t|}t|}|s|S 	 t| fddd}| jvr'ny|\}}g }d}|t|k rz|||}	||||	  |	}W n tyZ   |||d   Y n4w || |kr}|t|d k r}||d  |kr}|	||  |d7 }n|	||  |d7 }|t|k s5t|}|}t|dkrnt|}qd
|}| j |< |S )	NTc                    s    j | tdS )Ninf)r   r   float)pairr!   r   r   r   H  s    z#GPT2Tokenizer.bpe.<locals>.<lambda>)keyr   r_   r   rO   )r   r0   r   minr   r.   re   r   	Exceptionrw   r7   )
r"   r<   r   r   bigramfirstsecondnew_wordrF   jr   r!   r   bpe=  sV   





zGPT2Tokenizer.bpec                    s^   g }t  j|D ]#}d fdd|D }|dkrq	|dd  |dD  q	|S )z Tokenize a string. rP   c                 3   s,    | ]}t | jv r jt | V  qd S r%   )r   r   )r   r   r!   r   r   r*   l  s    z)GPT2Tokenizer.tokenize.<locals>.<genexpr>c                 s   s    | ]}|V  qd S r%   r   )r   	bpe_tokenr   r   r   r*   p  s    
rO   )r   findallr   r7   r   r   rh   )r"   rB   
bpe_tokensr<   r   r!   r   r@   h  s   

zGPT2Tokenizer.tokenizec                 C   s   g }t |t}tjd dkot |t}|s|r(|| jv r!| j| S | j|dS |D ]}|| jv r:|| j|  q*|| j|d q*t	|| j
krXtdt	|| j
 |S )z9 Converts a sequence of tokens into ids using the vocab. r   r   zToken indices sequence length is longer than the specified maximum  sequence length for this OpenAI GPT model ({} > {}). Running this sequence through the model will result in indexing errors)
isinstancestrr   r   unicoder'   r   r   rw   r.   rv   rz   r{   rn   )r"   rI   rJ   python_version_3python_version_2r<   r   r   r   rH   t  s$   



z#GPT2Tokenizer.convert_tokens_to_idsFc                 C   sB   g }|D ]}|| j v r|s|| j |  q|| j|  q|S )z9Converts a sequence of ids in BPE tokens using the vocab.)r   rw   r   )r"   rJ   skip_special_tokensrI   rF   r   r   r   rL     s   
z#GPT2Tokenizer.convert_ids_to_tokensc                 C   s   |  | |S r%   )rH   r@   rA   r   r   r   encode  s   zGPT2Tokenizer.encodec                    s>   d  fdd|D }t fdd|D jd jd}|S )NrP   c                       g | ]} j | qS r   )r   )r   r<   r!   r   r   r#     r   z(GPT2Tokenizer.decode.<locals>.<listcomp>c                    r   r   )rQ   rR   r!   r   r   r#     r   rU   )r   )r7   rW   rX   r   )r"   rI   rB   r   r!   r   rX     s
   zGPT2Tokenizer.decode)r	   NN)F)rY   rZ   r[   r|   r?   r   r   r   r@   rH   rL   r   rX   r   r   r   r   r8   	  s    
+
r8   )!
__future__r   r   r   r   r`   loggingr5   r   r   r   regexr   r   objectr   	getLoggerrY   rz   rg   ri   r1   rq   rs   r   r   r   	functoolsr   ImportErrorr   r   r8   r   r   r   r   <module>   s<   i
	Fv< 
