o
    ߥi8                     @   s   d Z ddlmZmZmZ ddlZddlZddlZddlZddl	Z	dd Z
dd Zdd	 Zd
d Zdd Zdd Zdd Zdd ZG dd deZG dd deZG dd deZdd Zdd Zdd ZdS ) zTokenization classes.    )absolute_importdivisionprint_functionNc           
      C   s   |sdS t d|}|du rdS |d}g d}g d}d}||v r-| s-d}d}d	}d
}	||v r;| r;d}d
}d}d}	|rHtd|||||	f dS )zHChecks whether the casing config is consistent with the checkpoint name.Nz$^.*?([A-Za-z0-9_-]+)/bert_model.ckpt   )zuncased_L-24_H-1024_A-16zuncased_L-12_H-768_A-12zmultilingual_L-12_H-768_A-12zchinese_L-12_H-768_A-12)zcased_L-12_H-768_A-12zcased_L-24_H-1024_A-16zmulti_cased_L-12_H-768_A-12FTFalse
lowercasedTruecaseda  You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. However, `%s` seems to be a %s model, so you should pass in `--do_lower_case=%s` so that the fine-tuning matches how the model was pre-training. If this error is wrong, please just comment out this check.)rematchgroup
ValueError)
do_lower_caseinit_checkpointm
model_namelower_modelscased_modelsis_bad_configactual_flag	case_nameopposite_flag r   e/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/clip/bert_tokenizer.py validate_case_matches_checkpoint   s8   
r   c                 C   s|   t jrt| tr
| S t| tr| ddS tdt|  t jr:t| tr+| ddS t| t	r2| S tdt|  td)zGConverts `text` to Unicode (if it's not already), assuming utf-8 input.utf-8ignoreUnsupported string type: %s#Not running on Python2 or Python 3?)
sixPY3
isinstancestrbytesdecoder   typePY2unicodetextr   r   r   convert_to_unicodeL   s   



r*   c                 C   sz   t jrt| tr
| S t| tr| ddS tdt|  t jr9t| tr'| S t| t	r1| 
dS tdt|  td)zAReturns text encoded in a way suitable for print or `tf.logging`.r   r   r   r   )r   r    r!   r"   r#   r$   r   r%   r&   r'   encoder(   r   r   r   printable_text`   s   




r,   c                 C   sn   t  }d}t| ddd }	 t| }|sn| }|||< |d7 }qW d   |S 1 s0w   Y  |S )z*Loads a vocabulary file into a dictionary.r   rr   )encodingTr   N)collectionsOrderedDictopenr*   readlinestrip)
vocab_filevocabindexreadertokenr   r   r   
load_vocabw   s    
r9   c                 C   s    g }|D ]	}| | |  q|S )z4Converts a sequence of [tokens|ids] using the vocab.)append)r5   itemsoutputitemr   r   r   convert_by_vocab   s   r>   c                 C   
   t | |S Nr>   )r5   tokensr   r   r   convert_tokens_to_ids      
rC   c                 C   r?   r@   rA   )	inv_vocabidsr   r   r   convert_ids_to_tokens   rD   rG   c                 C   s   |   } | sg S |  }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)r3   split)r)   rB   r   r   r   whitespace_tokenize   s
   rI   c                   @   sH   e Zd ZdZdddZdd Zdd Zd	d
 ZedddZ	dd Z
dS )FullTokenizerzRuns end-to-end tokenization.Tc                 C   s>   t || _dd | j D | _t|d| _t| jd| _d S )Nc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>       z*FullTokenizer.__init__.<locals>.<dictcomp>r   )r5   )r9   r5   r;   rE   BasicTokenizerbasic_tokenizerWordpieceTokenizerwordpiece_tokenizer)selfr4   r   r   r   r   __init__   s   
zFullTokenizer.__init__c                 C   s6   g }| j |D ]}| j|D ]}|| qq|S r@   )rR   tokenizerT   r:   )rU   r)   split_tokensr8   	sub_tokenr   r   r   rW      s   zFullTokenizer.tokenizec                 C      t | j|S r@   )r>   r5   )rU   rB   r   r   r   rC         z#FullTokenizer.convert_tokens_to_idsc                 C   rZ   r@   )r>   rE   )rU   rF   r   r   r   rG      r[   z#FullTokenizer.convert_ids_to_tokensc                 C   s2   dd }d | dd }|r||}|S |S )z< Converts a sequence of tokens (string) in a single string. c                 S   sX   |  dd dd dd dd d	d
 dd dd dd dd dd} | S )z Clean up a list of simple English tokenization artifacts
            like spaces before punctuations and abbreviated forms.
            z ..z ??z !!z ,,z ' 'z n'tzn'tz 'mz'mz 'sz'sz 'vez'vez 'rez're)replace)
out_stringr   r   r   clean_up_tokenization   s   
zEFullTokenizer.convert_tokens_to_string.<locals>.clean_up_tokenization z ## )joinra   r3   )rB   clean_up_tokenization_spacesrc   r)   
clean_textr   r   r   convert_tokens_to_string   s   z&FullTokenizer.convert_tokens_to_stringc                 C   s
   t | jS r@   )lenr5   )rU   r   r   r   
vocab_size   rD   zFullTokenizer.vocab_sizeNT)__name__
__module____qualname____doc__rV   rW   rC   rG   staticmethodri   rk   r   r   r   r   rJ      s    
rJ   c                   @   sJ   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )rQ   zDRuns basic tokenization (punctuation splitting, lower casing, etc.).Tc                 C   s
   || _ dS )znConstructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        NrP   )rU   r   r   r   r   rV      s   
zBasicTokenizer.__init__c                 C   sl   t |}| |}| |}t|}g }|D ]}| jr$| }| |}|| | qtd	|}|S )zTokenizes a piece of text.rd   )
r*   _clean_text_tokenize_chinese_charsrI   r   lower_run_strip_accentsextend_run_split_on_puncrf   )rU   r)   orig_tokensrX   r8   output_tokensr   r   r   rW      s   


zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]}t |}|dkrq
|| q
d|S )z$Strips accents from a piece of text.NFDMnre   )unicodedata	normalizecategoryr:   rf   )rU   r)   r<   charcatr   r   r   ru      s   

z!BasicTokenizer._run_strip_accentsc                 C   s   t |}d}d}g }|t|k r;|| }t|r!||g d}n|r(|g  d}|d | |d7 }|t|k sdd |D S )z&Splits punctuation on a piece of text.r   TFr   c                 S   s   g | ]}d  |qS )re   )rf   )rK   xr   r   r   
<listcomp>  rO   z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)listrj   _is_punctuationr:   )rU   r)   charsistart_new_wordr<   r   r   r   r   rw      s    
z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ] }t |}| |r|d || |d q|| qd|S )z)Adds whitespace around any CJK character.rd   re   )ord_is_chinese_charr:   rf   rU   r)   r<   r   cpr   r   r   rs     s   



z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dkr|dks@|dkr|dks@|dkr|dks@|dkr |dks@|d	kr(|d
ks@|dkr0|dks@|dkr8|dks@|dkrB|dkrBdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )rU   r   r   r   r   r      s    
zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]"}t |}|dks|dkst|rqt|r!|d q|| qd|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  rd   re   )r   _is_control_is_whitespacer:   rf   r   r   r   r   rr   5  s   
zBasicTokenizer._clean_textNrl   )rm   rn   ro   rp   rV   rW   ru   rw   rs   r   rr   r   r   r   r   rQ      s    
rQ   c                   @   s"   e Zd ZdZd	ddZdd ZdS )
rS   zRuns WordPiece tokenization.[UNK]   c                 C   s   || _ || _|| _d S r@   )r5   	unk_tokenmax_input_chars_per_word)rU   r5   r   r   r   r   r   rV   F  s   
zWordpieceTokenizer.__init__c                 C   s   t |}g }t|D ]m}t|}t|| jkr|| j q
d}d}g }|t|k rit|}d}	||k rUd||| }
|dkrEd|
 }
|
| jv rM|
}	n|d8 }||k s4|	du r\d}n||	 |}|t|k s*|rr|| j q
|	| q
|S )a  Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer.

        Returns:
          A list of wordpiece tokens.
        Fr   Nre   z##r   T)
r*   rI   r   rj   r   r:   r   rf   r5   rv   )rU   r)   ry   r8   r   is_badstart
sub_tokensend
cur_substrsubstrr   r   r   rW   K  s@   

zWordpieceTokenizer.tokenizeN)r   r   )rm   rn   ro   rp   rV   rW   r   r   r   r   rS   C  s    
rS   c                 C   s>   | dks| dks| dks| dkrdS t | }|dkrdS dS )z1Checks whether `chars` is a whitespace character.rd   	
TZsFr|   r~   r   r   r   r   r   r     s    
r   c                 C   s6   | dks| dks| dkrdS t | }|dv rdS dS )z.Checks whether `chars` is a control character.r   r   r   F)CcCfTr   r   r   r   r   r     s   
r   c                 C   sh   t | }|dkr|dks$|dkr|dks$|dkr|dks$|dkr&|dkr&d	S t| }|d
r2d	S dS )z2Checks whether `chars` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)r   r|   r~   
startswith)r   r   r   r   r   r   r     s     

r   )rp   
__future__r   r   r   r/   osr
   r|   r   r   r*   r,   r9   r>   rC   rG   rI   objectrJ   rQ   rS   r   r   r   r   r   r   r   <module>   s*   3	2r>