o
    ߥi,                     @   s   d Z ddlmZmZmZ ddlZddlZddlZdd Zdd Z	dd	 Z
d
d Zdd ZG dd deZG dd deZG dd deZdd Zdd Zdd ZdS )zTokenization classes.    )absolute_importdivisionprint_functionNc                 C   s|   t jrt| tr
| S t| tr| ddS tdt|  t jr:t| tr+| ddS t| t	r2| S tdt|  td)zGConverts `text` to Unicode (if it's not already), assuming utf-8 input.utf-8ignoreUnsupported string type: %s#Not running on Python2 or Python 3?)
sixPY3
isinstancestrbytesdecode
ValueErrortypePY2unicodetext r   e/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/diffusion/tokenizer.pyconvert_to_unicode   s   



r   c                 C   sz   t jrt| tr
| S t| tr| ddS tdt|  t jr9t| tr'| S t| t	r1| 
dS tdt|  td)zAReturns text encoded in a way suitable for print or `tf.logging`.r   r   r   r   )r	   r
   r   r   r   r   r   r   r   r   encoder   r   r   r   printable_text+   s   




r   c                 C   sn   t  }d}t| ddd }	 t| }|sn| }|||< |d7 }qW d   |S 1 s0w   Y  |S )z*Loads a vocabulary file into a dictionary.r   rr   )encodingT   N)collectionsOrderedDictopenr   readlinestrip)
vocab_filevocabindexreadertokenr   r   r   
load_vocabB   s    
r'   c                 C   s    g }|D ]	}| | |  q|S )z7Converts a sequence of tokens into ids using the vocab.)append)r#   tokensidsr&   r   r   r   convert_tokens_to_idsQ   s   r+   c                 C   s   |   } | sg S |  }|S )z@Runs basic whitespace cleaning and splitting on a peice of text.)r!   split)r   r)   r   r   r   whitespace_tokenizeY   s
   r-   c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )FullTokenizerzRuns end-to-end tokenziation.Tc                 C   s>   t || _dd | j D | _t|d| _t| jd| _d S )Nc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>g       z*FullTokenizer.__init__.<locals>.<dictcomp>do_lower_case)r#   )r'   r#   items	inv_vocabBasicTokenizerbasic_tokenizerWordpieceTokenizerwordpiece_tokenizer)selfr"   r5   r   r   r   __init__e   s   
zFullTokenizer.__init__c                 C   s6   g }| j |D ]}| j|D ]}|| qq|S N)r9   tokenizer;   r(   )r<   r   split_tokensr&   	sub_tokenr   r   r   r?   k   s   zFullTokenizer.tokenizec                 C   s   t | j|S r>   )r+   r#   )r<   r)   r   r   r   r+   s   s   z#FullTokenizer.convert_tokens_to_idsc                    s    fdd|D S )Nc                    s   g | ]} j | qS r   )r7   )r/   ir<   r   r   
<listcomp>w   r3   z7FullTokenizer.convert_ids_to_tokens.<locals>.<listcomp>r   )r<   r*   r   rC   r   convert_ids_to_tokensv   s   z#FullTokenizer.convert_ids_to_tokensNT)__name__
__module____qualname____doc__r=   r?   r+   rE   r   r   r   r   r.   b   s    
r.   c                   @   sJ   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )r8   zDRuns basic tokenization (punctuation splitting, lower casing, etc.).Tc                 C   s
   || _ dS )znConstructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        Nr4   )r<   r5   r   r   r   r=   }   s   
zBasicTokenizer.__init__c                 C   sl   t |}| |}| |}t|}g }|D ]}| jr$| }| |}|| | qtd	|}|S )zTokenizes a piece of text. )
r   _clean_text_tokenize_chinese_charsr-   r5   lower_run_strip_accentsextend_run_split_on_puncjoin)r<   r   orig_tokensr@   r&   output_tokensr   r   r   r?      s   


zBasicTokenizer.tokenizec                 C   sB   t d|}g }|D ]}t |}|dkrq
|| q
d|S )z$Strips accents from a piece of text.NFDMn )unicodedata	normalizecategoryr(   rR   )r<   r   outputcharcatr   r   r   rO      s   

z!BasicTokenizer._run_strip_accentsc                 C   s   t |}d}d}g }|t|k r;|| }t|r!||g d}n|r(|g  d}|d | |d7 }|t|k sdd |D S )z&Splits punctuation on a piece of text.r   TFr   c                 S   s   g | ]}d  |qS )rW   )rR   )r/   xr   r   r   rD      r3   z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)listlen_is_punctuationr(   )r<   r   charsrB   start_new_wordr[   r\   r   r   r   rQ      s    
z!BasicTokenizer._run_split_on_puncc                 C   sT   g }|D ] }t |}| |r|d || |d q|| qd|S )z)Adds whitespace around any CJK character.rK   rW   )ord_is_chinese_charr(   rR   r<   r   r[   r\   cpr   r   r   rM      s   



z&BasicTokenizer._tokenize_chinese_charsc                 C   s   |dkr|dks@|dkr|dks@|dkr|dks@|dkr |dks@|d	kr(|d
ks@|dkr0|dks@|dkr8|dks@|dkrB|dkrBdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )r<   rh   r   r   r   rf      s    
zBasicTokenizer._is_chinese_charc                 C   sX   g }|D ]"}t |}|dks|dkst|rqt|r!|d q|| qd|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  rK   rW   )re   _is_control_is_whitespacer(   rR   rg   r   r   r   rL      s   
zBasicTokenizer._clean_textNrF   )rG   rH   rI   rJ   r=   r?   rO   rQ   rM   rf   rL   r   r   r   r   r8   z   s    
r8   c                   @   s"   e Zd ZdZd	ddZdd ZdS )
r:   zRuns WordPiece tokenization.[UNK]d   c                 C   s   || _ || _|| _d S r>   )r#   	unk_tokenmax_input_chars_per_word)r<   r#   rm   rn   r   r   r   r=      s   
zWordpieceTokenizer.__init__c                 C   s   t |}g }t|D ]m}t|}t|| jkr|| j q
d}d}g }|t|k rit|}d}	||k rUd||| }
|dkrEd|
 }
|
| jv rM|
}	n|d8 }||k s4|	du r\d}n||	 |}|t|k s*|rr|| j q
|	| q
|S )a  Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          >>> input = "unaffable"
          >>> output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer.

        Returns:
          A list of wordpiece tokens.
        Fr   NrW   z##r   T)
r   r-   r`   ra   rn   r(   rm   rR   r#   rP   )r<   r   rT   r&   rc   is_badstart
sub_tokensend
cur_substrsubstrr   r   r   r?      s@   

zWordpieceTokenizer.tokenizeN)rk   rl   )rG   rH   rI   rJ   r=   r?   r   r   r   r   r:      s    
r:   c                 C   s>   | dks| dks| dks| dkrdS t | }|dkrdS dS )z1Checks whether `chars` is a whitespace character.rK   	
TZsF)rX   rZ   r\   r]   r   r   r   rj   (  s    
rj   c                 C   s8   | dks| dks| dkrdS t | }|drdS dS )z.Checks whether `chars` is a control character.ru   rv   rw   FCT)rX   rZ   
startswithry   r   r   r   ri   4  s   

ri   c                 C   sh   t | }|dkr|dks$|dkr|dks$|dkr|dks$|dkr&|dkr&d	S t| }|d
r2d	S dS )z2Checks whether `chars` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)re   rX   rZ   r{   )r\   rh   r]   r   r   r   rb   @  s     

rb   )rJ   
__future__r   r   r   r   rX   r	   r   r   r'   r+   r-   objectr.   r8   r:   rj   ri   rb   r   r   r   r   <module>   s    	p>