o
    ߥiF                     @   s   d Z ddlZddlZddlZddlmZ ddlZddlZddl	Z	e dd Z
e dd Zdd	 Zd
d Zdd ZG dd deZdddZdS )z CLIP Tokenizer.    N)	lru_cachec                   C   s   t jt jt jtdS )Nzbpe_simple_vocab_16e6.txt.gz)ospathjoindirnameabspath__file__ r	   r	   `/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/gemm/tokenizer.pydefault_bpe   s   r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a9  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r	   )chr).0nr	   r	   r
   
<listcomp>1       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br	   r	   r
   bytes_to_unicode   s,   





r"   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charcharr	   r	   r
   	get_pairs5   s   r)   c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptextr	   r	   r
   basic_cleanA   s   
r2   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr/   r0   r	   r	   r
   whitespace_cleanG   s   r6   c                   @   s8   e Zd Ze fdefddZdd Zdd Zdd	 Zd
S )SimpleTokenizerbpe_pathc                 C   s   t  | _dd | j D | _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _d S )Nc                 S      i | ]\}}||qS r	   r	   r   kvr	   r	   r
   
<dictcomp>Q       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r	   )tuplesplit)r   merger	   r	   r
   r   T   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r	   )r   r<   r	   r	   r
   r   V   r    <|startoftext|><|endoftext|>c                 S   r9   r	   r	   r:   r	   r	   r
   r=   [   r>   )rF   rG   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r"   byte_encoderitemsbyte_decodergzipopenreaddecoderB   r   valuesr   r   extendr   r   r   lenencoderdecoder	bpe_rankscacher4   compile
IGNORECASEpat)selfr8   mergesvocabrC   r	   r	   r
   __init__O   s(   
zSimpleTokenizer.__init__c              
      s  | j v r
 j | S t|d d |d d f }t|}|s#|d S g }	 t| fddd}| jvr6n|\}}g }d}	|	t|k rz|||	}
|||	|
  |
}	W n  tyw } z|	| |||	d   W Y d }~n8d }~ww ||	 |kr|	t|d k r||	d  |kr|	||  |	d	7 }	n|	||	  |	d7 }	|	t|k sDt|}|}t|dkrnt|}q&t|d
krt
|d  d|}| j |< |S )NrD   Tc                    s    j | tdS )Ninf)rT   getfloat)pairrY   r	   r
   <lambda>q   s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      d   r3   )rU   rA   r)   minrT   rQ   indexrP   	Exceptionr   printr   )rY   tokenr%   r&   
error_listbigramfirstsecondnew_wordijerrr	   rb   r
   bpee   s`   






zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )NrE   c                 3       | ]} j | V  qd S r*   )rH   )r   r!   rb   r	   r
   	<genexpr>       z)SimpleTokenizer.encode.<locals>.<genexpr>r?   c                 3   ru   r*   )rR   )r   	bpe_tokenrb   r	   r
   rv      rw   r3   )r6   r2   lowerr4   findallrX   r   encoderP   rt   rB   )rY   r1   
bpe_tokensrk   r	   rb   r
   r{      s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
NrE   c                       g | ]} j | qS r	   )rS   )r   rk   rb   r	   r
   r      r>   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    r}   r	   )rJ   )r   crb   r	   r
   r      r>   r?   replace)errorsrD   r3   )r   	bytearrayrN   r   )rY   tokensr1   r	   rb   r
   rN      s   zSimpleTokenizer.decodeN)	__name__
__module____qualname__r   strr\   rt   r{   rN   r	   r	   r	   r
   r7   M   s
    /
r7   M   Tc                    s   t |tr|g}jd jd   fdd|D }tjt||tjd}t|D ]0\}}t||krO|rC|d| } |d< ntd||  d	| t	|||dt|f< q,|S )
a  
    Returns the tokenized representation of given input string(s)
    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length
    truncate: bool
        Whether to truncate the text in case its encoding is longer than the context length
    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
    We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
    rF   rG   c                    s"   g | ]}g |  g qS r	   )r{   )r   r1   	eot_token	sot_token	tokenizerr	   r
   r      s    z!clip_tokenize.<locals>.<listcomp>)dtypeNr]   zInput z  is too long for context length )

isinstancer   rR   torchzerosrQ   int	enumerateRuntimeErrortensor)r   textscontext_lengthtruncate
all_tokensresultrq   r   r	   r   r
   clip_tokenize   s$   



r   )r   T)__doc__rK   r-   r   	functoolsr   r+   regexr4   r   r   r"   r)   r2   r6   objectr7   r   r	   r	   r	   r
   <module>   s"   

X