o
    ίi-                     @   s`  d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
mZmZmZ ddlZddlZddlZdejd< dZe dd	 Ze d
d Zdd Zdd Zdd Zdd Zdd Zdd ZdefddZddddZG dd deZ 	 d-d!eeee f d"e!d#e!d$e!d%e
d&e"fd'd(Z#d!eeee f d"e!d#e!d$e!d%e
f
d)d*Z$defd+d,Z%dS ).zp CLIP tokenizer

Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
    N)	lru_cachepartial)CallableListOptionalUnionfalseTOKENIZERS_PARALLELISMM   c                   C   s   t jt jt jtdS )Nzbpe_simple_vocab_16e6.txt.gz)ospathjoindirnameabspath__file__ r   r   Q/home/ubuntu/.local/lib/python3.10/site-packages/core/vision_encoder/tokenizer.pydefault_bpe   s   r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a:  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr).0nr   r   r   
<listcomp>6       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s    
r*   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charcharr   r   r   	get_pairs:   s   r1   c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptextr   r   r   basic_cleanF   s   
r:   c                 C   s   t dd| } |  } | S )N\s+ )resubr7   r8   r   r   r   whitespace_cleanL   s   r?   c                 C      t t| S r2   )canonicalize_textr:   xr   r   r   _clean_canonicalizeR      rD   c                 C   s   t t|  S r2   )r?   r:   lowerrB   r   r   r   _clean_lowerW   s   rG   c                 C   r@   r2   )r?   r:   rB   r   r   r   _clean_whitespace\   rE   rH   typec                 C   s4   | dkrt S | dkrtS | dkrtS J d|  d)NcanonicalizerF   
whitespaceFzInvalid clean function (z).)rD   rG   rH   rI   r   r   r   get_clean_fna   s   rM   )keep_punctuation_exact_stringc                C   s`   |  dd} |r|dd | |D } n| tddtj} |  } t	
dd| } |  S )a  Returns canonicalized `text` (lowercase and punctuation removed).

    From: https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94

    Args:
      text: string to be canonicalized.
      keep_punctuation_exact_string: If provided, then this exact string kept.
        For example providing '{}' will keep any occurrences of '{}' (but will
        still remove '{' and '}' that appear separately).
    _r<   c                 s   s&    | ]}| td d tjV  qdS ) N)	translatestr	maketransstringpunctuation)r   partr   r   r   	<genexpr>y   s
    
z$canonicalize_text.<locals>.<genexpr>rP   r;   )replacer   splitrQ   rR   rS   rT   rU   rF   r=   r>   r7   )r9   rN   r   r   r   rA   l   s   

rA   c                   @   s   e Zd Ze deddfdedeee  dee dedef
d	d
Z	dd Z
dd Zdd Z	ddeeee f dee dejfddZdS )SimpleTokenizerNrF   rP   bpe_pathadditional_special_tokenscontext_lengthcleanreduction_maskc                    s  t   _dd  j D  _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;ddg}	|rP|	|7 }	||	 tt|tt| _dd  j D  _tt|tt| _dd |	D  _d|	}
t|
d tj _t j _ fdd|	D  _ jd  _ jd  _| _t| _ |rt!| _"d S d  _"d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r   )tuplerY   )r   merger   r   r   r      s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   rc   r   r   r   r      r    rP   z<start_of_text>z<end_of_text>c                 S   r`   r   r   ra   r   r   r   rd      re   c                 S   s   i | ]}||qS r   r   r   tr   r   r   rd          |z:|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+c                       g | ]} j | qS r   encoderrk   selfr   r   r      re   r   )#r*   byte_encoderitemsbyte_decodergzipopenreaddecoderY   r!   valuesr$   r   extendr%   r&   r"   lenrq   decoder	bpe_rankscacher=   compile
IGNORECASEpat
vocab_sizeall_special_idssot_token_ideot_token_idr]   rM   clean_fnget_reduction_mask_fnreduction_fn)rs   r[   r\   r]   r^   r_   mergesvocabri   special_tokensspecialr   rr   r   __init__   s@   



zSimpleTokenizer.__init__c           
         sj  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4nu|\}}g }d}|t|k rz|||}	||||	  |	}W n   |||d   Y n3|| |kr|t|d k r||d  |kr|||  |d	7 }n|||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d
	|}| j |< |S )Nrj   Tc                    s    j | tdS )Ninf)r   getfloat)pairrr   r   r   <lambda>   rm   z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      r<   )
r   rh   r1   minr   r}   indexr|   r$   r   )
rs   tokenr-   r.   bigramfirstsecondnew_wordijr   rr   r   bpe   sH   


,


zSimpleTokenizer.bpec                    sh   g }  |}t j|D ]#}d fdd|dD }| fdd |dD  q|S )NrP   c                 3       | ]} j | V  qd S r2   )rt   )r   r)   rr   r   r   rW      s    z)SimpleTokenizer.encode.<locals>.<genexpr>rf   c                 3   r   r2   rp   )r   	bpe_tokenrr   r   r   rW      s    

r<   )	r   r=   findallr   r   encoder|   r   rY   )rs   r9   
bpe_tokensr   r   rr   r   r      s   

zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
NrP   c                    ro   r   )r~   )r   r   rr   r   r   r      re   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    ro   r   )rv   )r   crr   r   r   r      re   rf   rX   )errorsrj   r<   )r   	bytearrayrz   rX   )rs   tokensr9   r   rr   r   rz      s   zSimpleTokenizer.decodetextsreturnc                    s   t |tr|g}|p j}|sJ d jdur% j|| j j jdS  fdd|D }tjt	||tj
d}t|D ]"\}}t	||krR|d| } j|d< t|||dt	|f< q=|S )a  Returns the tokenized representation of given input string(s)

        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all CLIP models use 77 as the context length

        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
        z!Please set a valid context lengthN)r]   r   r   	encode_fnc                    s&   g | ]} j g |  jg qS r   )r   r   r   r   r9   rr   r   r   r   	  s    z,SimpleTokenizer.__call__.<locals>.<listcomp>dtyper   )
isinstancerR   r]   r   r   r   r   torchzerosr}   long	enumeratetensor)rs   r   r]   
all_tokensresultr   r   r   rr   r   __call__   s,   




zSimpleTokenizer.__call__r2   )__name__
__module____qualname__r   DEFAULT_CONTEXT_LENGTHrR   r   r   intr   r   r   rz   r   r   
LongTensorr   r   r   r   r   rZ      s8    

()

rZ   Fr   r]   r   r   r   shufflec                    s    fdd| D }t jt||t jd}t|D ]H\}}	t |	}	t|	}
|
|d krH|d }t t|	}|d | }|sB| }|	| }	|}
|||df< |	||d|
d f< ||||
d f< q|S )Nc                       g | ]} |qS r   r   r   r   r   r   r      r    z(random_mask_tokenize.<locals>.<listcomp>r   r   r   r   )r   r   r}   r   r   r   randpermmsort)r   r]   r   r   r   r   r   r   r   r   
num_tokensnum_keepindicesr   r   r   random_mask_tokenize  s"   
r   c                    s    fdd| D }t jt||t jd}t|D ]7\}}t|}	|	|d kr:|d }
td|	|
 }||||
  }|g| |g }t |||d t|f< q|S )Nc                    r   r   r   r   r   r   r   r   <  r    z(simple_mask_tokenize.<locals>.<listcomp>r   r   r   )r   r   r}   r   r   randomrandintr   )r   r]   r   r   r   r   r   r   r   r   r   start_indexr   r   r   simple_mask_tokenize5  s   r   c                 C   s<   | dv sJ | dkrt S | dkrtS | dkrttddS dS )zNChoose strategy for dropping (masking) tokens to achieve target context length)simpler   r   r   r   r   T)r   N)r   r   r   rL   r   r   r   r   L  s   r   )F)&__doc__rw   r5   r   r   rT   	functoolsr   r   typingr   r   r   r   r3   regexr=   r   environr   r   r*   r1   r:   r?   rD   rG   rH   rR   rM   rA   objectrZ   r   boolr   r   r   r   r   r   r   <module>   sj    


 

