o
    ϯi<                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	Z	ddl
ZddlZe dd Ze dd Zd	d
 Zdd Zdd ZG dd deZe Zddeeee f dedejfddZdS )zp CLIP tokenizer

Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
    N)	lru_cache)UnionListc                   C   s   t jt jt jtdS )Nzbpe_simple_vocab_16e6.txt.gz)ospathjoindirnameabspath__file__ r   r   T/home/ubuntu/.local/lib/python3.10/site-packages/laion_clap/clap_module/tokenizer.pydefault_bpe   s   r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a9  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr).0nr   r   r   
<listcomp>(       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s   N
r$   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charcharr   r   r   	get_pairs,   s   r+   c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptextr   r   r   basic_clean8   s   
r4   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr1   r2   r   r   r   whitespace_clean>   s   r8   c                   @   s:   e Zd Ze dfdefddZdd Zdd Zd	d
 ZdS )SimpleTokenizerNbpe_pathc                    sH  t   _dd  j D  _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|sMddg}nddg| }|| tt|tt| _dd  j D  _tt|tt| _dd |D  _d|}t|d tj _t j _ fdd|D  _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>G       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r   J   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r>   r   r   r   r   L   r    <start_of_text><end_of_text>c                 S   r;   r   r   r<   r   r   r   r?   U   r@   c                 S   s   i | ]}||qS r   r   r   tr   r   r   r?   W       |z:|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+c                       g | ]} j | qS r   encoderrJ   selfr   r   r   \   r@   )r$   byte_encoderitemsbyte_decodergzipopenreaddecoderD   r   valuesr   r   extendr   r    r   lenrP   decoder	bpe_rankscacher6   compile
IGNORECASEpat
vocab_sizeall_special_ids)rR   r:   special_tokensmergesvocabrE   specialr   rQ   r   __init__E   s*   


zSimpleTokenizer.__init__c           
         sj  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4nu|\}}g }d}|t|k rz|||}	||||	  |	}W n   |||d   Y n3|| |kr|t|d k r||d  |kr|||  |d	7 }n|||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d
	|}| j |< |S )NrF   Tc                    s    j | tdS )Ninf)r^   getfloat)pairrQ   r   r   <lambda>h   rL   z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      r5   )
r_   rC   r+   minr^   r\   indexr[   r   r   )
rR   tokenr'   r(   bigramfirstsecondnew_wordijr   rQ   r   bpe^   sH   


,


zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )NrG   c                 3       | ]} j | V  qd S r,   )rS   )r   r#   rQ   r   r   	<genexpr>       z)SimpleTokenizer.encode.<locals>.<genexpr>rA   c                 3   r|   r,   rO   )r   	bpe_tokenrQ   r   r   r}      r~   r5   )r8   r4   lowerr6   findallrb   r   encoder[   r{   rD   )rR   r3   
bpe_tokensrt   r   rQ   r   r      s   &zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
NrG   c                    rN   r   )r]   )r   rt   rQ   r   r   r      r@   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    rN   r   )rU   )r   crQ   r   r   r      r@   rA   replace)errorsrF   r5   )r   	bytearrayrY   r   )rR   tokensr3   r   rQ   r   rY      s   (zSimpleTokenizer.decode)	__name__
__module____qualname__r   strri   r{   r   rY   r   r   r   r   r9   D   s
    )r9   M   textscontext_lengthreturnc                    s   t | tr| g} tjd tjd   fdd| D }tjt||tjd}t|D ]\}}t||kr;|d| }t	|||dt|f< q+|S )a  
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    rH   rI   c                    s"   g | ]}gt |  g qS r   )
_tokenizerr   )r   r3   	eot_token	sot_tokenr   r   r      s   " ztokenize.<locals>.<listcomp>)dtypeN)

isinstancer   r   rP   torchzerosr\   long	enumeratetensor)r   r   
all_tokensresultry   r   r   r   r   tokenize   s   


r   )r   )__doc__rV   r/   r   	functoolsr   typingr   r   r-   regexr6   r   r   r$   r+   r4   r8   objectr9   r   r   int
LongTensorr   r   r   r   r   <module>   s&    

Q*