o
    ̳iC                     @   s   d dl mZmZmZmZmZmZ d dlZd dl	m
Z
 dZG dd de
Zdeeef fdd	Zd
eedf deeeef  fddZdedefddZdedeeeef  fddZdS )    )AnyDictListMappingSetTupleN)BaseTokenizerz</w>c                	   @   s   e Zd ZdZddededefddZd	ed
ee fddZ	dee d
efddZ
	ddeeef ded
eeef fddZded
efddZdS )CLIPTokenizerar  
    Text tokenizer for CLIP.

    Based on the official implementation here:
    https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py

    Args:
        path (str): the path to the CLIP merges file
        max_seq_len (int): the context length (all CLIP models use 77)
        truncate (bool): whether to truncate the text when longer than max_seq_len
    M   Tpathmax_seq_lentruncatec                 C   s   || _ || _t | _dd | j D | _t|}t| j }|	dd |D  |	dd |D  |	ddg dd t
|D | _d	d | j D | _d
d t
|D | _tdtj| _| jd | _| jd | _| j| _ddd| _d S )Nc                 S      i | ]\}}||qS  r   .0kvr   r   T/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/clip/_tokenizer.py
<dictcomp>!       z*CLIPTokenizer.__init__.<locals>.<dictcomp>c                 S   s   g | ]}|t  qS r   )WORD_BOUNDARY)r   r   r   r   r   
<listcomp>&       z*CLIPTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}d  |qS ) )join)r   merger   r   r   r   '   r   <|startoftext|><|endoftext|>c                 S   r   r   r   )r   iwordr   r   r   r   *   r   c                 S   r   r   r   r   r   r   r   r   +   r   c                 S   r   r   r   )r   r   r   r   r   r   r   ,   r   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   r   )r   r   _bytes_to_unicodebyte_encoderitemsbyte_decoder_load_mergeslistvaluesextend	enumerateencoderdecoder	bpe_ranksrecompile
IGNORECASEpat	sot_token	eot_token	pad_tokencache)selfr   r   r   mergesvocabr   r   r   __init__   s,   zCLIPTokenizer.__init__textreturnc                    s   t | } jg}t j|D ],}d fdd|dD }| fdd 	|
dD  t| jkr= nq| j t| jkr^ jsRJ d|d j } j|d	< |S )
z
        Given a string, return the encoded list of token ids.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: The encoded list of token ids.
        r   c                 3       | ]} j | V  qd S N)r"   )r   br5   r   r   	<genexpr>J   s    z'CLIPTokenizer.encode.<locals>.<genexpr>utf-8c                 3   r;   r<   )r*   )r   	bpe_tokenr>   r   r   r?   K   s    

 zWTokenized text is larger than the maximum sequence length but truncate is set to False.N)_clean_textlowerr1   r-   findallr0   r   encoder(   _bpesplitlenr   appendr2   r   )r5   r9   tokenstokenr   r>   r   rG   <   s$   

zCLIPTokenizer.encoderL   c                    s@   d  fdd|D }t fdd|D jdddtdS )	z
        Given a list of token ids, return the decoded text, optionally including special tokens.

        Args:
            tokens (List[int]): The list of token ids to decode.

        Returns:
            str: The decoded text.
        r   c                       g | ]} j | qS r   )r+   )r   rM   r>   r   r   r   f   r   z(CLIPTokenizer.decode.<locals>.<listcomp>c                    rN   r   )r$   )r   cr>   r   r   r   h   r   r@   replace)errorsrB   )r   	bytearraydecoderP   r   )r5   rL   r9   r   r>   r   rS   \   s   
zCLIPTokenizer.decodeFsample	inferencec                 C   s   | d}| ||d< |S )a]  
        Tokenize the "text" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "text" field containing a string to tokenize
            inference (bool): Unused by this tokenizer

        Returns:
            Mapping[str, Any]: The sample with added "tokens" field and the "messages" field removed.
        r9   rL   )poprG   )r5   rT   rU   r9   r   r   r   __call__m   s   
zCLIPTokenizer.__call__rM   c           
         sv  | j v r
 j | S t|dk r|t S t|dd |d t f }t|}	 t| fddd}| jvr8nw|\}}g }d}|t|k rz|||}	||||	  |	}W n t	yk   |||d  Y n4w || |kr|t|d	 k r||d	  |kr|
||  |d7 }n|
||  |d	7 }|t|k sFt|}t|d	krnt|}q(d
|}| j |< |S )z@
        Performs byte-pair encoding on a single token.
           NrC   Tc                    s    j | tdS )Ninf)r,   getfloat)pairr>   r   r   <lambda>   s    z$CLIPTokenizer._bpe.<locals>.<lambda>)keyr      rB   )r4   rJ   r   tuple
_get_pairsminr,   indexr(   
ValueErrorrK   r   )
r5   rM   r    pairsbigramfirstsecondnew_wordr   jr   r>   r   rH   ~   sH   


,

&
zCLIPTokenizer._bpeN)r
   T)F)__name__
__module____qualname____doc__strintboolr8   r   rG   rS   r   r   rW   rH   r   r   r   r   r	      s      


r	   r:   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )zQ
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    !~r_      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr)r   nr   r   r   r      r   z%_bytes_to_unicode.<locals>.<listcomp>)r&   rangeordrK   dictzip)bscsrz   r=   r   r   r   r!      s    
r!   r    .c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )z
    Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r_   N)setadd)r    re   	prev_charcharr   r   r   ra      s   ra   r9   c                 C   s   |  ddS )zI
    Minimal version of CLIP's text cleaning via the `ftfy` package.
    u   ’')rP   )r9   r   r   r   rD      s   rD   r   c                 C   sz   g }t | dd+}t|D ]\}}| }|dkr|ds |s!q|t|  qW d    |S 1 s6w   Y  |S )Nr@   )encodingr   z	#version:)openr)   strip
startswithrK   r`   rI   )r   r6   fr   liner   r   r   r%      s   
r%   )typingr   r   r   r   r   r   regexr-   .torchtune.modules.transforms.tokenizers._utilsr   r   r	   rp   ro   r!   ra   rD   r%   r   r   r   r   <module>   s     -&"