o
    ߥiD                     @   s   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZd dl	m
Z
mZ dgZe dd Ze dd Zd	d
 Zdd Zdd ZG dd deZG dd deZdS )    N)	lru_cache)BertWordPieceTokenizerCharBPETokenizerCLIPTokenizerc                  C   s2   t jt} d| dd d } t j| dS )N/zbpe_simple_vocab_16e6.txt.gz)ospathrealpath__file__joinsplit)root r   o/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/data/tokenizers.pydefault_bpe   s   r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a9  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr).0nr   r   r   
<listcomp>.       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s,   





r(   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charcharr   r   r   	get_pairs2   s   r/   c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptextr   r   r   basic_clean>   s   
r8   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr5   r6   r   r   r   whitespace_cleanD   s   r<   c                   @   s8   e Zd Ze fdefddZdd Zdd Zdd	 Zd
S )SimpleTokenizerbpe_pathc                 C   s   t  | _dd | j D | _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>N       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r   )tupler   )r   merger   r   r   r   Q   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   rB   r   r   r   r   S   r    <|startoftext|><|endoftext|>c                 S   r?   r   r   r@   r   r   r   rC   X   rD   )rK   rL   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r(   byte_encoderitemsbyte_decodergzipopenreaddecoder   r   valuesr"   r   extendr#   r$   r    lenencoderdecoder	bpe_rankscacher:   compile
IGNORECASEpat)selfr>   mergesvocabrH   r   r   r   __init__L   s(   
zSimpleTokenizer.__init__c              
      s  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4n|\}}g }d}|t|k rz|||}	||||	  |	}W n tyt }
 z|||d   t	|
 W Y d }
~
n8d }
~
ww || |kr|t|d k r||d  |kr|
||  |d	7 }n|
||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d
|}| j |< |S )Nr   rI   Tc                    s    j | tdS )Ninf)rY   getfloat)pairr^   r   r   <lambda>m   s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      r9   )rZ   rG   r/   minrY   rV   indexrU   	Exceptionprintr"   r   )r^   tokenr+   r,   bigramfirstsecondnew_wordijer   rf   r   bpeb   sZ   





zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )NrJ   c                 3       | ]} j | V  qd S r0   )rM   )r   r'   rf   r   r   	<genexpr>       z)SimpleTokenizer.encode.<locals>.<genexpr>rE   c                 3   rw   r0   )rW   )r   	bpe_tokenrf   r   r   rx      ry   r9   )r<   r8   lowerr:   findallr]   r   encoderU   rv   r   )r^   r7   
bpe_tokensrn   r   rf   r   r}      s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
NrJ   c                       g | ]} j | qS r   )rX   )r   rn   rf   r   r   r      rD   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    r   r   )rO   )r   crf   r   r   r      rD   rE   replace)errorsrI   r9   )r   	bytearrayrS   r   )r^   tokensr7   r   rf   r   rS      s   zSimpleTokenizer.decodeN)	__name__
__module____qualname__r   strra   rv   r}   rS   r   r   r   r   r=   J   s
    ,
r=   c                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
r   M   c                 C   sB   || _ tt d| _| jjd | _| jjd | _t| jj| _d S )N)r>   rK   rL   )	lengthr=   r   	tokenizerrW   	sos_token	eos_tokenrV   
vocab_size)r^   r   r   r   r   ra      s
   zCLIPTokenizer.__init__c                    sN   t |trt |S t |trt fdd|D S tdt| )Nc                    s   g | ]}  |qS r   )
_tokenizer)r   urf   r   r   r      rD   z*CLIPTokenizer.__call__.<locals>.<listcomp>z:Expected the "sequence" to be a string or a list, but got )
isinstancer   torch
LongTensorr   r   	TypeErrortype)r^   sequencer   rf   r   __call__   s   

zCLIPTokenizer.__call__c                 C   sJ   | j |d | jd  }| jg| | jg }|dg| jt|   }|S )Nri   r   )r   r}   r   r   r   rV   )r^   r7   r   r   r   r   r      s   zCLIPTokenizer._tokenizerN)r   )r   r   r   ra   r   r   r   r   r   r   r      s    
	
)rP   r3   r   	functoolsr   r1   regexr:   r   
tokenizersr   r   __all__r   r(   r/   r8   r<   objectr=   r   r   r   r   r   <module>   s$   

U