o
    ߥi9                     @   sv   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZe dd Z	dd Z
dd Zd	d
 ZG dd deZdS )    N)	lru_cachec                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | d d  }d}td	D ]}|| vrI| | |d	|  |d7 }q3d
d |D }tt| |S )N!~      ¡   ¬   ®   ÿr      c                 S   s   g | ]}t |qS  )chr).0nr   r   h/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/tokenization_clip.py
<listcomp>       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s,   





r   c                 C   s6   t  }| d }| dd  D ]}|||f |}q|S )Nr   r   )setadd)wordpairs	prev_charcharr   r   r   	get_pairs"   s   r"   c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptextr   r   r   basic_clean+   s   
r+   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr(   r)   r   r   r   whitespace_clean1   s   r/   c                   @   s4   e Zd Zdd Zdd Zdd Z			dd	d
ZdS )LengthAdaptiveTokenizerc                 C   s   t  | _dd | j D | _|}|dd }dd |D }tt   }|dd |D  }|D ]
}|d| q0|d	d
g t	t
|tt|| _dd | j D | _t	t
|tt|| _d	d
d| _tdtj| _| j| _|j|j | _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>;       z4LengthAdaptiveTokenizer.__init__.<locals>.<dictcomp>r   i  c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r   >   s    z4LengthAdaptiveTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r4   r   r   r   r   @   r    <|startoftext|><|endoftext|>c                 S   r1   r   r   r2   r   r   r   r5   E   r6   )r<   r=   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   byte_encoderitemsbyte_decoderr   valuesr   joinextendr   r   r   lenencoderdecoder	bpe_rankscacher-   compile
IGNORECASEpatvocabtp_prefix_token_numtp_suffix_token_numtp_token_num)selfconfigbpe_pathmergesrL   r9   r   r   r   __init__9   s,   z LengthAdaptiveTokenizer.__init__c           
         sr  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4ny|\}}g }d}|t|k rz|||}	||||	  |	}W n tyg   |||d   Y n4w || |kr|t|d k r||d  |kr|	||  |d	7 }n|	||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d

|}| j |< |S )Nr:   Tc                    s    j | tdS )Ninf)rG   getfloat)pairrP   r   r   <lambda>^   s    z-LengthAdaptiveTokenizer.bpe.<locals>.<lambda>)keyr   r      r,   )rH   r7   r"   minrG   rD   indexrC   
ValueErrorr   rB   )
rP   tokenr   r   bigramfirstsecondnew_wordijr   rZ   r   bpeS   sV   





zLengthAdaptiveTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )Nr;   c                 3       | ]} j | V  qd S r#   )r>   )r   r   rZ   r   r   	<genexpr>       z1LengthAdaptiveTokenizer.encode.<locals>.<genexpr>zutf-8c                 3   ri   r#   )rE   )r   	bpe_tokenrZ   r   r   rj      rk   r,   )r/   r+   lowerr-   findallrK   rB   encoderC   rh   r8   )rP   r*   
bpe_tokensra   r   rZ   r   ro   ~   s   
zLengthAdaptiveTokenizer.encodeptTc                    s   dj  }t|tr|g}jd jd   fdd|D }tjt||tjd}t|D ]3\}}	t|	|krWg|	d|d    g }
t	|
||d t|	f< q1t	|	||d t|	f< q1|S )NM   r<   r=   c                    s"   g | ]}g |  g qS r   )ro   )r   r*   	eot_tokenrP   	sot_tokenr   r   r      s    z4LengthAdaptiveTokenizer.__call__.<locals>.<listcomp>)dtyper   )
rO   
isinstancestrrE   torchzerosrD   long	enumeratetensor)rP   textsreturn_tensorspadding
truncationcontext_length
all_tokensresultrf   tokens
new_tokensr   rs   r   __call__   s&   



z LengthAdaptiveTokenizer.__call__N)rq   TT)__name__
__module____qualname__rT   rh   ro   r   r   r   r   r   r0   7   s    +r0   )gzipr&   os	functoolsr   r$   regexr-   ry   r   r"   r+   r/   objectr0   r   r   r   r   <module>   s   
	