o
    ߥi                     @   sn   d dl Z d dlZd dlmZ d dlZd dlZd dlZe dd Zdd Z	dd Z
d	d
 ZG dd deZdS )    N)	lru_cachec                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | d d  }d}td	D ]}|| vrI| | |d	|  |d7 }q3d
d |D }tt| |S )N!~      ¡   ¬   ®   ÿr      c                 S   s   g | ]}t |qS  )chr).0nr   r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/tokenizer.py
<listcomp>       z$bytes_to_unicode.<locals>.<listcomp>)listrangeordappenddictzip)bscsr   br   r   r   bytes_to_unicode   s,   





r   c                 C   s6   t  }| d }| dd  D ]}|||f |}q|S )Nr   r   )setadd)wordpairs	prev_charcharr   r   r   	get_pairs   s   r"   c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptextr   r   r   basic_clean(   s   
r+   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr(   r)   r   r   r   whitespace_clean.   s   r/   c                   @   s6   e Zd Zdd Zdd Zdd Zdd Zdd
dZdS )SimpleTokenizerc                 C   s   t  | _dd | j D | _t| dd}|dd }dd |D }t	t  
 }|d	d |D  }|D ]
}|d
| q;|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>8       z,SimpleTokenizer.__init__.<locals>.<dictcomp>utf-8
r   i  c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   r   ;   s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r4   r   r   r   r   =   r    <|startoftext|><|endoftext|>c                 S   r1   r   r   r2   r   r   r   r5   B   r6   )r>   r?   z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   byte_encoderitemsbyte_decodergzipopenreaddecoder:   r   valuesr   joinextendr   r   r   lenencoderdecoder	bpe_rankscacher-   compile
IGNORECASEpat)selfbpe_pathmergesvocabr;   r   r   r   __init__6   s(   
zSimpleTokenizer.__init__c           
         sr  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4ny|\}}g }d}|t|k rz|||}	||||	  |	}W n tyg   |||d   Y n4w || |kr|t|d k r||d  |kr|	||  |d	7 }n|	||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d

|}| j |< |S )Nr<   Tc                    s    j | tdS )Ninf)rM   getfloat)pairrR   r   r   <lambda>W   s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r      r,   )rN   r9   r"   minrM   rJ   indexrI   
ValueErrorr   rH   )
rR   tokenr   r   bigramfirstsecondnew_wordijr   r\   r   bpeL   sV   





zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )Nr=   c                 3       | ]} j | V  qd S r#   )r@   )r   r   r\   r   r   	<genexpr>{       z)SimpleTokenizer.encode.<locals>.<genexpr>r7   c                 3   rk   r#   )rK   )r   	bpe_tokenr\   r   r   rl   }   rm   r,   )r/   r+   lowerr-   findallrQ   rH   encoderI   rj   r:   )rR   r*   
bpe_tokensrc   r   r\   r   rq   w   s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
Nr=   c                       g | ]} j | qS r   )rL   )r   rc   r\   r   r   r      r6   z*SimpleTokenizer.decode.<locals>.<listcomp>c                    rs   r   )rB   )r   cr\   r   r   r      r6   r7   replace)errorsr<   r,   )rH   	bytearrayrF   ru   )rR   tokensr*   r   r\   r   rF      s   zSimpleTokenizer.decodeM   c                    s   t |tr|g}jd jd   fdd|D }tjt||tjd}t|D ]!\}}t||kr@|d | } |d< t|||d t|f< q,|S )Nr>   r?   c                    s"   g | ]}g |  g qS r   )rq   )r   r*   	eot_tokenrR   	sot_tokenr   r   r      s    z,SimpleTokenizer.tokenize.<locals>.<listcomp>)dtyperW   )	
isinstancestrrK   torchzerosrJ   int	enumeratetensor)rR   textscontext_length
all_tokensresultrh   rx   r   rz   r   tokenize   s   


zSimpleTokenizer.tokenizeN)ry   )__name__
__module____qualname__rV   rj   rq   rF   r   r   r   r   r   r0   4   s    +
r0   )rC   r&   	functoolsr   r$   regexr-   r   r   r"   r+   r/   objectr0   r   r   r   r   <module>   s   
	