o
    ίi                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlmZ eeZeG dd	 d	ZG d
d de jZG dd deZG dd deZG dd deZdZd dddddddZdZG dd deZd"dede
e defd d!ZdS )#    N)copy)	dataclass)Path)ListOptionalTuple)SentencePieceProcessor)load_tiktoken_bpec                   @   s*   e Zd ZU dZeed< dZee ed< dS )TokenizerArgsbytesnameNpath)__name__
__module____qualname__r   str__annotations__r   r    r   r   B/home/ubuntu/.local/lib/python3.10/site-packages/core/tokenizer.pyr
      s   
 r
   c                   @   s`   e Zd Zejdd Zejdd Zej	ddedee	e
  dee	e e	e
 f fd	d
ZdS )	Tokenizerc                 C      d S Nr   selftokensadd_bosadd_eosr   r   r   encode      zTokenizer.encodec                 C   r   r   r   r   r   r   r   r   decode   r   zTokenizer.decodeNtextr   returnc                 C   s   dS )zPReturn the offsets of the tokens in the original text. Only used for evaluation.Nr   )r   r!   r   r   r   r   get_token_offsets!   s   zTokenizer.get_token_offsetsr   )r   r   r   abcabstractmethodr   r    r   r   r   intr   r#   r   r   r   r   r      s    


r   c                   @   s"   e Zd ZU dZeed< dd ZdS )MockTokenizer   n_wordsc                 C   s   |S r   r   r   r   r   r   r   ,   s   zMockTokenizer.encodeN)r   r   r   r)   r&   r   r   r   r   r   r   r'   )   s   
 r'   c                
   @   sp   e Zd Zdd ZddededefddZd	ee fd
dZ		dded	e
ee  deee ee f fddZdS )ByteTokenizerc                 C   s   d| _ d| _d| _d S )Nr(   i  i  )bos_ideos_idr)   )r   r   r   r   __init__1   s   
zByteTokenizer.__init__Fsr   r   c                 C   s(   | j g| t|  | jg|  }|S r   )r+   listr   r,   r   r.   r   r   r   r   r   r   r   6   s   $zByteTokenizer.encoder   c                 C   s    t dd |D }|jdddS )Nc                 S   s   g | ]}|d k r|qS )r(   r   ).0tr   r   r   
<listcomp>;   s    z(ByteTokenizer.decode.<locals>.<listcomp>utf-8backslashreplaceerrors)r   r    )r   r   byte_tokensr   r   r   r    :   s   zByteTokenizer.decodeNr!   r"   c                 C   sx   |d u r	|  |}g g }}d}|D ]%}|dk r7t|gjddd}|r.|| || |t| d7 }q||fS )Nr   r(   r4   ignorer6   )r   r   r    appendlen)r   r!   r   decoded_charsoffsetsbyte_postokencharr   r   r   r#   >   s   



zByteTokenizer.get_token_offsets)FFr   )r   r   r   r-   r   boolr   r   r&   r    r   r   r#   r   r   r   r   r*   0   s    
r*   c                
   @   x   e Zd ZdeddfddZdededefd	d
Zdee fddZ		ddede
ee  deee ee f fddZdS )SentencePieceTokenizer
model_pathr"   Nc                 C   s   t j|s
J |t|d| _td|  | j | _| j	 | _	| j
 | _
| j | _td| j d| j	 d| j
  | j | j ksMJ d S )N)
model_filez"Reloaded SentencePiece model from #words:  - BOS ID:  - EOS ID: )osr   isfiler   sp_modelloggerinfo
vocab_sizer)   r+   r,   pad_idget_piece_size)r   rD   r   r   r   r-   R   s   zSentencePieceTokenizer.__init__r.   r   r   c                 C   s8   t |tu sJ | jg| | j| | jg|  }|S r   )typer   r+   rK   r   r,   r0   r   r   r   r   b   s   "zSentencePieceTokenizer.encoder   c                 C      | j |S r   )rK   r    r   r   r   r   r    i      zSentencePieceTokenizer.decoder!   c                 C   s2   | j |j}dd |D }dd |D }||fS )Nc                 S      g | ]}|j qS r   )surfacer1   pr   r   r   r3   p       z<SentencePieceTokenizer.get_token_offsets.<locals>.<listcomp>c                 S   rT   r   )beginrV   r   r   r   r3   q   rX   )rK   encode_as_immutable_protopieces)r   r!   r   r[   substrsr=   r   r   r   r#   l   s   z(SentencePieceTokenizer.get_token_offsetsr   r   r   r   r   r-   rA   r   r   r&   r    r   r   r#   r   r   r   r   rC   Q   s    
rC   zs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+                  )<|begin_of_text|><|end_of_text|>z<|fim_prefix|>z<|fim_middle|>z<|fim_end_fill|>z<|fim_pad|>z<|fim_suffix|>i c                
   @   rB   )TikTokenTokenizerrD   r"   Nc                 C   s   t |}tt}ttdt|  }|D ]
}||d| d< q|D ]}||  t|7  < q#tjj	t
|jt||d| _| jd| _| jd| _| jj| _td| j d| j d	| j  d S )
Nr(   z<|reserved_special_token_z|>)r   pat_strmergeable_ranksspecial_tokensrd   re   rF   rG   rH   )r	   r   DEFAULT_TIKTOKEN_SPECIAL_TOKENSsetrangevaluesr;   tiktokencoreEncodingr   stemDEFAULT_TIKTOKEN_PATTERN	tkt_modelencode_single_tokenr+   r,   n_vocabr)   rL   rM   )r   rD   rh   all_special_tokens_with_idsmissing_idsidr   r   r   r   r-      s&   
zTikTokenTokenizer.__init__r.   r   r   c                 C   sj   t |tsJ g }tdt|tD ]}||||t   q| jg| t| j	|g d | j
g|  S )Nr   )start)
isinstancer   rl   r;   TIKTOKEN_MAX_ENCODE_CHARSr:   r+   sumrs   encode_ordinary_batchr,   )r   r.   r   r   subsir   r   r   r      s   

zTikTokenTokenizer.encoder   c                 C   rR   r   )rs   r    r   r   r   r   r       rS   zTikTokenTokenizer.decoder!   c              	      s   |d ur| j |}n| j | j j dd}dg }}|D ]#}|td|d|d   ko0dk n    |tdd |D 7 }q fdd	t||d
d  d g D }||fS )Nall)allowed_specialr         c                 s   s*    | ]}d |  krdk sn dV  qdS )r   r   r^   Nr   )r1   cr   r   r   	<genexpr>   s   ( z6TikTokenTokenizer.get_token_offsets.<locals>.<genexpr>c                    s   g | ]
\}} || qS r   r   )r1   r.   er!   r   r   r3      s    z7TikTokenTokenizer.get_token_offsets.<locals>.<listcomp>r^   )rs   decode_tokens_bytesr   r:   maxr|   zip)r   r!   r   token_bytestext_lenr=   r?   r\   r   r   r   r#      s   
,&z#TikTokenTokenizer.get_token_offsetsr   r]   r   r   r   r   rf      s    
rf   r   r   r"   c                 C   sJ   | dkrt  S | dkrt S | dkrt|S | dkrt|S t|  d)Nr   mocksprn   z" tokenizer type is not implemented)r*   r'   rC   rf   NotImplementedError)r   r   r   r   r   build_tokenizer   s   r   r   ) r$   loggingrI   r   dataclassesr   pathlibr   typingr   r   r   rn   sentencepiecer   tiktoken.loadr	   	getLoggerr   rL   r
   ABCr   r'   r*   rC   rr   rj   r{   rf   r   r   r   r   r   r   <module>   s:   
!$	 <