import json
from typing import Callable, Optional

from .._compat import has_hf_transformers, transformers
from .bbpe_encoder import ByteBPEProcessor
from .sentencepiece_encoder import SentencePieceProcessor
from .types import Tok2PiecesInT, Tok2PiecesModelT
from .wordpiece_encoder import WordPieceProcessor

if has_hf_transformers:
    SUPPORTED_TOKENIZERS = (
        transformers.BertTokenizerFast,
        transformers.RobertaTokenizerFast,
        transformers.XLMRobertaTokenizerFast,
        transformers.CamembertTokenizerFast,
        transformers.BertJapaneseTokenizer,
    )
else:
    SUPPORTED_TOKENIZERS = ()

class _HFPieceEncoderLoader:
    """This was formerly an inline function. However, only proper objects
    can be pickled."""

    def __init__(self, *, name: str, revision: str):
        self.name = name
        self.revision = revision

    def __call__(self, model, X=None, Y=None):
        if not has_hf_transformers:
            raise ValueError(
                "`HFPieceEncoderLoader` requires the Hugging Face `transformers` "
                "package to be installed"
            )

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            self.name, revision=self.revision
        )
        return _convert_encoder(model, tokenizer)

def build_hf_piece_encoder_loader_v1(
    *, name: str, revision: str = "main"
) -> Callable[
    [Tok2PiecesModelT, Optional[Tok2PiecesInT], Optional[Tok2PiecesInT]],
    Tok2PiecesModelT,
]:
    """Construct a callback that initializes a HuggingFace piece encoder
    model. Used in conjunction with the HuggingFace model loader.

    name (str):
        Name of the HuggingFace model.
    revision (str):
        Name of the model revision/branch.
    """
    return _HFPieceEncoderLoader(name=name, revision=revision)

def _convert_encoder(
    model: Tok2PiecesModelT, tokenizer: "transformers.PreTrainedTokenizerBase"
) -> Tok2PiecesModelT:
    if isinstance(tokenizer, transformers.BertTokenizerFast):
        return _convert_wordpiece_encoder(model, tokenizer)
    elif isinstance(tokenizer, transformers.RobertaTokenizerFast):
        return _convert_byte_bpe_encoder(model, tokenizer)
    elif isinstance(
        tokenizer,
        (transformers.XLMRobertaTokenizerFast, transformers.CamembertTokenizerFast),
    ):
        return _convert_sentencepiece_encoder(model, tokenizer)
    elif isinstance(tokenizer, transformers.BertJapaneseTokenizer):
        return _convert_bert_japanese_encoder(model, tokenizer)

    raise ValueError(
        f"Attempting to load an unsupported Hugging Face tokenizer ({type(tokenizer)}). "
        f"Currently supported tokenizers: {SUPPORTED_TOKENIZERS}"
    )
r   z!transformers.RobertaTokenizerFastc                 C   sl   |j d}t|}|d }dd |d D }t|d || jd< |j| jd< |j| jd	< |j| jd
< | S )NTr   c                 S   s   g | ]	}t |d qS ) )tuplesplit).0merger   r   r   
<listcomp>X   s    z-_convert_byte_bpe_encoder.<locals>.<listcomp>mergesvocabbyte_bpe_processor	bos_piece	eos_piece	unk_piece)	backend_tokenizerto_strjsonloadsr   attrs	bos_token	eos_token	unk_token)r   r   
serializeddeserializedvocab_mergesr:   r   r   r   r+   N   s   
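
# For reference, the serialized fast tokenizer decoded above is JSON whose
# "model" subtree has roughly this shape (abridged; the layout follows the
# Hugging Face `tokenizers` format, the values here are illustrative):
#
#     {"model": {"type": "BPE",
#                "vocab": {"<s>": 0, "<pad>": 1, ...},
#                "merges": ["Ġ t", "Ġ a", ...]}}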

def _convert_sentencepiece_encoder(
    model: Tok2PiecesModelT, tokenizer: "transformers.RobertaTokenizerFast"
) -> Tok2PiecesModelT:
    # The sentencepiece model is available on disk, so it can be loaded
    # directly from the tokenizer's vocab file.
    model.get_ref("encoder").attrs[
        "sentencepiece_processor"
    ] = SentencePieceProcessor.from_file(tokenizer.vocab_file)
    return model

def _convert_wordpiece_encoder(
    model: Tok2PiecesModelT, tokenizer: "transformers.BertTokenizerFast"
) -> Tok2PiecesModelT:
    # The vocabulary file of a BERT tokenizer is not directly accessible, so
    # rebuild the vocabulary in piece-index order from the tokenizer's dict.
    vocab = [None] * tokenizer.vocab_size
    for piece, idx in tokenizer.vocab.items():
        vocab[idx] = piece

    strip_accents = tokenizer.backend_tokenizer.normalizer.strip_accents
    lowercase = tokenizer.do_lower_case
    model.attrs["wordpiece_processor"] = WordPieceProcessor(vocab)
    model.attrs["bos_piece"] = tokenizer.cls_token
    model.attrs["eos_piece"] = tokenizer.sep_token
    model.attrs["unk_piece"] = tokenizer.unk_token
    model.attrs["lowercase"] = lowercase
    # BERT strips accents when lowercasing, unless stripping was explicitly
    # disabled.
    model.attrs["strip_accents"] = strip_accents or (
        strip_accents is not False and lowercase
    )

    return model
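
# The strip_accents expression above encodes BERT's defaults; spelled out
# (derived directly from the expression, not from external documentation):
#
#     strip_accents=True,  any lowercase    -> strip
#     strip_accents=None,  lowercase=True   -> strip (implicit via lowercasing)
#     strip_accents=None,  lowercase=False  -> keep
#     strip_accents=False, any lowercase    -> keep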

def _convert_bert_japanese_encoder(
    model: Tok2PiecesModelT, tokenizer: "transformers.BertJapaneseTokenizer"
) -> Tok2PiecesModelT:
    if not isinstance(
        tokenizer.subword_tokenizer,
        transformers.models.bert_japanese.CharacterTokenizer,
    ):
        raise ValueError(
            "Japanese BERT models currently only support character subword encoding"
        )
    if model.name != "char_encoder":
        raise ValueError(
            f"Attempting to initialize an incompatible piece encoder ('{model.name}') "
            "with the Hugging Face Japanese BERT tokenizer. It can only be used with "
            "the `CharEncoder` piece encoder"
        )

    model.attrs["bos_piece"] = tokenizer.cls_token
    model.attrs["eos_piece"] = tokenizer.sep_token
    model.attrs["unk_piece"] = tokenizer.unk_token
    model.attrs["normalize"] = (
        "NFKC" if tokenizer.subword_tokenizer.normalize_text else None
    )
    model.attrs["vocab"] = tokenizer.vocab.copy()

    return model
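
# "NFKC" mirrors the normalization the Japanese BERT character tokenizer
# applies when normalize_text is enabled; it folds compatibility forms such
# as half-width katakana:
#
#     import unicodedata
#     unicodedata.normalize("NFKC", "ﾃｽﾄ")  # -> "テスト"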