o
    i                   
   @   s  d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
mZmZmZ ddlmZmZmZmZ eeded	ed
efddZeeded	ed
efddZd
efddZde
deded
eeef fddZdeded
eeee ee gef fddZdS )    )Path)CallableOptionalTupleN)ByteBPEProcessor)ModelRaggeddeserialize_attrserialize_attr   )Tok2PiecesBackpropTTok2PiecesInTTok2PiecesModelTTok2PiecesOutTvaluenamereturnc                 C   s   |j |jd}t|S )N)mergesvocab)r   r   srslymsgpack_dumps_r   r   modeldata r   h/home/ubuntu/.local/lib/python3.10/site-packages/spacy_curated_transformers/tokenization/bbpe_encoder.pyserialize_byte_bpe_processor   s   
r   c                 C   s   t |}t|d |d S )Nr   r   )r   msgpack_loadsr   r   r   r   r   deserialize_byte_bpe_processor   s   
r   c                   C   s   t dtti g dddddS )zConstruct a Byte-BPE piece encoder model that accepts a list
    of token sequences or documents and returns a corresponding list
    of piece identifiers.

    This model must be separately initialized using an appropriate
    loader.
    byte_bpe_encoderz<unk>z<s>z</s>)byte_bpe_processor	unk_piece	bos_piece	eos_piece)forwardattrs)r   byte_bpe_encoder_forwardr   r   r   r   r   build_byte_bpe_encoder_v1   s   r(   r   Xis_trainc                 C   s4  | j d }| j d }| j d }| j d }||}|d u r!td||}|d u r.td||}	|	d u r;tdg }
|D ]T}|g}dg}t|D ]-\}}|jrUg }n|d	krd||d  j|j }n|j}||}|| |	t
| qK|	| |	d |
	t| j|| j| q?|
d
d fS )Nr!   r#   r$   r"   z=Byte-BPE piece encoder vocabulary doesn't contain 'BOS' piecez=Byte-BPE piece encoder vocabulary doesn't contain 'EOS' piecez=Byte-BPE piece encoder vocabulary doesn't contain 'UNK' piecer   r   c                 S   s   g S )Nr   )dYr   r   r   <lambda>d   s    z*byte_bpe_encoder_forward.<locals>.<lambda>)r&   piece_id
ValueError	enumerateis_spacewhitespace_textencode_as_idsextendappendlenr   ops	asarray1i)r   r)   r*   bbpr#   r$   r"   bos_ideos_idunk_idpiecesdoc
doc_pieceslensidxtoken	piece_idsr2   r   r   r   r'   -   sR   












r'   
vocab_pathmerges_pathc                    s   d fdd	}|S )zConstruct a callback that initializes a Byte-BPE piece encoder
    model.

    vocab_path (Path):
        Path to the vocabulary file.
    merges_path (Path):
        Path to the merges file.
    Nc                    s   t j d| jd< | S )N)r   r   r!   )r   load_from_filesr&   )r   r)   YrE   rD   r   r   loadv   s   z.build_byte_bpe_encoder_loader_v1.<locals>.load)NNr   )rD   rE   rI   r   rH   r    build_byte_bpe_encoder_loader_v1g   s   rJ   )pathlibr   typingr   r   r   r   curated_tokenizersr   	thinc.apir   r   r	   r
   typesr   r   r   r   registerstrbytesr   r   r(   boolr'   rJ   r   r   r   r   <module>   sJ    

: