o
    i                  
   @   s(  d dl mZ d dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZ ddlmZmZ ddlmZmZmZmZ eeded	ed
efddZeeded	ed
efddZd
efddZd
efddZd
efddZde	deded
eeef fddZded
eeee ee gef fddZ dS )    )Path)CallableOptionalTuple)SentencePieceProcessor)ModelRaggedchaindeserialize_attrserialize_attr   )build_camembert_adapterbuild_xlmr_adapter)Tok2PiecesBackpropTTok2PiecesInTTok2PiecesModelTTok2PiecesOutTvaluenamereturnc                 C   s   |  S N)to_protobuf_r   r   model r   q/home/ubuntu/.local/lib/python3.10/site-packages/spacy_curated_transformers/tokenization/sentencepiece_encoder.py!serialize_sentencepiece_processor   s   r   c                 C   s
   t |S r   )r   from_protobufr   r   r   r   deserialize_my_custom_class   s   
r   c                  C   "   t  } t| t }|d|  |S )a  Construct a SentencePiece piece encoder model that accepts a list
    of token sequences or documents and returns a corresponding list
    of piece identifiers with CamemBERT post-processing applied.

    This model must be separately initialized using an appropriate
    loader.
    encoder)build_sentencepiece_encoder_v1r	   r   set_refr!   r   r   r   r   (build_camembert_sentencepiece_encoder_v1      r%   c                  C   s$   t dtdt id} | d|  | S )zConstruct a SentencePiece piece encoder model that accepts a list
    of token sequences or documents and returns a corresponding list
    of piece identifiers.

    This model must be separately initialized using an appropriate
    loader.
    sentencepiece_encodersentencepiece_processor)forwardattrsr!   )r   sentencepiece_encoder_forwardr   r#   )r   r   r   r   r"   '   s   r"   c                  C   r    )a  Construct a SentencePiece piece encoder model that accepts a list
    of token sequences or documents and returns a corresponding list
    of piece identifiers with XLM-RoBERTa post-processing applied.

    This model must be separately initialized using an appropriate
    loader.
    r!   )r"   r	   r   r#   r$   r   r   r   #build_xlmr_sentencepiece_encoder_v18   r&   r,   r   Xis_trainc           
      C   s   | j d }g }|D ]C}| g}dg}|D ]}|jrg }	n||j}	||	 |t|	 q||  |d |t	| j
|| j
| q	|dd fS )Nr(   r   c                 S   s   g S r   r   )dYr   r   r   <lambda>g   s    z/sentencepiece_encoder_forward.<locals>.<lambda>)r*   bos_idis_spaceencode_as_idstextextendappendleneos_idr   ops	asarray1i)
r   r-   r.   spppiecesdoc
doc_pieceslenstoken	piece_idsr   r   r   r+   F   s(   





r+   pathc                    s   d fdd	}|S )zConstruct a callback that initializes a SentencePiece piece encoder
    model.

    path (Path):
        Path to the serialized SentencePiece model.
    Nc                    s   t t | jd< | S )Nr(   )r   	from_filestrr*   )r   r-   YrB   r   r   loadu   s   
z3build_sentencepiece_encoder_loader_v1.<locals>.load)NNr   )rB   rG   r   rF   r   %build_sentencepiece_encoder_loader_v1j   s   rH   N)!pathlibr   typingr   r   r   curated_tokenizersr   	thinc.apir   r   r	   r
   r   sentencepiece_adaptersr   r   typesr   r   r   r   registerrD   bytesr   r   r%   r"   r,   boolr+   rH   r   r   r   r   <module>   sN    

$