o
    i                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
mZ ddlmZmZmZmZ defdd	Zd
e
dededeeef fddZddddddededededee deeee ee gef fddZdS )    N)Path)CallableDictOptionalOrderedDictTuple)ModelRagged   )Tok2PiecesBackpropTTok2PiecesInTTok2PiecesModelTTok2PiecesOutTreturnc                	   C   s   t dtdddddddS )	aa  Construct a character piece encoder model that accepts a list
    of token sequences or documents and returns a corresponding list
    of piece identifiers.

    This model must be separately initialized using an appropriate
    loader.

    normalize (str):
       Unicode normalization to apply before encoding a token. Defaults to
       "NFKC".
    char_encoder[CLS][SEP][UNK]NFKCN)	bos_piece	eos_piece	unk_piece	normalizevocab)forwardattrs)r   char_encoder_forward r   r   h/home/ubuntu/.local/lib/python3.10/site-packages/spacy_curated_transformers/tokenization/char_encoder.pybuild_char_encoder_v1
   s   r   modelXis_trainc                    s  | j d du rtd| j d }| j d }| j d }| j d }| }| }|  g }	|D ]K}
|g}dg}|
D ]&}|durHt||jn|j} fd	d
|D }|| |t| q;|| |d |	t| j	
|| j	
| q1|	dd fS )zConstruct a character piece encoder model that accepts a list
    of token sequences or documents and returns a corresponding list
    of piece identifiers.

    This model must be separately initialized using an appropriate
    loader.
    r   Nz\Character piece encoder vocabulary is not available. Use a loader to initialize the encoder.r   r   r   r   r
   c                    s   g | ]} | qS r   )get).0charunk_idr   r   r   
<listcomp>J   s    z(char_encoder_forward.<locals>.<listcomp>c                 S   s   g S )Nr   )dYr   r   r   <lambda>T   s    z&char_encoder_forward.<locals>.<lambda>)r   
ValueErrorunicodedatar   textextendappendlenr	   ops	asarray1i)r    r!   r"   r   r   r   r   bos_ideos_idpiecesdoc
doc_pieceslenstokenr-   	piece_idsr   r&   r   r   #   s<   








r   r   r   r   r   )r   r   r   r   pathr   r   r   r   c                    s   d fdd	}|S )zConstruct a callback that initializes a character piece encoder
    model.

    path (Path):
        Path to the serialized character model.
    Nc                    s   | j dkrtd| j  d | jd< | jd< | jd< | jd< t }tdd	"}|D ]}|d
}d ur?t|}t|||< q.W d    n1 sPw   Y  || jd< | S )Nr   z\Attempting to use the `CharEncoderLoader` piece encoder loader with an incompatible model ('z<'). It can only be used with the `CharEncoder` piece encoderr   r   r   r   zutf-8)encodingz
r   )	namer+   r   r   openrstripr,   r   r0   )r    r!   Yr   fr%   r   r   r   r;   r   r   r   loadi   s*   






z*build_char_encoder_loader_v1.<locals>.load)NNr   )r;   r   r   r   r   rC   r   rB   r   build_char_encoder_loader_v1W   s   rD   )r,   pathlibr   typingr   r   r   r   r   	thinc.apir   r	   typesr   r   r   r   r   boolr   strrD   r   r   r   r   <module>   sH    

7