o
    5ti	                     @   s   d dl Z d dlZd dlmZ ddlmZmZ ddlmZ e	dZ
dd	d
dd	d
ddd
ddd
dZG dd deZG dd deZG dd deZG dd deZdS )    N)	lru_cache   )SACREBLEU_DIRdownload_file   )BaseTokenizer	sacrebleuzRhttps://dl.fbaipublicfiles.com/fairseq/models/flores/sacrebleu_tokenizer_spm.model	flores101)url	signaturez)https://tinyurl.com/flores200sacrebleuspm	flores200z<https://www.dlnlp.ai/spBLEU-1K/spbleu-1k_tokenizer_spm.model	spBLEU-1K)spmr	   r   r   c                   @   s0   e Zd Zdd ZdddZedddd	 Zd
S )TokenizerSPMc                 C   s   | j S )N)nameself r   V/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/tokenizers/tokenizer_spm.pyr   #   s   zTokenizerSPM.signaturer   c              	   C   s   t | d | _|dkrtd zdd l}W n ttfy#   tdw | | _t	j
tdt	j
t | d }t	j
|sKt | j d }t|| | j| d S )Nr   r   zRTokenizer 'spm' has been changed to 'flores101', and may be removed in the future.r   z^

Please install the sentencepiece library for SPM tokenization:

  pip install sentencepiece modelsr
   )
SPM_MODELSr   sacreloggerwarnsentencepieceImportErrorModuleNotFoundErrorSentencePieceProcessorspospathjoinr   basenameexistsr   Load)r   keyr   
model_pathr
   r   r   r   __init__&   s    

 
zTokenizerSPM.__init__i   )maxsizec                 C   s   d | j|S )zTokenizes all the characters in the input line.

        :param line: a segment to tokenize
        :return: the tokenized line
         )r    r   EncodeAsPieces)r   liner   r   r   __call__;   s   zTokenizerSPM.__call__N)r   )__name__
__module____qualname__r   r&   r   r+   r   r   r   r   r   "   s
    
r   c                          e Zd Z fddZ  ZS )Flores200Tokenizerc                       t  d d S )Nr   superr&   r   	__class__r   r   r&   F      zFlores200Tokenizer.__init__r,   r-   r.   r&   __classcell__r   r   r4   r   r0   E       r0   c                       r/   )Flores101Tokenizerc                    r1   )Nr	   r2   r   r4   r   r   r&   J   r6   zFlores101Tokenizer.__init__r7   r   r   r4   r   r:   I   r9   r:   c                       r/   )spBLEU1KTokenizerc                    r1   )Nr   r2   r   r4   r   r   r&   O   r6   zspBLEU1KTokenizer.__init__r7   r   r   r4   r   r;   N   r9   r;   )r   logging	functoolsr   utilsr   r   tokenizer_baser   	getLoggerr   r   r   r0   r:   r;   r   r   r   r   <module>   s.   
#