import os
from shutil import copyfile
from typing import Optional

from tokenizers import processors

from ...tokenization_utils import AddedToken, BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_mbart import MBartTokenizer
else:
    MBartTokenizer = None


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN",
    "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP",
    "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN",
]


class MBartTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizerFast

    >>> tokenizer = MBartTokenizerFast.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
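    >>> # The documented layout can be checked directly: encoder inputs are suffixed
    >>> # with the end-of-sequence token followed by the source language code.
    >>> tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())[-2:]
    ['</s>', 'en_XX']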
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = MBartTokenizer

    prefix_tokens: list[int] = []
    suffix_tokens: list[int] = []

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        src_lang=None,
        tgt_lang=None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file

        self.lang_code_to_id = {
            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
        }

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. The special tokens depend on the language set via `set_src_lang_special_tokens` and
        `set_tgt_lang_special_tokens`.

        An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder): `X [eos, src_lang_code]`
        - `decoder_input_ids` (for decoder): `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)r+   r,   )rG   rZ   r[   r4   r4   r8    build_inputs_with_special_tokens   s   z3MBartTokenizerFast.build_inputs_with_special_tokensc                 C   sP   | j g}| jg}|du rt|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.

        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def prepare_seq2seq_batch(
        self,
        src_texts: list[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[list[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)


__all__ = ["MBartTokenizerFast"]