from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
from tokenizers.models import Unigram

from ...tokenization_python import AddedToken
from ...tokenization_utils_tokenizers import TokenizersBackend
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX",
    "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN",
    "zh_CN",
]


class MBartTokenizer(TokenizersBackend):
    """
    Construct an MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
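    >>> # Illustrative check (added; output assumed from the format described above): the
    >>> # source-side layout is `<tokens> </s> <src_lang_code>`, so the ids end with the
    >>> # language code rather than starting with it.
    >>> tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())[-2:]
    ['</s>', 'en_XX']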
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    model = Unigram

    prefix_tokens: list[int] = []
    suffix_tokens: list[int] = []

    def __init__(
        self,
        vocab: str | dict | list | None = None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        src_lang=None,
        tgt_lang=None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        if vocab is None:
            vocab = [
                (str(bos_token), 0.0),
                (str(pad_token), 0.0),
                (str(eos_token), 0.0),
                (str(unk_token), 0.0),
            ]
            vocab += [("▁", -2.0)]
            for lang_code in FAIRSEQ_LANGUAGE_CODES:
                vocab.append((lang_code, 0.0))
            vocab.append((str(mask_token), 0.0))
        self._vocab = vocab

        self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=3, byte_fallback=False))
        self._tokenizer.normalizer = None
        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=False),
            ]
        )
        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=False)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            **kwargs,
        )

        self.lang_code_to_id = {
            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
        }
        self.fairseq_offset = 1
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_tokens_to_ids[str(mask_token)] = self.convert_tokens_to_ids(str(mask_token))
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: str | None, tgt_lang: str | None, **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        if self.tgt_lang is None:
            self.tgt_lang = self._src_lang
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )


__all__ = ["MBartTokenizer"]