o
    eic7                     @   s   d dl mZmZmZmZmZmZ d dlmZ ddl	m
Z
mZ ddlmZ ddlmZ eeZddd	Zg d
ZG dd deZdgZdS )    )Regex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPE   )
AddedTokenBatchEncoding)TokenizersBackend)loggingzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ace_Arabace_Latnacm_Arabacq_Arabaeb_Arabafr_Latnajp_Arabaka_Latnamh_Ethiapc_Arabarb_Arabars_Arabary_Arabarz_Arabasm_Bengast_Latnawa_Devaayr_Latnazb_Arabazj_Latnbak_Cyrlbam_Latnban_Latnbel_Cyrlbem_Latnben_Bengbho_Devabjn_Arabbjn_Latnbod_Tibtbos_Latnbug_Latnbul_Cyrlcat_Latnceb_Latnces_Latncjk_Latnckb_Arabcrh_Latncym_Latndan_Latndeu_Latndik_Latndyu_Latndzo_Tibtell_Grekeng_Latnepo_Latnest_Latneus_Latnewe_Latnfao_Latnpes_Arabfij_Latnfin_Latnfon_Latnfra_Latnfur_Latnfuv_Latngla_Latngle_Latnglg_Latngrn_Latnguj_Gujrhat_Latnhau_Latnheb_Hebrhin_Devahne_Devahrv_Latnhun_Latnhye_Armnibo_Latnilo_Latnind_Latnisl_Latnita_Latnjav_Latnjpn_Jpankab_Latnkac_Latnkam_Latnkan_Kndakas_Arabkas_Devakat_Georknc_Arabknc_Latnkaz_Cyrlkbp_Latnkea_Latnkhm_Khmrkik_Latnkin_Latnkir_Cyrlkmb_Latnkon_Latnkor_Hangkmr_Latnlao_Laoolvs_Latnlij_Latnlim_Latnlin_Latnlit_Latnlmo_Latnltg_Latnltz_Latnlua_Latnlug_Latnluo_Latnlus_Latnmag_Devamai_Devamal_Mlymmar_Devamin_Latnmkd_Cyrlplt_Latnmlt_Latnmni_Bengkhk_Cyrlmos_Latnmri_Latnzsm_Latnmya_Mymrnld_Latnnno_Latnnob_Latnnpi_Devanso_Latnnus_Latnnya_Latnoci_Latngaz_Latnory_Oryapag_Latnpan_Gurupap_Latnpol_Latnpor_Latnprs_Arabpbt_Arabquy_Latnron_Latnrun_Latnrus_Cyrlsag_Latnsan_Devasat_Bengscn_Latnshn_Mymrsin_Sinhslk_Latnslv_Latnsmo_Latnsna_Latnsnd_Arabsom_Latnsot_Latnspa_Latnals_Latnsrd_Latnsrp_Cyrlssw_Latnsun_Latnswe_Latnswh_Latnszl_Latntam_Tamltat_Cyrltel_Telutgk_Cyrltgl_Latntha_Thaitir_Ethitaq_Latntaq_Tfngtpi_Latntsn_Latntso_Latntuk_Latntum_Latntur_Latntwi_Latntzm_Tfnguig_Arabukr_Cyrlumb_Latnurd_Arabuzn_Latnvec_Latnvie_Latnwar_Latnwol_Latnxho_Latnydd_Hebryor_Latnyue_Hantzho_Hanszho_Hantzul_Latnc                       sf  e Zd ZU dZeZddgZeZg Z	e
e ed< g Ze
e ed< 									
						d0deeeef B dB dee
e B dB f fddZedefddZejdeddfddZdededB dedB fddZ								d1de
e ded e
e dB ded!edB d"edB d#ededB d$edefd%d&Zd'd( Zd)d* Zd2d+d,Zd-eddfd.d/Z  ZS )3NllbTokenizera	  
    Construct an NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import NllbTokenizer

    >>> tokenizer = NllbTokenizer.from_pretrained(
    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
    ```

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values.
        src_lang (`str`, *optional*):
            The language to use as source language for translation.
        tgt_lang (`str`, *optional*):
            The language to use as target language for translation.
        legacy_behaviour (`bool`, *optional*, defaults to `False`):
            Whether to use legacy behaviour (suffix pattern) or new behaviour (prefix pattern).
    	input_idsattention_maskprefix_tokenssuffix_tokensN<s></s><unk><pad><mask>Fvocabmergesc                    s  |d ur|}n|d u rt }t|	trt|	ddddn|	}	|| _|d u r5t|dt|dt|dt|di}|| _|p;g | _tt| j| jd t|ddd| _	t
t
td	d
t
 t
tdd
g| j	_tjdddd| j	_tjdddd| j	_t jd|||||||
||	||d| d| _ddddd| _dd | j D | _|
d ur|
nd| _| | j| _|| _| | j d S )NT)
normalizedlstripspecialr         r	   F)r   r   dropout	unk_tokenfuse_unkbyte_fallbackz[\n\r\t] z {2,}u   ▁always)replacementprepend_schemesplit)	bos_token	eos_token	sep_token	cls_tokenr   	pad_tokensrc_langtgt_lang
mask_tokenextra_special_tokenslegacy_behaviour)r   r   r   r   c                 S   s   i | ]\}}||qS  r   ).0kvr   r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/nllb/tokenization_nllb.py
<dictcomp>   s    z*NllbTokenizer.__init__.<locals>.<dictcomp>r>   r   ) FAIRSEQ_LANGUAGE_CODES
isinstancestrr
   r   _vocab_mergesr   r   
_tokenizerr   SequenceReplacer   NFKC
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__fairseq_offsetfairseq_tokens_to_idsitemsfairseq_ids_to_tokens	_src_langconvert_tokens_to_idscur_lang_coder   set_src_lang_special_tokens)selfr   r   r   r   r   r   r   r   r   r   r   additional_special_tokensr   r   kwargs	__class__r   r  r  Y   sz   
zNllbTokenizer.__init__returnc                 C   s   | j S N)r  r  r   r   r  r      s   zNllbTokenizer.src_langnew_src_langc                 C   s   || _ | | j  d S r!  )r  r  )r  r#  r   r   r  r      s   return_tensorsr   r   c                 K   sJ   |du s|du rt d|| _| |fd|d|}| |}||d< |S )zIUsed by translation pipeline, to prepare inputs for the generate functionNzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr$  forced_bos_token_id)
ValueErrorr   r  )r  
raw_inputsr$  r   r   extra_kwargsinputstgt_lang_idr   r   r  _build_translation_inputs   s   
z'NllbTokenizer._build_translation_inputsr>   rH   longestT	src_texts	tgt_texts
max_lengthmax_target_lengthpadding
truncationc
                 K   s   || _ || _|d u r| j}| |fd||||	d|
}|d u r!|S |d u r'|}|   | |fd||||	d|
}|d |d< |   |S )NT)r%  r$  r0  r2  r3  )r%  r$  r2  r0  r3  r   labels)r   r   model_max_length_switch_to_target_mode_switch_to_input_mode)r  r.  r   r/  r   r0  r1  r2  r$  r3  r  model_inputsr4  r   r   r  prepare_seq2seq_batch   sD   
	z#NllbTokenizer.prepare_seq2seq_batchc                 C   s   |  | jS r!  )r  r   r"  r   r   r  r7    s   z#NllbTokenizer._switch_to_input_modec                 C   s   | j d u r	| j| _ | | j S r!  )r   r  set_tgt_lang_special_tokensr"  r   r   r  r6    s   
z$NllbTokenizer._switch_to_target_modec                 C      |  || _| jrg | _| j| jg| _n
| jg| _| jg| _| | j}| | j}tj|dg | |ddg | t	t
|| | j| j d| j_dS )zReset the special tokens to the source lang setting.
        - In legacy mode: No prefix and suffix=[eos, src_lang_code].
        - In default mode: Prefix=[src_lang_code], suffix = [eos]
        $A$Bsinglepairspecial_tokensNr  r  r   r   eos_token_idr   convert_ids_to_tokensr   TemplateProcessinglistzipr	  post_processor)r  r   prefix_tokens_strsuffix_tokens_strr   r   r  r  
  s   

z)NllbTokenizer.set_src_lang_special_tokenslangc                 C   r;  )zReset the special tokens to the target lang setting.
        - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
        - In default mode: Prefix=[tgt_lang_code], suffix = [eos]
        r<  r=  r>  NrB  )r  rK  rI  rJ  r   r   r  r:  !  s   

z)NllbTokenizer.set_tgt_lang_special_tokens)NNr   r   r   r   r   r   r   NNNNF)r>   NrH   NNr-  NT)r   N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr   rF  int__annotations__r   r  dictr  propertyr   setterr,  boolr   r9  r7  r6  r  r:  __classcell__r   r   r  r  r   !   s   
 0^

	

6
r   N)
tokenizersr   r   r   r   r   r   tokenizers.modelsr   tokenization_pythonr
   r   tokenization_utils_tokenizersr   utilsr   
get_loggerrL  loggerrP  r  r   __all__r   r   r   r  <module>   s    

  
