o
    iA                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZddlmZ ddlmZ ddlmZ eeZd	d
ddddZdZeddG dd deZdedeeef dejfddZdeddfddZdede
eef fddZdgZdS )    N)Path)copyfile)AnyOptionalUnion   )PreTrainedTokenizer)logging)requiresz
source.spmz
target.spmz
vocab.jsonztarget_vocab.jsonztokenizer_config.json)
source_spm
target_spmvocabtarget_vocab_filetokenizer_config_fileu   ▁)sentencepiece)backendsc                
       s  e Zd ZdZeZddgZ										dDd
eee	e
f  ddf fddZdd Zde	de	fddZdd Zde	fddZde	dee	 fddZdede	fddZ fddZ fdd Zd!ee	 de	fd"d#ZdEdee fd$d%Zd&d' Zd(d) Zedefd*d+ZdEd,e	d-ee	 dee	 fd.d/Zdefd0d1Zd2d3 Zd4d5 Z defd6d7Z!d8eddfd9d:Z"d;d< Z#d=d> Z$		dFd?ed@ee dAe%dee fdBdCZ&  Z'S )GMarianTokenizeraB  
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`list[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
    ```	input_idsattention_maskN<unk></s><pad>   Fsp_model_kwargsreturnc                    s.  |d u ri n|| _ t| sJ d| || _t|| _t|| jvr)tdt|	| jv s2J |rHt|| _dd | j	 D | _
g | _ndd | j	 D | _
dd | jD | _|| _|| _||g| _t|| j | _t|| j | _| j| _| j| _|   t jd	|||||	|
| j ||d	| d S )
Nzcannot find spm source z <unk> token must be in the vocabc                 S      i | ]\}}||qS  r   .0kvr   r   k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py
<dictcomp>       z,MarianTokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   r   r   r   r   r!   r"      r#   c                 S   s$   g | ]}| d r|dr|qS )>><<)
startswithendswith)r   r   r   r   r!   
<listcomp>   s   $ z,MarianTokenizer.__init__.<locals>.<listcomp>)	source_langtarget_lang	unk_token	eos_token	pad_tokenmodel_max_lengthr   r   separate_vocabsr   )r   r   existsr/   	load_jsonencoderstrKeyErrortarget_encoderitemsdecodersupported_language_codesr)   r*   	spm_filesload_spm
spm_source
spm_targetcurrent_spmcurrent_encoder_setup_normalizersuper__init__)selfr   r   r   r   r)   r*   r+   r,   r-   r.   r   r/   kwargs	__class__r   r!   rA   k   sD   



zMarianTokenizer.__init__c              	   C   sN   zddl m} || jj| _W d S  ttfy&   td dd | _Y d S w )Nr   )MosesPunctNormalizerz$Recommended: pip install sacremoses.c                 S   s   | S Nr   )xr   r   r!   <lambda>   s    z3MarianTokenizer._setup_normalizer.<locals>.<lambda>)	
sacremosesrF   r)   	normalizepunc_normalizerImportErrorFileNotFoundErrorwarningswarn)rB   rF   r   r   r!   r?      s   
z!MarianTokenizer._setup_normalizerrH   c                 C   s   |r|  |S dS )zHCover moses empty string edge case. They return empty list for '' input! )rL   )rB   rH   r   r   r!   rK      s   zMarianTokenizer.normalizec                 C   s   | j || j | j S rG   )r>   getr+   )rB   tokenr   r   r!   _convert_token_to_id   s   z$MarianTokenizer._convert_token_to_idtextc                 C   sN   g }| dr#|d }dkr#||d|d   ||d d }||fS )z6Remove language codes like >>fr<< before sentencepiecer$   r%   N   )r&   findappend)rB   rU   codeend_locr   r   r!   remove_language_code   s
   z$MarianTokenizer.remove_language_codec                 C   s&   |  |\}}| jj|td}|| S )N)out_type)r\   r=   encoder3   )rB   rU   rZ   piecesr   r   r!   	_tokenize   s   zMarianTokenizer._tokenizeindexc                 C   s   | j || jS )z?Converts an index (integer) in a token (str) using the decoder.)r7   rR   r+   )rB   ra   r   r   r!   _convert_id_to_token   s   z$MarianTokenizer._convert_id_to_tokenc                       t  j|fi |S )ao  
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `list[str]`: The list of decoded sentences.
        )r@   batch_decode)rB   	sequencesrC   rD   r   r!   rd      s   zMarianTokenizer.batch_decodec                    rc   )a  
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        )r@   decode)rB   	token_idsrC   rD   r   r!   rf      s   zMarianTokenizer.decodetokensc                 C   sv   | j r| jn| j}g }d}|D ]}|| jv r$|||| d 7 }g }q|| q|||7 }|td}| S )zQUses source spm if _decode_use_source_tokenizer is True, and target spm otherwiserQ    )	_decode_use_source_tokenizerr;   r<   all_special_tokensdecode_piecesrY   replaceSPIECE_UNDERLINEstrip)rB   rh   sp_modelcurrent_sub_tokens
out_stringrS   r   r   r!   convert_tokens_to_string   s   
z(MarianTokenizer.convert_tokens_to_stringc                 C   s$   |du r
|| j g S || | j g S )z=Build model inputs from a sequence by appending eos_token_id.N)eos_token_id)rB   token_ids_0token_ids_1r   r   r!    build_inputs_with_special_tokens  s   z0MarianTokenizer.build_inputs_with_special_tokensc                 C   s   | j | _| j| _d S rG   )r;   r=   r2   r>   rB   r   r   r!   _switch_to_input_mode  s   z%MarianTokenizer._switch_to_input_modec                 C   s   | j | _| jr| j| _d S d S rG   )r<   r=   r/   r5   r>   rx   r   r   r!   _switch_to_target_mode  s   z&MarianTokenizer._switch_to_target_modec                 C   s
   t | jS rG   )lenr2   rx   r   r   r!   
vocab_size  s   
zMarianTokenizer.vocab_sizesave_directoryfilename_prefixc              	   C   s  t j|std| d d S g }| jrOt j||r |d ndtd  }t j||r1|d ndtd  }t| j	| t| j
| || || nt j||rY|d ndtd  }t| j	| || ttd td g| j| j| jgD ]Z\}}}	t j||r|d nd| }
t j|t j|
krt j|rt||
 ||
 q|t j|st|
d	}|	 }|| W d    n1 sw   Y  ||
 q|t|S )
NzVocabulary path (z) should be a directory-rQ   r   r   r   r   wb)ospathisdirloggererrorr/   joinVOCAB_FILES_NAMES	save_jsonr2   r5   rY   zipr9   r;   r<   abspathisfiler   openserialized_model_protowritetuple)rB   r}   r~   saved_filesout_src_vocab_fileout_tgt_vocab_fileout_vocab_filespm_save_filenamespm_orig_path	spm_modelspm_save_pathficontent_spiece_modelr   r   r!   save_vocabulary  sR   


$

zMarianTokenizer.save_vocabularyc                 C   s   |   S rG   )get_src_vocabrx   r   r   r!   	get_vocabL  s   zMarianTokenizer.get_vocabc                 C      t | jfi | jS rG   )dictr2   added_tokens_encoderrx   r   r   r!   r   O     zMarianTokenizer.get_src_vocabc                 C   r   rG   )r   r5   added_tokens_decoderrx   r   r   r!   get_tgt_vocabR  r   zMarianTokenizer.get_tgt_vocabc                 C   s"   | j  }|tg d |S )N)r;   r<   r=   rL   r   )__dict__copyupdater   fromkeys)rB   stater   r   r!   __getstate__U  s
   
zMarianTokenizer.__getstate__dc                    sF   | _ t dsi  _ fdd jD \ _ _ j _   d S )Nr   c                 3   s    | ]	}t | jV  qd S rG   )r:   r   )r   frx   r   r!   	<genexpr>c  s    z/MarianTokenizer.__setstate__.<locals>.<genexpr>)r   hasattrr   r9   r;   r<   r=   r?   )rB   r   r   rx   r!   __setstate__\  s   
zMarianTokenizer.__setstate__c                 O   s   dS )zJust EOS   r   )rB   argsrC   r   r   r!   num_special_tokens_to_addg  s   z)MarianTokenizer.num_special_tokens_to_addc                    s(   t | j  | j  fdd|D S )Nc                    s   g | ]
}| v r
d ndqS )r   r   r   )r   rH   all_special_idsr   r!   r(   n  s    z7MarianTokenizer._special_token_mask.<locals>.<listcomp>)setr   removeunk_token_id)rB   seqr   r   r!   _special_token_maskk  s   
z#MarianTokenizer._special_token_maskru   rv   already_has_special_tokensc                 C   s:   |r|  |S |du r|  |dg S |  || dg S )zCGet list where entries are [1] if a token is [eos] or [pad] else 0.Nr   )r   )rB   ru   rv   r   r   r   r!   get_special_tokens_maskp  s
   
z'MarianTokenizer.get_special_tokens_mask)	NNNr   r   r   r   NFrG   )NF)(__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesr   r   r3   r   rA   r?   rK   rT   r\   listr`   intrb   rd   rf   rs   rw   ry   rz   propertyr|   r   r   r   r   r   r   r   r   r   boolr   __classcell__r   r   rD   r!   r   ,   sd    :>	 -r   r   r   r   c                 C   s   t jdi |}||  |S )Nr   )r   SentencePieceProcessorLoad)r   r   spmr   r   r!   r:   |  s   
r:   c                 C   s@   t |d}tj| |dd W d    d S 1 sw   Y  d S )NwrW   )indent)r   jsondump)datar   r   r   r   r!   r     s   "r   c                 C   s8   t | d}t|W  d    S 1 sw   Y  d S )Nr)r   r   load)r   r   r   r   r!   r1     s   $r1   ) r   r   rO   pathlibr   shutilr   typingr   r   r   r   tokenization_utilsr   utilsr	   utils.import_utilsr
   
get_loggerr   r   r   rn   r   r3   r   r   r:   r   r   r1   __all__r   r   r   r!   <module>   s4   
	   Q
