o
    ei7@                     @   s   d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ e	eZdZd	d
dZg dg ddZddddddddZeddG dd deZdgZdS )    )Any   )BatchEncoding)
AddedToken)SentencePieceBackend)logging)requiresu   ▁zsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)__java__
__python__	__en_XX__)r   r   r   __javascript____php____ruby____go__)basemultir   r   r   r   r   r   r   )javapythonen_XX
javascriptphprubygo)sentencepiece)backendsc                       s|  e Zd ZU dZeZddgZg Zee	 e
d< g Zee	 e
d< 								
						d:deeef dB f fddZedd Zdd ZedefddZejdeddfddZdededB dedB fddZdd  Zd!d" Z	#		$d;d%ee ded&ee dB dedef
 fd'd(Zd)d* Zd+d, Zd<d-d.Zd/eddfd0d1Zd/edefd2d3Zd4edefd5d6Zd= fd8d9	Z   Z!S )>PLBartTokenizera  
    Construct an PLBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        src_lang (`str`, *optional*):
            A string representing the source language.
        tgt_lang (`str`, *optional*):
            A string representing the target language.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The start of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The cls token, which is a special token used as the first token for all tasks.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token(`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masking tasks. This
            is only used in the `"base"` tokenizer type. For `"multi"` tokenizer, masking is never done for the
            downstream tasks.
        language_codes (`str`, *optional*, defaults to `"base"`):
            What language codes to use. Should be one of `"base"` or `"multi"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import PLBartTokenizer

    >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
    >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
    >>> expected_translation_english = "Returns the maximum value of a b c."
    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
    ```	input_idsattention_maskprefix_tokenssuffix_tokens<s></s><unk><pad><mask>r   NTsp_model_kwargsc                    s  t |trt|dddn|}|d u ri n|_|
}
|}|	_tj }|_i _i _	ddddd_
d	d
 j
 D _d_t| |d ur\  fdd|D  t jd'i d|d|d|d|d|d|d|d|d|
d|d djd|d|	dddd| tj_fdd
t|D _d d
 j D _	ddddd_
jd!krtjtj j j
d"< j
j d#d
 j
 D _h d$}|tj  d}|D ]}j|d }|d ur	j|d  d}q|r    d}j D ]\}}|jv r(qt|ddddd%j|< d}q|rC    jd!kr]|
_jd urXjj nj_n|
d urd|
nd&_jj _|_j d S )(NTF)lstriprstripr         r   )r"   r%   r#   r$   c                 S      i | ]\}}||qS  r-   .0kvr-   r-   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/plbart/tokenization_plbart.py
<dictcomp>       z,PLBartTokenizer.__init__.<locals>.<dictcomp>c                    s   g | ]}| vr|qS r-   r-   )r/   t)_additional_special_tokensr-   r2   
<listcomp>   s    z,PLBartTokenizer.__init__.<locals>.<listcomp>r	   	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokensr'   clean_up_tokenization_spaceslanguage_codesspecial_tokens_patternprefix_suffixtoken_type_ids_pattern	all_zerosc                    s"   i | ]\}}| j |  j qS r-   )sp_model_sizefairseq_offset)r/   icodeselfr-   r2   r3      s    c                 S   r,   r-   r-   r.   r-   r-   r2   r3      r4   r   r&   c                 S   r,   r-   r-   r.   r-   r-   r2   r3      r4   >   r"   r#   r%   r$   r&   )special
normalizedr(   r)   r   r-   ) 
isinstancestrr   r'   !_convert_lang_code_special_formatrC   FAIRSEQ_LANGUAGE_CODESr	   lang_code_to_idid_to_lang_codefairseq_tokens_to_idsitemsfairseq_ids_to_tokensrI   listextendsuper__init__lensp_modelrH   	enumerateupdate_added_tokens_encoderpop_added_tokens_decoder_update_trie_update_total_vocab_size	_src_langcur_lang_code_idr@   set_src_lang_special_tokens)rM   r	   r8   r9   r;   r<   r:   r=   r>   rC   r?   r@   r'   rA   rB   kwargsfairseq_language_codesreserved_tokensremovedtokenidxsynced	__class__)r6   rM   r2   r\   t   s   


		


 

zPLBartTokenizer.__init__c                 C   s`   t t| di }t| dd}t| drt | jnd}t| dddkr*|| | d S || | S )NrT   rI   r*   r^   r   rC   r   )r]   getattrhasattrr^   )rM   lang_code_countrI   
base_vocabr-   r-   r2   
vocab_size   s   zPLBartTokenizer.vocab_sizec                    st   | j   t| j D ]}| j|}|dkr| jn|| j }| vr(| |< q  fdd| j	
 D   S )z,Override to use fairseq vocabulary structurer   c                    s   i | ]\}}| vr||qS r-   r-   )r/   rm   rn   vocabr-   r2   r3      s    z-PLBartTokenizer.get_vocab.<locals>.<dictcomp>)rV   copyranger^   get_piece_size	IdToPieceunk_token_idrI   r`   ra   rW   )rM   rJ   sp_tokenvocab_idr-   rw   r2   	get_vocab   s   
zPLBartTokenizer.get_vocabreturnc                 C   s   | j S N)rf   rL   r-   r-   r2   r?      s   zPLBartTokenizer.src_langnew_src_langc                 C   s    |  |}|| _| | j d S r   )rR   rf   rh   )rM   r   r-   r-   r2   r?     s   
return_tensorsr?   r@   c                 K   s^   |du s|du rt d| || _| || _| |fd|d|}| | j}||d< |S )zIUsed by translation pipeline, to prepare inputs for the generate functionNzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr   forced_bos_token_id)
ValueErrorrR   r?   r@   convert_tokens_to_ids)rM   
raw_inputsr   r?   r@   extra_kwargsinputstgt_lang_idr-   r-   r2   _build_translation_inputs	  s   z)PLBartTokenizer._build_translation_inputsc                 C   s4   || j v r
| j | S | j|}|r|| j S | jS )z0Converts a token (str) in an id using the vocab.)rV   r^   	PieceToIdrI   r}   )rM   rm   spm_idr-   r-   r2   _convert_token_to_id  s   

z$PLBartTokenizer._convert_token_to_idc                 C   s&   || j v r
| j | S | j|| j S )z=Converts an index (integer) in a token (str) using the vocab.)rX   r^   r|   rI   )rM   indexr-   r-   r2   _convert_id_to_token  s   

z$PLBartTokenizer._convert_id_to_tokenr   r   	src_texts	tgt_textsc                    s.   |  || _|  || _t j||fi |S r   )rR   r?   r@   r[   prepare_seq2seq_batch)rM   r   r?   r   r@   ri   rp   r-   r2   r   %  s   z%PLBartTokenizer.prepare_seq2seq_batchc                 C      |  | jS r   )rh   r?   rL   r-   r-   r2   _switch_to_input_mode1     z%PLBartTokenizer._switch_to_input_modec                 C   r   r   )set_tgt_lang_special_tokensr@   rL   r-   r-   r2   _switch_to_target_mode4  r   z&PLBartTokenizer._switch_to_target_modec                 C   R   |  |}|dur| j| nd| _g | _| jdur"| j| jg| _dS | jg| _dS )z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].NrR   rT   cur_lang_coder    eos_token_idr!   )rM   r?   r-   r-   r2   rh   7  s   

z+PLBartTokenizer.set_src_lang_special_tokenslangc                 C   r   )zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].Nr   rM   r   r-   r-   r2   r   A  s   

z+PLBartTokenizer.set_tgt_lang_special_tokensc                 C   s   t ||}|S )z;Convert Language Codes to format tokenizer uses if required)FAIRSEQ_LANGUAGE_CODES_MAPgetr   r-   r-   r2   rR   L  s   z1PLBartTokenizer._convert_lang_code_special_format
out_stringc                 C   sX   | dd dd dd dd d	d
 dd dd dd dd dd}|S )a  
        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.

        Args:
            out_string (`str`): The text to clean up.

        Returns:
            `str`: The cleaned-up string.
        z ..z ??z !!z ,,z ' 'z n'tzn'tz 'mz'mz 'sz'sz 'vez'vez 'rez're)replace)rM   r   r-   r-   r2   clean_up_tokenizationQ  s   
z%PLBartTokenizer.clean_up_tokenizationFc                    s   t  jd||| jd|S )zOOverride to use self.clean_up_tokenization_spaces as default for batched input.)	token_idsskip_special_tokensrB   Nr-   )r[   decoderB   )rM   r   r   rB   ri   rp   r-   r2   r   i  s   zPLBartTokenizer.decode)r"   r#   r#   r"   r$   r%   r&   r   NNNNT)r   Nr   )r   N)FN)"__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr    rY   int__annotations__r!   dictrQ   r   r\   propertyrv   r   r?   setterr   r   r   r   r   r   r   rh   r   rR   r   r   __classcell__r-   r-   rp   r2   r   /   sv   
 =u

		


r   N)typingr   tokenization_pythonr   tokenization_utils_baser    tokenization_utils_sentencepiecer   utilsr   utils.import_utilsr   
get_loggerr   loggerSPIECE_UNDERLINEr   rS   r   r   __all__r-   r-   r-   r2   <module>   s2   

  
E