o
    i/                     @   s   d dl Z d dlmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZ ed	d
G dd dejjZdgZdS )    N)OptionalUnion)BertTokenizer)FastBertTokenizerShrinkLongestTrimmercase_fold_utf8combine_segmentspad_model_inputs   )keras)requires   )tftensorflow_text)backendsc                       s   e Zd ZdZ										d!dededee d	ee d
ee dedededee dededef fddZ	e
d"ddZe
deeejf fddZdd Z							d#ddZdd  Z  ZS )$TFBertTokenizera  
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.

    Args:
        vocab_list (`list`):
            List containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        cls_token_id (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        sep_token_id (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token_id (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        padding (`str`, defaults to `"longest"`):
            The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
            or `"max_length", to pad all inputs to the maximum length supported by the tokenizer.
        truncation (`bool`, *optional*, defaults to `True`):
            Whether to truncate the sequence to the maximum length.
        max_length (`int`, *optional*, defaults to `512`):
            The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
            `truncation` is `True`).
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set, the sequence will be padded to a multiple of this value.
        return_token_type_ids (`bool`, *optional*, defaults to `True`):
            Whether to return token_type_ids.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention_mask.
        use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
            If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer
            class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to
            TFLite.
    NlongestT   
vocab_listdo_lower_casecls_token_idsep_token_idpad_token_idpadding
truncation
max_lengthpad_to_multiple_ofreturn_token_type_idsreturn_attention_maskuse_fast_bert_tokenizerc              	      s  t    |rt|ftj|d|| _n+tjjtjj|tj	tj
tj|tjdtjdtjddd}t|ftj|d|| _|| _|| _|d u rO|dn|| _|d u r[|d	n|| _|d u rg|d
n|| _t|d dd| _|| _|| _|| _|	| _|
| _|| _d S )N)token_out_typelower_case_nfd_strip_accents)out_type)dtype)keys	key_dtypevaluesvalue_dtyper   )num_oov_buckets)r    
lower_casez[CLS]z[SEP]z[PAD]r
   axis)super__init__r   r   int64tf_tokenizerlookupStaticVocabularyTableKeyValueTensorInitializerstringrangesizeBertTokenizerLayerr   r   indexr   r   r   r   paired_trimmerr   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   tokenizer_kwargslookup_table	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/bert/tokenization_bert_tf.pyr-   ;   sJ   

	
zTFBertTokenizer.__init__	tokenizerPreTrainedTokenizerBasec           	      K   s   | dd}|du r|jn|}| dd}|du r|jn|}| dd}|du r+|jn|}| dd}|du r:|jn|}| }t| dd d}d	d
 |D }| d|||||d|S )a  
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        r   Nr   r   r   c                 S   s   | d S )Nr   r>   )xr>   r>   r?   <lambda>   s    z0TFBertTokenizer.from_tokenizer.<locals>.<lambda>)keyc                 S   s   g | ]}|d  qS )r   r>   ).0entryr>   r>   r?   
<listcomp>   s    z2TFBertTokenizer.from_tokenizer.<locals>.<listcomp>r   r   r   r   r   r>   )popr   r   r   r   	get_vocabsorteditems)	clsr@   kwargsr   r   r   r   vocabr   r>   r>   r?   from_tokenizerk   s(   zTFBertTokenizer.from_tokenizerpretrained_model_name_or_pathc                 O   s\   zt j|g|R i |}W n   ddlm} |j|g|R i |}Y | j|fi |S )a  
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```python
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        ```
        r   )BertTokenizerFast)r   from_pretrainedtokenization_bert_fastrR   rP   )rM   rQ   init_inputsrN   r@   rR   r>   r>   r?   rS      s   zTFBertTokenizer.from_pretrainedc                 C   s&   | j rt|}| j|}|ddS )Nr   )r   r   r/   tokenize
merge_dims)r9   textstokensr>   r>   r?   unpaired_tokenize   s   z!TFBertTokenizer.unpaired_tokenizec	                 C   s  |d u r| j }|dvrtd|d ur|d urtd|d u r"| j}|d u r)| j}|d u r0| j}|d u r7| j}|d u r>| j}t|tj	sIt
|}|d urXt|tj	sXt
|}|d urp|jjdkrftd|jjdkrptd|jjdkr|d d df |d d df }}| |}|d u r|r|d d d |d f }t|f| j| jd	\}	}
n| |}|r| j||g\}}t||f| j| jd	\}	}
|d
kr|	jdd}|d ur|tj| |  }n|}t|	|| jd\}	}d|	i}|r||d< |r
t|
|| jd\}
}|
|d< |S )N)r   r   z1Padding must be either 'longest' or 'max_length'!zJmax_length cannot be overridden at call time when truncating paired texts!r   zJtext argument should not be multidimensional when a text pair is supplied!z)text_pair should not be multidimensional!   r   )start_of_sequence_idend_of_segment_idr   r*   )max_seq_length	pad_value	input_idsattention_masktoken_type_ids)r   
ValueErrorr   r   r   r   r   
isinstancer   Tensorconvert_to_tensorshaperankr[   r   r   r   r8   trimbounding_shapemathfloordivr	   r   )r9   text	text_pairr   r   r   r   r   r   ra   rc   
pad_lengthrb   output_r>   r>   r?   call   sp   

"



zTFBertTokenizer.callc                 C   s   | j | j| j| j| jdS )NrH   rH   )r9   r>   r>   r?   
get_config   s   zTFBertTokenizer.get_config)
NNNr   Tr   NTTT)r@   rA   )NNNNNNN)__name__
__module____qualname____doc__listboolr   intstrr-   classmethodrP   r   osPathLikerS   r[   rs   rt   __classcell__r>   r>   r<   r?   r      sf    0	
0&	
Hr   )r~   typingr   r   
tensorflowr   r   r   r6   r   r   r   r   r	   modeling_tf_utilsr   utils.import_utilsr   tokenization_bertlayersLayerr   __all__r>   r>   r>   r?   <module>   s     
v