o
    i                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( e r|ddl)m*Z* ndZ*e+e,Z-ee.e/e	e. e	e. f f g dde rdndffde rdnde rdndffdde rdndffdde rdndffdde rdndffd de rd!ndffd"de rdndffd#d$e rd%nde rd&ndffd'd(de rdndffd)e r	d*nddffd+d,d-e rd.nde r d/ndffd0d1e r,d2ndffd3d4de r:d5ndffd6d7d8de rJdndffd9d:e rVd;ndffd<de rbd=ndffd>de rnd5ndffd?d@e rzdAndffdBde rdndffdCdDe rdEnde rdFndffdGdHe rdnde rdndffdIde rdndffdJd@e rdAndffdKde rdndffdLde rdndffdMdNe rdOnde rdPndffdQdRe rdSndffdTde r
d!ndffdUde rd!ndffdVde r"dndffdWdXe r.dYndffdZd[e r:d\ndffd]e rEd^nde rLd_ndffd`dade rZd5ndffdbdcddd@e rjdAndffded:e rvd;ndffdfdge rdhndffdie rdjnde rdkndffdle rdnde rdndffdme rdnde rdndffdne rdnde rdndffdoe rdnde rdndffdpdqe rdnde rdndffdrdse rdtndffdudve rdwndffdxdye rdzndffd{d:e r d;ndffd|de r,dndffd}de r8dndffd~de rDdndffde rOdnddffdde r]d:nde rdd;ndffdde rpd5ndffdde r|dndffde rdnddffddde rd;ndffdde rdndffddde rdndffde rdnde rdndffde rdnde rdndffde rdnde rdndffde rdnde rdndffde rdnde r
dndffde rdnde rdndffdde r(dndffdde r4d5ndffdde r@d5ndffdde rLd5ndffdde rXd5ndffdde rdd5ndffde rodnddffdd:e r|d;ndffdd:e rd;ndffdd:e rd;ndffdde rdndffddde rd5ndffdd:e rd;ndffddddddde rdndffdde rdndffdde rd5ndffdde rdndffddd@e rdAndffdde rdndffdde rdndffdde r&dndffdd:e r2d;ndffdd:e r>d;ndffddXe rJdYndffde rUdnde r\dndffdde rhdndffde rsdnde rzdndffdde rdnde rdndffdde rd5ndffdde rdndffdde rdndffdde rdndffdde rdndffdde rdndffdde rdndffde rdnde rdndffde rdnde rdndffde rdnde rdndffdde r$dndffdde r0dndffdde r<dndffdde rHdndffdde rTdndffde r_dnde rfdndffddde rtdndffde rdnddffdde rdndffdde rdndffde rdnddffde rdnde rdndffde rdnde rdndffdd@e rdAndffdde rdndffdde rdndffdde rd:nde r d;ndffde rdne rdnde re sdndffde r'dne r-dnde r8e s8dndffde rCdne rIdnde rTe sTdndffde r_dne rednde rpe spdndffdde r|dndffde rdnddffdde rdndffdde rd ndffdde rd5ndffdde rd5ndffdde rd5ndffdde rאdndffdde rdndffdd@e rdAndffd	e rd
nde rdndffdde rdndffdde r dndffdde r/dndffddde r?d5ndffdde rLdndffde rYdnde radndffde rndnde rvdndffde rdnde rdndffdde rdndffdde rdndffdde rd;ndffdde rdndffdde rdndffdde rdndffdd e rd!ndffd"d:e rd;ndffd#de 	r dndffd$de 	rdndffd%de 	rdndffd&d'e 	r)d1nde 	r0d2ndffd(e 	r<d1nde 	rCd2ndffd)d*e 	rRdnde 	rYdndffd+dRe 	rfdSndffd,de 	rsdndffd-de 	rdndffd.d/de 	rdndffd0de 	rdne 	rd5ndffd1e 	rd2nddffd3d4de 	rdndffd5dXe 	rdYndffd6dXe 	rdYndffd7dXe 	rdYndffd8dXe 	rdYndffd9dXe 
rdYndffd:dXe 
rdYndffd;dXe 
rdYndffd<dXe 
r)dYndffd=dXe 
r6dYndffd>dXe 
rCdYndffd?dXe 
rPdYndffd@dXe 
r]dYndffdAdBdCe 
rodDndffdEe 
r{dnde 
rdndffdFe 
rdGnde 
rdHndffdIe 
rdJnde 
rdKndffdLdMe 
rdNndffdOd@e 
rdAndffdPd@e 
rdAndffdQdRdSe 
rdTndffdUde 
rdndffdVe rdWnde r	dXndffdYe rdWnde rdXndffdZe r*dnde r1dndffd[e r>d\nddffd]e rKdnde rRdndffd^de r_d5ndffd_e rld`nddffdadbe r}dcnddffdddedfe rdgndffdhde rdndffdid:e rd;ndffdje rdnde rdndffdke rdnde rdndffdle rdnde rdndffdmdndodpde rdndffdqe rdrnde rdsndffdte rdnde r!dndffdude r.dndffdvde r;dndffdwde rHdndffdxde rUdndffdydze rddnde roe sod5ndffd{d|d}d~dde rdndffdde rdndffde rdnde rdndffdde rdnddffde rdnde rdndffde rdnde rdndffde rdnde rdndffdde rdndffde rdnde rdndffde r$dnde r+dndffde r7dnde r>dndffde rJdnde rQdndffZ0e"e$e0Z1dd e$2 D Z3de.de
e4e df fddZ5								dde
e.ej6e. f de	e
e.ej6e. f  de7de	e7 de	e8e.e.f  de	e
e7e.f  de	e. de7de.de8e.ef fddZ9G dd dZ:ddgZ;dS (  zAuto Tokenizer class.    N)OrderedDict)AnyOptionalUnion)is_mistral_common_available   )PretrainedConfig)get_class_from_dynamic_moduleresolve_trust_remote_code)load_gguf_checkpoint)PreTrainedTokenizer)TOKENIZER_CONFIG_FILE)cached_fileextract_commit_hashis_g2p_en_availableis_sentencepiece_availableis_tokenizers_availablelogging   )EncoderDecoderConfig   )_LazyAutoMapping)CONFIG_MAPPING_NAMES
AutoConfigconfig_class_to_model_typemodel_type_to_module_name!replace_list_option_in_docstrings)PreTrainedTokenizerFastaimv2CLIPTokenizerCLIPTokenizerFastalbertAlbertTokenizerAlbertTokenizerFastalignBertTokenizerBertTokenizerFastarceeLlamaTokenizerLlamaTokenizerFastaria
aya_visionCohereTokenizerFastbark)bart)BartTokenizerBartTokenizerFastbarthezBarthezTokenizerBarthezTokenizerFast)bartpho)BartphoTokenizerNbertzbert-generationBertGenerationTokenizer)zbert-japanese)BertJapaneseTokenizerN)bertweet)BertweetTokenizerNbig_birdBigBirdTokenizerBigBirdTokenizerFastbigbird_pegasusPegasusTokenizerPegasusTokenizerFast)biogpt)BioGptTokenizerNbitnetr   )
blenderbot)BlenderbotTokenizerBlenderbotTokenizerFast)zblenderbot-small)BlenderbotSmallTokenizerNblipzblip-2GPT2TokenizerGPT2TokenizerFastbloomBloomTokenizerFastbltbridgetowerRobertaTokenizerRobertaTokenizerFastbros)byt5)ByT5TokenizerN	camembertCamembertTokenizerCamembertTokenizerFast)canine)CanineTokenizerN	chameleonchinese_clipclapclipclipseg)clvp)ClvpTokenizerN
code_llamaCodeLlamaTokenizerCodeLlamaTokenizerFastcodegenCodeGenTokenizerCodeGenTokenizerFastcoherecohere2colpalicolqwen2Qwen2TokenizerQwen2TokenizerFastconvbertConvBertTokenizerConvBertTokenizerFastcpmCpmTokenizerCpmTokenizerFast)cpmant)CpmAntTokenizerNcsm)ctrl)CTRLTokenizerN)zdata2vec-audioWav2Vec2CTCTokenizerNzdata2vec-textdbrxdebertaDebertaTokenizerDebertaTokenizerFastz
deberta-v2DebertaV2TokenizerDebertaV2TokenizerFastdeepseek_v2deepseek_v3deepseek_vldeepseek_vl_hybrid)dia)DiaTokenizerN	diffllama
distilbertDistilBertTokenizerDistilBertTokenizerFastdprDPRQuestionEncoderTokenizerDPRQuestionEncoderTokenizerFastelectraElectraTokenizerElectraTokenizerFastemu3ernieernie4_5ernie4_5_moeernie_mErnieMTokenizer)esm)EsmTokenizerNexaone4falconfalcon_mambaGPTNeoXTokenizerFastfastspeech2_conformerFastSpeech2ConformerTokenizer)flaubert)FlaubertTokenizerN	flex_olmofnetFNetTokenizerFNetTokenizerFast)fsmt)FSMTTokenizerNfunnelFunnelTokenizerFunnelTokenizerFastgemmaGemmaTokenizerGemmaTokenizerFastgemma2gemma3gemma3_textgemma3ngemma3n_textgitglmglm4glm4_moeglm4v	glm4v_moezgpt-sw3GPTSw3Tokenizergpt2gpt_bigcodegpt_neogpt_neox)gpt_neox_japanese)GPTNeoXJapaneseTokenizerNgpt_ossgptj)zgptsan-japanese)GPTSanJapaneseTokenizerN)graniterI   N)
granitemoer   )granitemoehybridr   )granitemoesharedr   zgrounding-dinogroupvitheliumherbertHerbertTokenizerHerbertTokenizerFast)hubertrw   ibertideficsidefics2idefics3instructblipinstructblipvideointernvljambajanusjetmoe)jukebox)JukeboxTokenizerNzkosmos-2XLMRobertaTokenizerXLMRobertaTokenizerFastz
kosmos-2.5layoutlmLayoutLMTokenizerLayoutLMTokenizerFast
layoutlmv2LayoutLMv2TokenizerLayoutLMv2TokenizerFast
layoutlmv3LayoutLMv3TokenizerLayoutLMv3TokenizerFast	layoutxlmLayoutXLMTokenizerLayoutXLMTokenizerFastledLEDTokenizerLEDTokenizerFastliltllamallama4llama4_textllava
llava_nextllava_next_videollava_onevision
longformerLongformerTokenizerLongformerTokenizerFastlongt5T5TokenizerT5TokenizerFast)luke)LukeTokenizerNlxmertLxmertTokenizerLxmertTokenizerFastm2m_100M2M100Tokenizermambamamba2marianMarianTokenizermbartMBartTokenizerMBartTokenizerFastmbart50MBart50TokenizerMBart50TokenizerFastmegazmegatron-bert
metaclip_2)zmgp-str)MgpstrTokenizerNminimax	ministralMistralCommonTokenizermistralmistral3mixtralmllamamlukeMLukeTokenizerzmm-grounding-dino
mobilebertMobileBertTokenizerMobileBertTokenizerFast
modernbert	moonshinemoshimpnetMPNetTokenizerMPNetTokenizerFastmptmramt5MT5TokenizerMT5TokenizerFastmusicgenmusicgen_melodymvpMvpTokenizerMvpTokenizerFast)myt5)MyT5TokenizerNnemotronnezhanllbNllbTokenizerNllbTokenizerFastznllb-moenystromformerolmoolmo2olmo3olmoezomdet-turbo	oneformerz
openai-gptOpenAIGPTTokenizerOpenAIGPTTokenizerFastoptowlv2owlvit	paligemma)parakeet)ParakeetCTCTokenizerNpegasus	pegasus_x)	perceiver)PerceiverTokenizerN	persimmonphiphi3phimoe)phobert)PhobertTokenizerN
pix2structpixtralplbartPLBartTokenizer)
prophetnet)ProphetNetTokenizerNqdqbertqwen2qwen2_5_omni
qwen2_5_vlqwen2_audio	qwen2_moeqwen2_vlqwen3	qwen3_moe
qwen3_nextqwen3_omni_moeqwen3_vlqwen3_vl_moe)rag)RagTokenizerNrealmRealmTokenizerRealmTokenizerFastrecurrent_gemmareformerReformerTokenizerReformerTokenizerFastrembertRemBertTokenizerRemBertTokenizerFast	retribertRetriBertTokenizerRetriBertTokenizerFastrobertazroberta-prelayernorm)roc_bert)RoCBertTokenizerNroformerRoFormerTokenizerRoFormerTokenizerFastrwkvseamless_m4tSeamlessM4TTokenizerSeamlessM4TTokenizerFastseamless_m4t_v2shieldgemma2siglipSiglipTokenizersiglip2smollm3speech_to_textSpeech2TextTokenizer)speech_to_text_2)Speech2Text2TokenizerNspeecht5SpeechT5Tokenizer)splinter)SplinterTokenizerSplinterTokenizerFastsqueezebertSqueezeBertTokenizerSqueezeBertTokenizerFaststablelm
starcoder2switch_transformerst5t5gemma)tapas)TapasTokenizerN)tapex)TapexTokenizerN)z
transfo-xl)TransfoXLTokenizerNtvpudopUdopTokenizerUdopTokenizerFastumt5video_llavaviltvipllavavisual_bert)vits)VitsTokenizerNvoxtral)wav2vec2rw   )zwav2vec2-bertrw   )zwav2vec2-conformerrw   )wav2vec2_phoneme)Wav2Vec2PhonemeCTCTokenizerNwhisperWhisperTokenizerWhisperTokenizerFastxclipxglmXGLMTokenizerXGLMTokenizerFast)xlm)XLMTokenizerNzxlm-prophetnetXLMProphetNetTokenizerzxlm-robertazxlm-roberta-xlxlnetXLNetTokenizerXLNetTokenizerFastxlstmxmodyosozambazamba2c                 C   s   i | ]\}}||qS  r  ).0kvr  r  g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py
<dictcomp>$  s    r  
class_namereturnc              	   C   s   | dkrt S t D ]6\}}| |v r@t|}|dv r%| dkr%tdd}n	td| d}zt|| W   S  ty?   Y q
w q
tj	
 D ]}|D ]}t|dd | krZ|    S qJqFtd}t|| rkt|| S d S )	Nr   )r  r  r  r  z.tokenization_mistral_commontransformers.ztransformers.models__name__)r   TOKENIZER_MAPPING_NAMESitemsr   	importlibimport_modulegetattrAttributeErrorTOKENIZER_MAPPING_extra_contentvalueshasattr)r  module_name
tokenizersmodule	tokenizermain_moduler  r  r  tokenizer_class_from_name'  s0   


r  F pretrained_model_name_or_path	cache_dirforce_downloadresume_downloadproxiestokenrevisionlocal_files_only	subfolderc	                 K   s   |	 dd}
|
durtdt |durtd|
}|	d}t| t||||||||ddd|d}|du r<t	d i S t
||}t|d	d
}t|}W d   n1 sWw   Y  ||d< |S )a  
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download:
            Deprecated and ignored. All downloads are now resumed by default when possible.
            Will be removed in v5 of Transformers.
        proxies (`dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `hf auth login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```use_auth_tokenNrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.V`token` and `use_auth_token` are both specified. Please set only the argument `token`._commit_hashF)r  r  r  r  r  r  r  r   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errorsr  z\Could not locate the tokenizer configuration file, will try to use the model config instead.zutf-8)encoding)popwarningswarnFutureWarning
ValueErrorgetr   r   loggerinfor   openjsonload)r  r  r  r  r  r  r  r  r  kwargsr  commit_hashresolved_config_filereaderresultr  r  r  get_tokenizer_configE  sF   I


r  c                   @   s:   e Zd ZdZdd Zeeedd Ze	d
dd	Z
dS )AutoTokenizera  
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    c                 C   s   t d)Nz}AutoTokenizer is designed to be instantiated using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.)OSError)selfr  r  r  __init__  s   zAutoTokenizer.__init__c                 O   sn  | dd}|durtdt |ddurtd||d< | dd}d|d< | d	d}| d
d}| dd}|d}	|durd}
t|d}|du rctd| dddd tD  d|\}}|rw|durrt|}
nt	
d |
du rt|}
|
du rtd| d|
j|g|R i |S t|fi |}d|v r|d |d< |d}d}d|v rt|d ttfr|d }n|d dd}|du rt|ts|	rt||	fi |}t|ddd }tjd)i |}ntj|fd|i|}|j}t|drd|jv r|jd }|du}t|tv p-|duo-t|dup-t|d du}|r[|r@|d dur@|d }n|d }d|v rQ|dd }nd}t|||||}|r|rt||fi |}
| d d}|
  |
j|g|R d|i|S |durd}
|r|ds| d}t|}
|
du r|}t|}
|
du rtd| d!|
j|g|R i |S t|trt|j t|j!urt	
d"|j!j" d#|j j" d$ |j!}t#t|j$}|dur#tt| \}}|r|s|du r|j|g|R i |S |dur|j|g|R i |S td%td&|j" d'dd(d tD  d)*a]  
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*)
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the model weights and configuration files and override the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
        ```r  Nr  r  r  configT
_from_autouse_fasttokenizer_typetrust_remote_code	gguf_filezPassed `tokenizer_type` z3 does not exist. `tokenizer_type` should be one of z, c                 s   s    | ]}|V  qd S Nr  r  cr  r  r  	<genexpr>+  s    z0AutoTokenizer.from_pretrained.<locals>.<genexpr>r  zt`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.zTokenizer class z is not currently imported.r  tokenizer_classauto_mapr  F)return_tensorsFastr   r   z--code_revisionz- does not exist or is not currently imported.z The encoder model config class: z3 is different from the decoder model config class: z. It is not recommended to use the `AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder specific tokenizer classes.zzThis tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.z!Unrecognized configuration class z8 to build an AutoTokenizer.
Model type should be one of c                 s   s    | ]}|j V  qd S r  )r  r  r  r  r  r    s    r  )%r  r  r  r  r  r  r  joinr  r  warningfrom_pretrainedr  
isinstancetuplelistr   r   r   r   	for_modelr  r  r  typer  splitr
   r	   register_for_auto_classendswithr   decoderencoder	__class__r   r  )clsr  inputsr  r  r  r  r  r  r  r  tokenizer_class_tupletokenizer_class_nametokenizer_fast_class_nametokenizer_configconfig_tokenizer_classtokenizer_auto_map	gguf_pathconfig_dicthas_remote_codehas_local_code	class_refupstream_repo_tokenizer_class_candidate
model_typetokenizer_class_pytokenizer_class_fastr  r  r  r    s   M


















zAutoTokenizer.from_pretrainedNFc                 C   s   |du r|du rt d|durt|trt d|dur&t|tr&t d|durD|durDt|trD|j|krDt d|j d| d| tjv r[t|  \}}|du rU|}|du r[|}tj| ||f|d dS )	a  
        Register a new tokenizer in this mapping.


        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        NzKYou need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_classz:You passed a fast tokenizer in the `slow_tokenizer_class`.z:You passed a slow tokenizer in the `fast_tokenizer_class`.zThe fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not consistent with the slow tokenizer class you passed (fast tokenizer has z and you passed z!. Fix one of those so they match!)exist_ok)r  
issubclassr   r   slow_tokenizer_classr  r  register)config_classr"  fast_tokenizer_classr   existing_slowexisting_fastr  r  r  r#    s2   

zAutoTokenizer.register)NNF)r  
__module____qualname____doc__r  classmethodr   r  r  staticmethodr#  r  r  r  r  r    s     cr  r  )NFNNNNFr  )<r*  r  r  osr  collectionsr   typingr   r   r   transformers.utils.import_utilsr   configuration_utilsr   dynamic_module_utilsr	   r
   modeling_gguf_pytorch_utilsr   tokenization_utilsr   tokenization_utils_baser   utilsr   r   r   r   r   r   encoder_decoderr   auto_factoryr   configuration_autor   r   r   r   r   tokenization_utils_fastr   
get_loggerr  r  strr  r  r  r  CONFIG_TO_TYPEr  r  PathLikebooldictr  r  __all__r  r  r  r  <module>   s>   	
	 "()*+,-./01235;=CELSY[abcdefhnopqrstv}           !  #  )  *  +  ,  -  .  /  1  7  8  :  =  >  ?  @  A  C  J  Q  X  _  f  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~                                     	    
                            #    $    %    &    '    (    )    +    2    9    ?    @    A    B    C    E    K    L    M    N    O    P    R    Y    _    `    b    h    j    q    z                                                                                    &      '      (      )      *      +      -      4      ;      A      B      C      D      F      I      K      N      O      P      Q      R      T      [      a      i      o      p      q      r      s      u      }      ~                             	        
                                        "        (        )        *        +        ,        .        5        <        B        C        E        H        I        J        L        S        Z        `        b        h        i        j        k        l        n        q        r        t        {                   	          
                                                                                                    !          '          (          )          *          +          ,          .          4          5          7          >          E          K          M          T          [          b          
l& 	
o  !