import json
import os
from collections.abc import Mapping

from transformers import AutoTokenizer

from modelscope.metainfo import Models
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModeKeys
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['NLPTokenizer']


class NLPTokenizer:

    def __init__(self,
                 model_dir: str = None,
                 model_type: str = None,
                 use_fast: bool = None,
                 tokenize_kwargs: dict = None):
        """The transformers tokenizer preprocessor base class.

        Any nlp preprocessor which uses the huggingface tokenizer can inherit from this class.

        Args:
            model_dir (str, `optional`): The local path containing the files used to create a preprocessor.
            model_type (str, `optional`): The model type used by `build_tokenizer` to pick a tokenizer class.
            use_fast (bool, `optional`): Use the fast version of the tokenizer.
            tokenize_kwargs (dict, `optional`): These args will be directly fed into the tokenizer.
        """
        self.model_dir = model_dir
        self.model_type = model_type
        self.tokenize_kwargs = tokenize_kwargs
        if self.tokenize_kwargs is None:
            self.tokenize_kwargs = {}
        self._use_fast = use_fast
        self._tokenizer = None
zNLPTokenizer.__init__c                 C   s   | j d u r
|  | _ | j S N)r   build_tokenizer)r   r   r   r   	tokenizer-   s   

zNLPTokenizer.tokenizerc                 C   s   | j d u rW| j d u r| jd u rd| _ n9| j d u rLtjtj| jdrLttj| jdddd}t|}|	d| _ W d    n1 sGw   Y  | j d u rSdn| j | _ | j S )NFztokenizer_config.jsonrzutf-8)encodingr
   )
r   r	   ospathisfilejoinopenjsonloadget)r   fjson_configr   r   r   r
   3   s"   

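    # Illustrative config for the fallback above (only the `use_fast` key is
    # read by this class; the other key is just an example of what such a
    # file typically contains). A tokenizer_config.json in `model_dir` might
    # look like:
    #
    #     {"tokenizer_class": "BertTokenizer", "use_fast": false}
    #
    # With no file and no constructor argument, `use_fast` resolves to False.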
zNLPTokenizer.use_fastc           
      C   s  | j }| j}|tjtjtjtjtjtjtj	fv r3ddl
m}m} | jr%|n|}|dur0||S | S |tjkrSddl
m}m} | jrE|n|}|durP||S | S |tjkrsddlm}m}	 | jre|	n|}|durp||S | S |dusyJ tj|| jdS )zBuild a tokenizer by the model type.

        NOTE: The fast tokenizers have a multi-thread problem, use it carefully.

        Returns:
            The initialized tokenizer.
        r   )BertTokenizerBertTokenizerFastN)XLMRobertaTokenizerXLMRobertaTokenizerFast)LlamaTokenizerLlamaTokenizerFast)r
   )r   r	   r   
structbertgpt3palmplugmegatron_bertplug_mentalfid_plugtransformersr"   r#   r
   from_pretrainedvecor$   r%   llamamodelscope.models.nlpr&   r'   r   )
r   r   r	   r"   r#   r   r$   r%   r&   r'   r   r   r   r   C   sD   	

zNLPTokenizer.build_tokenizerc                 K   sl   | d|dd |d< |d d u r|d dd | j D }|| || j | j||fi |S )N
max_lengthsequence_lengthc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>i   s    z)NLPTokenizer.__call__.<locals>.<dictcomp>)r   popr   itemsupdater   )r   text	text_pairkwargsr   r   r   r   __call__d   s   


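    # Illustrative __call__ usage (the checkpoint path is hypothetical):
    #
    #     preprocessor = NLPTokenizer('/path/to/bert-checkpoint')
    #     features = preprocessor('hello world', padding='max_length',
    #                             truncation=True, sequence_length=16)
    #
    # `sequence_length=16` is rewritten to the HuggingFace `max_length=16`
    # before the underlying tokenizer is invoked.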
zNLPTokenizer.__call__c                 C   s$   || j v r
| j | S | jj||S r   )r   r   init_kwargsr   )r   keydefault_valuer   r   r   get_tokenizer_kwargn   s   

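
# A minimal usage sketch, assuming a local BERT-style checkpoint at the
# hypothetical path below; the path, model type, and kwargs are illustrative
# assumptions, not values from the original module.
if __name__ == '__main__':
    nlp_tokenizer = NLPTokenizer(
        model_dir='/path/to/model_dir',
        model_type=Models.bert,
        tokenize_kwargs={'padding': 'max_length', 'truncation': True})
    # __call__ forwards everything to the lazily built HuggingFace tokenizer.
    outputs = nlp_tokenizer('hello world', sequence_length=16)
    logger.info(outputs)
    # Falls back to the tokenizer's own init kwargs when the key was never
    # passed through tokenize_kwargs.
    logger.info(nlp_tokenizer.get_tokenizer_kwarg('do_lower_case', False))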