o
    ei                     @   sd   d dl Z d dlZd dlmZmZmZmZ d dlmZ ddl	m
Z
 dddZG d	d
 d
e
Zd
gZdS )    N)	Tokenizerdecoderspre_tokenizers
processors)Unigram   )TokenizersBackendzspiece.modelztokenizer.json)
vocab_filetokenizer_filec                       s   e Zd ZdZeZddgZeZ							d fd	d
	Z	dd Z
dd Z			ddeee B dededB dedef
 fddZ  ZS )LasrTokenizera  
    Construct a LASR tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
            calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        vocab (`str`, `dict` or `list`, *optional*):
            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
    	input_idsattention_mask</s><unk><pad>d   Nc                    s  || _ |d ur6dd |D }	t|	dk r |dd t|D 7 }n!|dkr5|t|	kr5td| d| dnd	d t|D }	|	}|d urI|| _n*t|d
ft|d
ft|d
fdg| _t|d ddD ]}
| jd|
 dd
f qdtt| jddd| _	d | j	_
tt tjddddg| j	_tjdddd| j	_t jd|||||d| tjddgg dd| jfgd| j	_d S )Nc                 S   s   g | ]
}d t |v r|qS )
<extra_id_)str).0x r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/lasr/tokenization_lasr.py
<listcomp>Y   s    z*LasrTokenizer.__init__.<locals>.<listcomp>   c                 S      g | ]}d | dqS r   >r   r   ir   r   r   r   [       r   zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to LasrTokenizer. In this case the additional_special_tokens must include the extra_ids tokensc                 S   r   r   r   r   r   r   r   r   c   r   g        )   ▁g       r   r   r   F)unk_idbyte_fallbackr    alwaysT)replacementprepend_schemesplit)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokens$Ar   )r-   r   z$Br   )singlepairspecial_tokensr   )
_extra_idslenrange
ValueError_vocab_scoresr   appendr   r   
_tokenizer
normalizerr   SequenceWhitespaceSplit	Metaspacepre_tokenizerr   decodersuper__init__r   TemplateProcessingeos_token_idpost_processor)selfr(   r)   r*   r+   r,   vocabr	   kwargsextra_tokensr   	__class__r   r   r?   J   sd   


	zLasrTokenizer.__init__c                 C   s   t ttdd | jS )zQGet the list of sentinel tokens (extra_id tokens) from additional_special_tokens.c                 S   s   t td| d uS )Nz<extra_id_\d+>)boolresearch)r   r   r   r   <lambda>       z3LasrTokenizer.get_sentinel_tokens.<locals>.<lambda>)listsetfilterr,   rC   r   r   r   get_sentinel_tokens   s   z!LasrTokenizer.get_sentinel_tokensc                    s    fdd   D S )z&Get the token IDs for sentinel tokens.c                    s   g | ]}  |qS r   )convert_tokens_to_idsr   tokenrQ   r   r   r      s    z8LasrTokenizer.get_sentinel_token_ids.<locals>.<listcomp>)rR   rQ   r   rQ   r   get_sentinel_token_ids   s   z$LasrTokenizer.get_sentinel_token_idsFT	token_idsskip_special_tokensclean_up_tokenization_spacesgroup_tokensreturnc                    sT   t |tr|g}|rdd t|D } fdd|D }t jd|||d|S )Nc                 S   s   g | ]}|d  qS )r   r   )r   token_groupr   r   r   r      rM   z)LasrTokenizer._decode.<locals>.<listcomp>c                    s   g | ]	}| j kr|qS r   )pad_token_idrT   rQ   r   r   r      s    )rW   rX   rY   r   )
isinstanceint	itertoolsgroupbyr>   _decode)rC   rW   rX   rY   rZ   rE   rG   rQ   r   rb      s   
zLasrTokenizer._decode)r   r   r   r   NNN)FNT)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   modelr?   rR   rV   r_   rN   rI   r   rb   __classcell__r   r   rG   r   r   !   s:    $L
r   )r`   rJ   
tokenizersr   r   r   r   tokenizers.modelsr   tokenization_utils_tokenizersr   rg   r   __all__r   r   r   r   <module>   s   
 
