o
    eiC_                  	   @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZ e	e
ZdddZd	d
 ZG dd deZ	 dZdZededdddddf	Zedde ejejB ejB ZedZeeejejB ejB ZedZd+ddZd,d d!ZG d"d# d#Zd$d% Zd&d' Zd-d)d*Z dgZ!dS ).z!Tokenization classes for BERTweet    N   )PreTrainedTokenizer)logging	vocab.txt	bpe.codes)
vocab_filemerges_filec                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchar r   p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/bertweet/tokenization_bertweet.py	get_pairs#   s   r   c                	       s   e Zd ZdZeZ								d% fdd		Zed
d Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zd&dededB deed f fd!d"Zd#d$ Z  ZS )'BertweetTokenizera	  
    Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalization (`bool`, *optional*, defaults to `False`):
            Whether or not to apply a normalization preprocess.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    F<s></s><unk><pad><mask>c                    sZ  zddl m} || _W n ty   td d | _Y nw || _|| _i | _d| jt	|< d| jt	|	< d| jt	|< d| jt	|< | 
| dd | j D | _t|d	d
}| dd d }W d    n1 snw   Y  dd |D }tt|tt|| _i | _|| _t | _ddd| _t jd|||||||	|
dddd| d S )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r	      r   c                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>       z.BertweetTokenizer.__init__.<locals>.<dictcomp>utf-8encoding
c                 S   s    g | ]}t | d d qS )Nr$   )tuplesplit)r   merger   r   r   
<listcomp>   s     z.BertweetTokenizer.__init__.<locals>.<listcomp>'z...)u   ’u   …	all_zerosTcls_double_sep)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokentoken_type_ids_pattern%token_type_ids_include_special_tokensspecial_tokens_patternr   )emojir   	demojizerImportErrorloggerwarningr   r   encoderstradd_from_fileitemsdecoderopenreadr&   dictziprangelen	bpe_rankscacher,   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr   r   r,   r-   r.   r/   r0   r1   r2   r3   kwargsr   merges_handlemerges	__class__r   r   rM   h   sT   



zBertweetTokenizer.__init__c                 C   s
   t | jS N)rF   r<   rN   r   r   r   
vocab_size   s   
zBertweetTokenizer.vocab_sizec                 C   s   t | jfi | jS rT   )rC   r<   added_tokens_encoderrU   r   r   r   	get_vocab   s   zBertweetTokenizer.get_vocabc           
         s  | j v r
 j | S t|}tt|d d |d d g }t|}|s'|S 	 t| fddd}| jvr8ny|\}}g }d}|t|k rz|||}	W n ty`   |	||d   Y n?w |	|||	  |	}|| |kr|t|d k r||d  |kr|
||  |d	7 }n|
||  |d7 }|t|k sFt|}|}t|dkrnt|}q(d
|}|d d }| j |< |S )Nr$   z</w>Tc                    s    j | tdS )Ninf)rG   getfloat)pairrU   r   r   <lambda>   s    z'BertweetTokenizer.bpe.<locals>.<lambda>keyr   r	   r   @@ )rH   r%   listr   minrG   rF   index
ValueErrorextendappendjoin)
rN   tokenr   r   bigramfirstsecondnew_wordijr   rU   r   bpe   sN   

"
,


zBertweetTokenizer.bpec                 C   sH   | j r| |}g }td|}|D ]}|t| |d q|S )zTokenize a string.z\S+\n? )r,   normalizeTweetrefindallrf   rb   rp   r&   )rN   textsplit_tokenswordsri   r   r   r   	_tokenize   s   
zBertweetTokenizer._tokenizec                    s    j D ]}|| j | }q j|}d fdd|D }|ddddddd	d
dd}|dddddddddddd}|dddddddd }d| S )!z'
        Normalize a raw Tweet
        rq   c                    s   g | ]}  |qS r   )normalizeToken)r   ri   rU   r   r   r(      r   z4BertweetTokenizer.normalizeTweet.<locals>.<listcomp>zcannot zcan not zn't z n't zn 't zca n'tzcan'tzai n'tzain'tz'm z 'm z're z 're z's z 's z'll z 'll z'd z 'd z've z 've z p . m .z  p.m.z p . m z p.m z a . m .z a.m.z a . m z a.m )rK   replacerJ   tokenizerh   r&   )rN   tweetpuncttokens	normTweetr   rU   r   rr      s.   



	z BertweetTokenizer.normalizeTweetc                 C   sj   |  }|drdS |ds|drdS t|dkr3|| jv r'| j| S | jdur1| |S |S |S )z-
        Normalize tokens in a Tweet
        @z@USERhttpwwwHTTPURLr	   N)lower
startswithrF   rK   r8   )rN   ri   lowercased_tokenr   r   r   ry     s   




z BertweetTokenizer.normalizeTokenc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r<   rZ   r1   )rN   ri   r   r   r   _convert_token_to_id  s   z&BertweetTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r@   rZ   r1   )rN   rd   r   r   r   _convert_id_to_token  s   z&BertweetTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.rq   r`    )rh   rz   strip)rN   r~   
out_stringr   r   r   convert_tokens_to_string#  s   z*BertweetTokenizer.convert_tokens_to_stringNsave_directoryfilename_prefixreturn.c                 C   sB  t j|std| d dS t| di }|r| dnd}t j|||dd }t|d	d
d'}t	| j
 dd dD ]\}}|dkrS|| d| d q@W d   n1 s^w   Y  t j|||dd }	t|	d	d
d}
|
dd t	| j dd dD  W d   ||	fS 1 sw   Y  ||	fS )zF
        Save the vocabulary and merges files to a directory.
        zVocabulary path (z) should be a directoryr   vocab_files_names-r   r   r   wr    r!   c                 S      | d S Nr	   r   kvr   r   r   r]   =      z3BertweetTokenizer.save_vocabulary.<locals>.<lambda>r^      rq   r#   Nr   r   c                 s   s"    | ]\}}d  |d V  qdS )rq   r#   N)rh   )r   
bpe_tokenstoken_indexr   r   r   	<genexpr>E  s
    
z4BertweetTokenizer.save_vocabulary.<locals>.<genexpr>c                 S   r   r   r   r   r   r   r   r]   G  r   )ospathisdirr:   errorgetattrrh   rZ   rA   sortedr<   r?   write
writelinesrG   )rN   r   r   r   prefixr   fri   token_id
merge_filewriterr   r   r   save_vocabulary.  s,   


z!BertweetTokenizer.save_vocabularyc           	   
   C   s   t |trCz!t|ddd}| | W d   W dS 1 sw   Y  W dS  ty4 } z|d}~w tyB   td| dw | }|D ]!}| }|	d}|dkr\t
d	|d| }t| j| j|< qIdS )
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr    r!   NzIncorrect encoding detected in z, please rebuild the datasetrq   r$   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer=   rA   r>   FileNotFoundErrorUnicodeError	Exception	readlinesr   rfindre   rF   r<   )	rN   r   fdfnfelineslineTmplineidxr   r   r   r   r>   L  s.   

zBertweetTokenizer.add_from_file)Fr   r   r   r   r   r   r   rT   )__name__
__module____qualname____doc__VOCAB_FILES_NAMESr   rM   propertyrV   rX   rp   rx   rr   ry   r   r   r   r=   r%   r   r>   __classcell__r   r   rR   r   r   3   s0    2@
,"$r   ac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);strictc                 C   s&   |d u rd}t | tr| ||S | S )Nr    )r   bytesdecode)ru   r"   errorsr   r   r   _str_to_unicode  s
   
r   r   Tr    c                    s     fdd}t |t| |S )u  
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".
    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```c              	      s   |  d}|  dr=z'|  drt|d}nt|d}d|  kr%dkr0n n	t|fdW S W n ty<   d }Y nw | v rF|  d	S tjj|}|d urbzt	|W S  tt
fya   Y nw rfd
S |  d	S )Nr   r	   r      
         cp1252r   r   )groupintr   r   re   htmlentitiesname2codepointrZ   chrOverflowError)matchentity_bodynumberkeepremove_illegalr   r   _convert_entity;  s,   





z/_replace_html_entities.<locals>._convert_entity)ENT_REsubr   )ru   r   r   r"   r   r   r   r   _replace_html_entities  s   r   c                   @   s"   e Zd ZdZd	ddZdd ZdS )
rI   a  
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using *strip_handles* and *reduce_len parameters*:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```TFc                 C   s   || _ || _|| _d S rT   preserve_case
reduce_lenstrip_handles)rN   r   r   r   r   r   r   rM   r  s   
zTweetTokenizer.__init__c                 C   sR   t |}| jrt|}| jrt|}td|}t|}| j	s'dd |D }|S )z
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
        \1\1\1c                 S   s"   g | ]}t |r|n| qS r   )EMOTICON_REsearchr   )r   xr   r   r   r(     s   " z+TweetTokenizer.tokenize.<locals>.<listcomp>)
r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_RErt   r   )rN   ru   	safe_textrw   r   r   r   r{   w  s   	
zTweetTokenizer.tokenizeNTFF)r   r   r   r   rM   r{   r   r   r   r   rI   ^  s    
rI   c                 C      t d}|d| S )za
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    z	(.)\1{2,}r   regexcompiler   ru   patternr   r   r   r     s   
r   c                 C   r   )z4
    Remove Twitter username handles from text.
    zv(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)rq   r   r   r   r   r   r     s   r   Fc                 C   s   t |||d| S )z:
    Convenience function for wrapping the tokenizer.
    r   )rI   r{   )ru   r   r   r   r   r   r   casual_tokenize  s   r   )Nr   )r   Tr    r   )"r   r   r   rs   r   tokenization_pythonr   utilsr   
get_loggerr   r:   r   r   r   	EMOTICONSURLSREGEXPSr   rh   VERBOSEIUNICODEr   r   r   r   r   r   rI   r   r   r   __all__r   r   r   r   <module>   sN   
  >&.$0



@8

