o
    ik                     @   s   d dl Z d dlZd dlmZ d dlZeejej d ZeejZeej	Z
eejZG dd deZdd Zdd	 Zejejejed
ZG dd deZdS )    N)anyascii_c                   @   s"   e Zd ZdZdddZdd ZdS )TSResultmatchnormstartendcaseis_exactNc                 C   s(   || _ || _|| _|| _|| _|| _d S Nr   )selfr   r   r   r	   r
   exact r   G/home/ubuntu/.local/lib/python3.10/site-packages/textsearch/__init__.py__init__   s   
zTSResult.__init__c                    s&   d  jjd fdd jD S )Nz{}({}), c              	      s"   g | ]}d  |tt |qS )z{}={})formatreprgetattr.0xr   r   r   
<listcomp>   s   " z%TSResult.__repr__.<locals>.<listcomp>)r   	__class____name__join	__slots__r   r   r   r   __repr__   s   zTSResult.__repr__)NN)r   
__module____qualname__r   r   r   r   r   r   r   r      s    
r   c                 C   s   | d   | dd    S )Nr      )upperlower)kr   r   r   to_sentence_case   s   r&   c                 C   sD   | |   krdS | |  krdS | |  krdS | t| kr dS dS )Nr#   titler$   sentmixed)r#   r'   r$   r&   )wordr   r   r   determine_case"   s   r+   )r#   r$   r'   r(   c                   @   s4  e Zd ZeeddfddZdd Zdd Zd	d
 Zdd Zdd Z		dIddZ
dd ZdJddZdd Zdd Zdd Zdd Zdd  Zd!d" ZdJd#d$Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. ZdKd/d0ZdKd1d2ZdLd3d4Zd5d6 Zd7d8 Zd9d: Zd;d< Zd=d> Z d?d@ Z!dAdB Z"dCdD Z#dEdF Z$dGdH Z%dS )M
TextSearchFNc           
      K   s   d}||vrt d|d}	t|tr||	vrt d|	|ddtttttfv r1t d|	t	
 | _|| _|dkr?|nt| _|  | _| jdv | _|pQt | _|pWt | _d| _|| _|pbg | _i | _d	S )
ul   TextSearch is built on a C implementation of Aho-Corasick available in the ahocorasick[0] lib.

        It mainly helps with providing convenience for NLP / text search related tasks.
        For example, it will help find tokens by default only if it is a full word match (and not a sub-match).
        Though this can be adapted.

        Arguments:
            case: one of "ignore", "insensitive", "sensitive", "smart"
                - ignore: converts both sought words and to-be-searched to lower case before matching.
                          Text matches will always be returned in lowercase.
                - insensitive: converts both sought words and to-be-searched to lower case before matching.
                               However: it will return the original casing as it uses the position found.
                - sensitive: does not do any conversion, will only match on exact words added.
                - smart: takes an input `k` and also adds k.title(), k.upper() and k.lower(). Matches sensitively.
            returns: one of 'match', 'norm', 'object' (becomes TSResult) or a custom class
                See the examples!
                - match: returns the sought key when a match occurs
                - norm: returns the associated (usually normalized) value when a key match occurs
                - "object": convenient object for working with matches, e.g.:
                  TSResult(match='HI', norm='greeting', start=0, end=2, case='upper', is_exact=True)
                - class: bring your own class that will get instantiated like:
                  MyClass(**{"match": k, "norm": v, "case": "lower", "exact": False, start: 0, end: 1})
                  Trick: to get json-serializable results, use `dict`.
            left_bound_chars (set(str)):
                Characters that will determine the left-side boundary check in findall/replace.
                Defaults to set([A-Za-z0-9_])
            right_bound_chars (set(str)):
                Characters that will determine the right-side boundary check in findall/replace
                Defaults to set([A-Za-z0-9_])
            replace_foreign_chars (default=False): replaces 'á' with 'a' for example in both input and target.
                Adds a roughly 15% slowdown.
            handlers (list): provides a way to add hooks to matches.
                Currently only used when left and/or right bound chars are set.
                Regex can only be used when using norm
                The default handler that gets added in any case will check boundaries.
                Check how to conveniently add regex at the `add_regex_handler` function.
                Default: (False, True, self.bounds_check)
                - The first argument should be the normalized tag to fire on.
                - The second argument should be whether to keep the result
                - The third should be the handler function, that takes the arguments:
                  - text: the original sentence/document text
                  - start: the starting position of said string
                  - stop: the ending position of said string
                  - norm: the normalized result found
                  Should return:
                  - start: In case start position should change it is possible
                  - stop: In case end position should change it is possible
                  - norm: the new returned item. In case this is None, it will be removed
                Custom example:
                  >>> def custom_handler(text, start, stop, norm):
                  >>>    return start, stop, text[start:stop] + " is OK"
                  >>> ts = TextSearch("ignore", "norm", handlers=[("HI", True, custom_handler)])
                  >>> ts.add("hi", "HI")
                  >>> ts.findall("hi HI")
                  ['hi is OK', 'HI is OK']
        Examples:
            >>> from textsearch import TextSearch

            >>> ts = TextSearch(case="ignore", returns="match")
            >>> ts.add("hi")
            >>> ts.findall("hello, hi")
            ["hi"]

            >>> ts = TextSearch(case="ignore", returns="norm")
            >>> ts.add("hi", "greeting")
            >>> ts.add("hello", "greeting")
            >>> ts.findall("hello, hi")
            ["greeting", "greeting"]

            >>> ts = TextSearch(case="ignore", returns="match")
            >>> ts.add(["hi", "bye"])
            >>> ts.findall("hi! bye! HI")
            ["hi", "bye", "hi"]

            >>> ts = TextSearch(case="insensitive", returns="match")
            >>> ts.add(["hi", "bye"])
            >>> ts.findall("hi! bye! HI")
            ["hi", "bye", "HI"]

            >>> ts = TextSearch("sensitive", "object")
            >>> ts.add("HI")
            >>> ts.findall("hi")
            []
            >>> ts.findall("HI")
            [TSResult(match='HI', norm='HI', start=0, end=2, case='upper', is_exact=True)]

            >>> ts = TextSearch("sensitive", dict)
            >>> ts.add("hI")
            >>> ts.findall("hI")
            [{'case': 'mixed', 'end': 2, 'exact': True, 'match': 'hI', 'norm': 'hI', 'start': 0}]

        Notes:
           [0]: https://github.com/WojciechMula/pyahocorasick
        )ignoreinsensitive	sensitivesmartz#argument 'case' must be one of {!r})r   r   objectz/argument 'returns' must be one of {!r} or classTFr1   )r-   r.   N)
ValueErrorr   
isinstancestrboolintlisttuplesetahocorasick	Automaton	automatonr
   r   returnsget_extraction_fn
extract_fn_ignore_case_in_searchleft_bound_charsright_bound_charswords_changedreplace_foreign_charshandlers
_root_dict)
r   r
   r=   rA   rB   rD   rE   kwargs	case_optsreturns_optsr   r   r   r   8   s&   h



zTextSearch.__init__c                 C   s~   t |ts	td| |}|| j ||j g }t }|j| j D ]}|d |vr9|| ||d  q%||_|S )Nz8Can only be merged with another TextSearch derived class)	r3   r,   	TypeErrorto_tsaddrF   r9   rE   append)r   othertsrE   seenr   r   r   r   __add__   s   


zTextSearch.__add__c                 C   s   |  g dd| d S )N)zhttp://zhttps://zwww.z[^ ]+add_regex_handlerr   keep_resultr   r   r   add_http_handler   s   zTextSearch.add_http_handlerc                 C   s   |  | | | d S r   )add_tweet_hashtagadd_tweet_mentionrU   r   r   r   add_twitter_handler   s   
zTextSearch.add_twitter_handlerc                 C      |  dgd| d S )N#z[a-zA-Z][^ !$%^&*+.]+rS   rU   r   r   r   rX         zTextSearch.add_tweet_hashtagc                 C   r[   )N@z[a-zA-Z][a-zA-Z0-9_]+rS   rU   r   r   r   rY      r]   zTextSearch.add_tweet_mentionTr   c           
      C   sj   |pdt dd|d  p|d  }|D ]}| || q|r#| jn| j}	| j|||	|||f dS )a   Allows adding a regex that should hit when a prefix or postfix gets found.
        words: list of words that when hit trigger the regex handler.
        regex: the regex pattern that gets applied to a matched word.
            NOTE: Already assumes the words to occur at either the start or the end of the match!
        keep_result: whether to throw away the result when found.
        prefix:
            - True means from the start of the match to the end of the whole string
            - False means means from the start of the whole string to the end of the match
        name: give a name for the token (used internally, only set to avoid a clash).
            By default would choose the first of the words, stripped of special chars.
        return_value: either a fixed value when set, otherwise the regex match.

        Let's explain this example
        ts.add_regex_handler(["http://", "https://", "www."], "[^ ]+", keep_result=True)

        Will match on http://, https://, www. and try the regex '[^ ]+' afterwards, matching urls.

        Warning: better not to try to use regex with too short of a prefix, e.g. just a single letter.
        $z	[^a-zA-Z] r   N)resubr#   add_oneprefix_regex_handlerpostfix_regex_handlerrE   rN   )
r   wordsregexrV   prefixnamereturn_valueflagsr*   handlerr   r   r   rT      s
   &zTextSearch.add_regex_handlerc                    st   d | j }|jjdkr fdd| j D }|di |}n fdd|j D }|jdi |}|| |S )N)r<   r@   rC   r?   rF   typec                       i | ]\}}| vr||qS r   r   r   r%   v
block_keysr   r   
<dictcomp>       z$TextSearch.to_ts.<locals>.<dictcomp>c                    rn   r   r   ro   rq   r   r   rs      rt   r   )rF   r   r   __dict__itemsrM   )r   rP   datainitsr   rq   r   rL      s   
zTextSearch.to_tsc                 C   sd   t |ttfr|D ]}| || q	d S t |tr*| D ]
\}}| || qd S | || d S r   )r3   r9   r7   rc   dictrv   )r   r%   rp   r   kkvvr   r   r   rM     s   
zTextSearch.addc                 C   sP   | j dkr| j| ||f d S | j dkr$| j| ||f d S td)Nr   r   zLignore only returns a match or normalized value. Maybe you want insensitive?)r=   r<   add_wordr$   r2   r   r%   rp   lengthr   r   r   
add_ignore  s   

zTextSearch.add_ignorec                 C   sh   | j dkr| j| ||f d S | j dkr$| j| |df d S | j| ||ddf d S )Nr   r   rJ   Fr   r   )r=   r<   r|   r$   r}   r   r   r   add_insensitive  s
   

 zTextSearch.add_insensitivec                 C   s   || j vrdS | j |= d| _| jrt|}| jdkrB| j| | j|  | j|  | j|	  | jt
| dS | jdkrO| j| dS | j|  dS )z; Remove k from known words. Takes into account the casing. FTr0   r/   )rF   rC   rD   r   r
   r<   remove_wordr$   r'   r#   r&   )r   r%   r   r   r   remove"  s$   


zTextSearch.removec                 C   s   t |}| jdr|n|}| jdr|d | n|}| j|||f | jdkr,d S t| |	 |
 t|gg dD ]\}}||ksL|| jv rMq?| j|||f q?d S )Nr   _caser   r0   r#   r'   r$   r(   )r+   r=   
startswithendswithr<   r|   r
   zipr#   r'   r$   r&   )r   r%   rp   r~   r
   
text_valueresultkeyr   r   r   add_sensitive_string6  s   
zTextSearch.add_sensitive_stringc                 C   s   t |}|||dd}| j|||f | jdkrd S t| | | t|gg dD ]\}}||ks;|| jv r<q.|||dd}| j|||f q.d S )NT)r   r   r
   r   r0   r   F)	r+   r<   r|   r
   r   r#   r'   r$   r&   )r   r%   rp   r~   r
   objr   r   r   r   add_sensitive_objectH  s   
zTextSearch.add_sensitive_objectc                 C   s.   | j dv r| ||| d S | ||| d S )N)r   r   )r=   r   r   r}   r   r   r   add_sensitiveZ  s   
zTextSearch.add_sensitivec                 C   s   || j |< d| _| jrt|}|d u r|n|}t|}| jdkr)| ||| d S | jdkr7| ||| d S | ||| d S )NTr-   r.   )	rF   rC   rD   r   lenr
   r   r   r   r}   r   r   r   rc   b  s   


zTextSearch.add_onec                 C   sR   t ||kr|| | jv rdS |dkr||d  | jv rdS ||| ||||fS )N)NNNr   r"   )r   rB   rA   r?   )r   textr   stopr   r   r   r   bounds_checkr  s
   zTextSearch.bounds_checkc                 C   s   | j   d| _| jrt|}| jr| n|}| js0| js0| j	s0| j 
|D ]\}\}} dS | jdd| jfg }| j 
|D ]8\}\}}|du rKq@|| d }|d }|D ] \}	}
}|	rc|	|urcqW|||||\}}}|durw|
rw  dS qWq@dS )z] Test whether any known words match in text.
        text: str
        returns: bool
        FTNr"   )r<   make_automatonrC   rD   r   r@   r$   rE   rA   rB   iterr   )r   r   _text	end_indexr~   r   rE   r   r   comparerV   rl   r   r   r   r   containsz  s.   
zTextSearch.containsc                 C   s   | j r| j  d| _ d S )NF)rC   r<   r   r   r   r   r   build_automaton  s   

zTextSearch.build_automatonc                    s  |    | jrt|}g }d}| jr| n|}| jdd| jfg }| j|D ]\}\}}|du r3q(|| d  |d }	|D ]x\}
}}|
rR|
|urR||
ddkrRq?|| |	|\ }	}|du rb nV|ss|	}|	|   |df  nE |kr|	}|   ||f}|	| q?|	  |d d krt
|	|}|   ||f}t fdd	t|D d}|d| }|	| q?q(d
d |D S )[ Finds the known words in text.
        text: str
        returns: list of matches
        rJ   FTNr"   r   r   c                 3   s$    | ]\}}|d   kr|V  qdS )r"   Nr   )r   ir%   r   r   r   	<genexpr>  s   " z%TextSearch.findall.<locals>.<genexpr>c                 S   s    g | ]}|d  dur|d  qS )   Nr   r   r   r   r   r     s     z&TextSearch.findall.<locals>.<listcomp>)r   rD   r   r@   r$   rE   r   r<   r   rN   maxnext	enumerate)r   r   keywordscurrent_stopr   rE   r   r~   r   r   r   rV   rl   r   keep_up_to_indr   r   r   findall  sH   

	zTextSearch.findallc                    s        jrtg } jr n} js- js- js- fdd j	|D S  jdd j
fg } j	|D ];\}\}}|du rHq=|| d }|d }	|D ]#\}
}}|
r`|
|ur`qT|||	|\}}}|smqT|du rrqT|| qTq=dd |D S )r   c                    s0   g | ]\}\}}  || d  |d  |qS )r"   )r?   )r   r   r~   r   r   r   r   r   r     s    
z/TextSearch.find_overlapping.<locals>.<listcomp>FTNr"   c                 S   s   g | ]}|d ur|qS r   r   r   r   r   r   r     s    )r   rD   r   r@   r$   rE   rA   rB   r<   r   r   rN   )r   r   r   r   rE   r   r~   r   r   r   r   rV   rl   r   r   r   r   r   find_overlapping  s4   
	zTextSearch.find_overlappingc                    "   t j||d  fdd}|S )Nrk   c                    s   |dkr| |d  j v r||d fS  | |d  }|rB||  }p*| || }tjts5d|i}|||| }|||fS ||d fS )Nr   r"   r   )rA   r   r	   r3   r=   r4   r?   r   r   r   r   reg_resrvrg   rj   r   r   r   regex_handler  s   


z6TextSearch.prefix_regex_handler.<locals>.regex_handlerra   compiler   rrj   rk   r   r   r   r   rd        zTextSearch.prefix_regex_handlerc                    r   )Nr   c                    s   t | |kr| | jv r||d fS  | d | }|r@| }p(| || }tjts3d|i}|||| }|||fS ||d fS )Nr   )r   rB   searchr   r3   r=   r4   r?   r   r   r   r   r     s   


z7TextSearch.postfix_regex_handler.<locals>.regex_handlerr   r   r   r   r   re     r   z TextSearch.postfix_regex_handlerc                 C   s  | j dkrt| j std|   | jrt|}dg}d}| jr%| n|}| jdd| j	fg }| j
|D ]n\}\}}	|| d }
|d }|D ][\}}}|rU||	urUqI|||
||	\}
}}|du re n@|si n<|
|kr|}||
 |
|||
| |ff}|| qI||
 |d d	 krt||}||
 |
|||
| |ff}||d< qIq7|dt|dd
f d}t|dd |dd D ]&\\}}}}\}}}}t|d tr|d n|d j}	|||| |	 7 }q|r|dd |dd D fS |S )z Replaces known words in text.
        text: str
        returns: replaced str
        If return_entities=True, returns: replaced str, list of matches
        r   zno idea how i would do that)NNr   r`   r`   rJ   FTr"   Nr   r   r`   c                 S   s   g | ]}|d  qS )rJ   r   r   r   r   r   r   J  s    z&TextSearch.replace.<locals>.<listcomp>)r=   callabler2   r   rD   r   r@   r$   rE   r   r<   r   rN   r   r   r   r3   r4   r   )r   r   return_entitiesr   r   r   rE   r   r~   r   r   r   r   rV   rl   r   text_r   start1stop1result1start2stop2result2r   r   r   replace  sN   
 zTextSearch.replacec                 C   s   |S r   r   )r   start_indexr	   r   r   r   r   r   extract_strM  s   zTextSearch.extract_strc                 C   s   ||| }|S r   r   r   r   r	   r   r   r   r   r   r   extract_insensitive_matchP  s   z$TextSearch.extract_insensitive_matchc                 C   s$   t ||| }t|dd |S )Nc                 S   s   | S r   r   )r   r   r   r   <lambda>V  s    z5TextSearch.extract_insensitive_norm.<locals>.<lambda>)r+   case_fnget)r   r   r	   r   r   r
   r   r   r   extract_insensitive_normT  s   z#TextSearch.extract_insensitive_normc                 C   s@   ||| }||d< t ||d< ||d< ||d< | jdi |S )Nr   r
   r   r	   r   )r+   r=   r   r   r   r   extract_insensitive_objectX  s   z%TextSearch.extract_insensitive_objectc                 C   s    ||d< ||d< | j di |S )Nr   r	   r   )r=   )r   r   r	   r   r   r   r   r   extract_object`  s   zTextSearch.extract_objectc                 C   s\   | j dkr| jdkr| j}|S | jdkr| j}|S | j}|S t| jtr)| j}|S | j}|S )Nr.   r   r   )	r
   r=   r   r   r   r3   r4   r   r   )r   r?   r   r   r   r>   e  s   

	
zTextSearch.get_extraction_fnc                 C   s
   t | jS r   )r   r<   r   r   r   r   __len__s     
zTextSearch.__len__c                 C   s
   | j  S r   )r<   __iter__r   r   r   r   r   v  r   zTextSearch.__iter__c                 C   s6   | j rt|}| js| jdkr| | jv S || jv S )Nr0   )rD   r   r@   r
   r$   r<   )r   r   r   r   r   __contains__y  s
   
zTextSearch.__contains__c                 C   sz   d t| g}| jtkr|d d| j | jtkr&|d d| j | jr.|d d | jj| j	| j
d|S )Nznum_items={}z{}={!r}rA   rB   zreplace_foreign_chars=Truez{}(case={!r}, returns={!r}, {})r   )r   r   rA   ALPHANUMrN   rB   rD   r   r   r
   r=   r   )r   sr   r   r   r     s   


zTextSearch.__repr__)TNNr   r   )r   )F)&r   r    r!   r   r   rR   rW   rZ   rX   rY   rT   rL   rM   r   r   r   r   r   r   rc   r   r   r   r   r   rd   re   r   r   r   r   r   r   r>   r   r   r   r   r   r   r   r   r,   7   sP    





?
 

.r,   )ra   stringr   r:   r9   digitsascii_lettersr   ALPHAascii_lowercaseALPHA_LOWERascii_uppercaseALPHA_UPPERr1   r   r&   r+   r4   r#   r$   r'   r   r,   r   r   r   r   <module>   s"    


	