o
    i /                     @   s&  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlm Z  ddlm!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' de	e  dee(ef fddZ)dd Z*G dd de'Z+dd Z,dS )    N)Path)AnyCallableDictIterableListOptionalTupleUnion)Model   )util)ErrorsWarnings)Language)Lookupsload_lookups)Scorer)DocToken)Example)SimpleFrozenListloggerregistry)Vocab   )Pipeexamplesreturnc                 K   s   t j| dfi |S )Nlemma)r   score_token_attr)r   kwargs r"   M/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/lemmatizer.pylemmatizer_score   s   r$   c                   C   s   t S N)r$   r"   r"   r"   r#   make_lemmatizer_scorer   s   r&   c                   @   s  e Zd ZdZededeee ee f fddZ	d3dde	d	d
e
dee dedededee ddfddZedd ZdedefddZ	d4ddddeeg ee f  dee dee fddZejfdeddfddZd edee fd!d"Zd edee fd#d$Zd edefd%d&Ze  d'd(e!ee"f d)ee fd*d+Z#e  d'd(e!ee"f d)ee dd fd,d-Z$e  d'd)ee de%fd.d/Z&e  d'd0e%d)ee dd fd1d2Z'dS )5
Lemmatizerz
    The Lemmatizer supports simple part-of-speech-sensitive suffix rules and
    lookup tables.

    DOCS: https://spacy.io/api/lemmatizer
    moder   c                 C   s0   |dkr	dgg fS |dkrdgddgfS g g fS )a  Returns the lookups configuration settings for a given mode for use
        in Lemmatizer.load_lookups.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): The required and optional
            lookup tables for this mode.
        lookuplemma_lookuprulelemma_rules	lemma_exclemma_indexr"   )clsr(   r"   r"   r#   get_lookups_config%   s
   	
zLemmatizer.get_lookups_config
lemmatizerr)   F)r(   	overwritescorervocabmodelnamer2   r3   Nc                C   s   || _ || _|| _|| _t | _|| _d| _| jdkr | j	| _
n$| jdkr*| j| _
n| j d}t| |s>ttjj|dt| || _
i | _|| _dS )a&  Initialize a Lemmatizer.

        vocab (Vocab): The vocab.
        model (Model): A model (not yet implemented).
        name (str): The component name. Defaults to "lemmatizer".
        mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
        overwrite (bool): Whether to overwrite existing lemmas. Defaults to
            `False`.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_token_attr for the attribute "lemma".

        DOCS: https://spacy.io/api/lemmatizer#init
        Fr)   r+   
_lemmatize)r(   N)r4   r5   r6   _moder   lookupsr2   
_validatedr(   lookup_lemmatize	lemmatizerule_lemmatizehasattr
ValueErrorr   E1003formatgetattrcacher3   )selfr4   r5   r6   r(   r2   r3   	mode_attrr"   r"   r#   __init__4   s"   





zLemmatizer.__init__c                 C   s   | j S r%   )r8   rD   r"   r"   r#   r(   ^   s   zLemmatizer.modedocc              
   C   s   | j s	| tj |  }z|D ]}| js|jdkr"| |d |_q|W S  t	yA } z|| j
| |g| W Y d}~dS d}~ww )zApply the lemmatizer to one document.

        doc (Doc): The Doc to process.
        RETURNS (Doc): The processed Doc.

        DOCS: https://spacy.io/api/lemmatizer#call
        r   N)r:   _validate_tablesr   E1004get_error_handlerr2   r   r<   lemma_	Exceptionr6   )rD   rH   error_handlertokener"   r"   r#   __call__b   s    zLemmatizer.__call__)nlpr9   get_examplesrR   r9   c                C   sx   |  | j\}}|du r1td t| jj|d}t| jj|dd}|jD ]}|||	| q%|| _
| tj dS )a  Initialize the lemmatizer and load in data.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        lookups (Lookups): The lookups object containing the (optional) tables
            such as "lemma_rules", "lemma_index", "lemma_exc" and
            "lemma_lookup". Defaults to None.
        Nz2Lemmatizer: loading tables from spacy-lookups-data)langtablesF)rT   rU   strict)r0   r(   r   debugr   r4   rT   rU   	set_table	get_tabler9   rI   r   rJ   )rD   rS   rR   r9   required_tablesoptional_tablesoptional_lookupstabler"   r"   r#   
initializeu   s   


zLemmatizer.initializeerror_messagec                 C   sH   |  | j\}}|D ]}|| jvrt|j| j|| jjdq
d| _dS )z8Check that the lookups are correct for the current mode.)r(   rU   foundTN)r0   r(   r9   r?   rA   rU   r:   )rD   r_   rZ   r[   r]   r"   r"   r#   rI      s   

zLemmatizer._validate_tablesrO   c                 C   s2   | j di }||j|j}t|tr|g}|S )zLemmatize using a lookup-based approach.

        token (Token): The token to lemmatize.
        RETURNS (list): The available lemmas for the string.

        DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize
        r*   )r9   rY   gettext
isinstancestr)rD   rO   lookup_tableresultr"   r"   r#   r;      s
   
zLemmatizer.lookup_lemmatizec                 C   s  |j |j|jjf}|| jv r| j| S |j}|j }|dv r.|dkr)t	t
j | gS | |r8| gS | jdi }| jdi }| jdi }t||||||fsi|dkrd|gS | gS ||i }||i }	||i }
|}| }g }g }|
D ]8\}}||r|dt|t|  | }|sq||v s| s||v r|d| q|| q|| qtt|}|	|g D ]}||vr|d| q|s|| |s|| || j|< |S )	zLemmatize using a rule-based approach.

        token (Token): The token to lemmatize.
        RETURNS (list): The available lemmas for the string.

        DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
        ) eolspacerg   r.   r-   r,   propnNr   )orthposmorphkeyrC   rb   pos_lowerwarningswarnr   W108is_base_formr9   rY   anyra   endswithlenisalphainsertappendlistdictfromkeysextend)rD   rO   	cache_keystringuniv_posindex_table	exc_tablerules_tableindex
exceptionsrulesorigforms	oov_formsoldnewformr"   r"   r#   r=      sh   











zLemmatizer.rule_lemmatizec                 C   s   dS )a  Check whether the token is a base form that does not need further
        analysis for lemmatization.

        token (Token): The token.
        RETURNS (bool): Whether the token is a base form.

        DOCS: https://spacy.io/api/lemmatizer#is_base_form
        Fr"   )rD   rO   r"   r"   r#   rt      s   	zLemmatizer.is_base_formexcludepathr   c                   s8   i } fdd|d< fdd|d< t ||  dS )zSerialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/lemmatizer#to_disk
        c                       j j|  dS Nr   )r4   to_diskpr   rD   r"   r#   <lambda>	      z$Lemmatizer.to_disk.<locals>.<lambda>r4   c                        j | S r%   )r9   r   r   rG   r"   r#   r   
      r9   N)r   r   )rD   r   r   	serializer"   r   r#   r      s   
zLemmatizer.to_diskc                   @   i } fdd|d< fdd|d< t ||    S )aH  Load the pipe from disk. Modifies the object in place and returns it.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (Lemmatizer): The modified Lemmatizer object.

        DOCS: https://spacy.io/api/lemmatizer#from_disk
        c                    r   r   )r4   	from_diskr   r   r"   r#   r     r   z&Lemmatizer.from_disk.<locals>.<lambda>r4   c                    r   r%   )r9   r   r   rG   r"   r#   r     r   r9   )r   r   rI   )rD   r   r   deserializer"   r   r#   r        zLemmatizer.from_diskc                   s.   i } fdd|d< j j|d< t| S )zSerialize the pipe to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/lemmatizer#to_bytes
        c                      s   j j dS r   )r4   to_bytesr"   r   r"   r#   r   (  s    z%Lemmatizer.to_bytes.<locals>.<lambda>r4   r9   )r9   r   r   )rD   r   r   r"   r   r#   r     s   zLemmatizer.to_bytes
bytes_datac                   r   )a  Load the pipe from a bytestring.

        bytes_data (bytes): The serialized pipe.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (Lemmatizer): The loaded Lemmatizer.

        DOCS: https://spacy.io/api/lemmatizer#from_bytes
        c                    r   r   )r4   
from_bytesbr   r"   r#   r   8  r   z'Lemmatizer.from_bytes.<locals>.<lambda>r4   c                    r   r%   )r9   r   r   rG   r"   r#   r   9  r   r9   )r   r   rI   )rD   r   r   r   r"   r   r#   r   ,  r   zLemmatizer.from_bytes)r1   r%   )(__name__
__module____qualname____doc__classmethodrd   r	   r   r0   r$   r   r   r   boolr   rF   propertyr(   r   rQ   r   r   r   r   r^   r   E912rI   r   r;   r=   rt   r   r
   r   r   r   bytesr   r   r"   r"   r"   r#   r'      s    $	
*

F



r'   c                 C   s,   | dkrt d}|jS tdt d|  )Nmake_lemmatizerzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler   AttributeErrorr   )r6   moduler"   r"   r#   __getattr__@  s   
r   )-r   sysrq   pathlibr   typingr   r   r   r   r   r   r	   r
   	thinc.apir   rg   r   errorsr   r   languager   r9   r   r   r3   r   tokensr   r   trainingr   r   r   r   r4   r   piper   rd   r$   r&   r'   r   r"   r"   r"   r#   <module>   s,    (  %