o
    i>                     @   sH   d dl Z d dlmZmZmZ ddlmZ ddlmZ G dd deZ	dS )    N)ListOptionalTuple   )
Lemmatizer)Tokenc                       s  e Zd ZdZededeee ee f f fddZde	dee fddZ
d	ed
ee dee fddZded
ee dedee dee f
ddZded
ee dedee dee f
ddZded
ee dedee dee f
ddZded
ee dedee dee f
ddZded
ee dedee dee f
ddZded
ee dee dee dee f
ddZded
ee dee dee dee f
ddZded
ee dee dee dee f
ddZ  ZS ) SpanishLemmatizerzH
    Spanish rule-based lemmatizer with morph-based rule selection.
    modereturnc                    s$   |dkrg d}|g fS t  |S )Nrule)lemma_ruleslemma_rules_groupslemma_index	lemma_exc)superget_lookups_config)clsr	   required	__class__ L/home/ubuntu/.local/lib/python3.10/site-packages/spacy/lang/es/lemmatizer.pyr      s   z$SpanishLemmatizer.get_lookups_configtokenc                 C   s  |j |jt|jf}|| jv r| j| S |j}|j }t|j}|dv r*| gS |dv r=|j	r:|dkr:| gS |gS | }| j
d|i |}|d urWt|}n.|dkr^d}n|}| |t|}	| j
d|g }
t| d| |||	|
}tt|}|| j|< |S )	N) eolspace)	adpcconjintjpartpropnpunctsconjsymxr    r   auxverbr   
lemmatize_)orthposstrmorphcachetextpos_lowersetis_sent_startlookups	get_tablegetlistselect_rulegetattrdictfromkeys)selfr   	cache_keystringr)   featuresexclemmasrule_posr   indexr   r   r   rule_lemmatize   s6   







z SpanishLemmatizer.rule_lemmatizer)   r=   c                 C   sD   | j d}||v r || D ]}t|d |r|d   S qd S )Nr      r   )r2   r3   r0   issubset)r:   r)   r=   groupsgroupr   r   r   r6   B   s   zSpanishLemmatizer.select_rulewordr   rA   c              	   C      g }g }| j d|g D ]\}}t|d ||}	|	|kr$||	 qg }
d|v rS|D ]%}	|	ds9|	drR| j ddg D ]\}}|
t|||	 qCq-||
 |D ]}||v re|| qZt|dkrn|S t|dkrv|S |gS )a"  
        Lemmatize an adjective.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        r   $Number=Plurnsaccentsr   	r2   r3   r4   resubappendendswithextendlenr:   rG   r=   r   rA   possible_lemmasselected_lemmasoldnewpossible_lemmaadditional_lemmaslemmar   r   r   lemmatize_adjJ   4   


zSpanishLemmatizer.lemmatize_adjc                 C   s6   | j ddg D ]\}}||kr|g  S q
|gS )a  
        Lemmatize an adverb.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        r   adverbs)r2   r3   r4   )r:   rG   r=   r   rA   rX   rY   r   r   r   lemmatize_advz   s
   
zSpanishLemmatizer.lemmatize_advc                 C   s   g }g }| j ddg D ]\}}||kr|g  S q| j ddg D ]\}}||kr3|g  S q&| j ddg D ]\}}t|d ||}	||	 q>|| t|dkr^|S t|dkr||D ]}
|
|v rq||
 qft|dkrz|S |S g S )a"  
        Lemmatize a determiner.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        r   detdet_and_pron_fixeddet_and_pron_generalrI   rC   r2   r3   r4   rO   rP   rQ   rT   r:   rG   r=   r   rA   rV   rW   rX   rY   rZ   r\   r   r   r   lemmatize_det   s:   



zSpanishLemmatizer.lemmatize_detc              	   C   rH   )a  
        Lemmatize a noun.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        r   rI   rJ   rK   rL   rM   r   rN   rU   r   r   r   lemmatize_noun   r^   z SpanishLemmatizer.lemmatize_nounc                 C   sl   | j ddg D ]\}}||kr|g  S q
|d}td|d r,tdd|}tdd|}|gS )	a  
        Lemmatize a numeral.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        r   num,z(\.)([0-9]{3})$r   z\.r   .)r2   r3   r4   splitrO   searchrP   )r:   rG   r=   r   rA   rX   rY   splitted_wordr   r   r   lemmatize_num   s   

zSpanishLemmatizer.lemmatize_numc                 C   s  g }g }| j ddg D ]\}}||kr|g  S q| j ddg D ]\}}||kr3|g  S q&| j ddg D ]\}}t|d ||}	|	|krT||	 q>|| t|dkrb|S t|dkr|D ]}
|
|v ru||
 qjt|dkr~|S |S g S )a  
        Lemmatize a pronoun.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        r   pronrb   rc   rI   rC   rd   re   r   r   r   lemmatize_pron  s>   




z SpanishLemmatizer.lemmatize_pronc              	   C   s  d|v r|  ||||S g }g }t|pd}| jd|g D ]\}}t|d ||}	|	|kr6||	 q |D ]}
|
|v rD||
 q9t|dkr|D ]^}
| jddg D ].\}}||
v rt	|
D ]!\}}||kr|
d| | |
|d d  }||v r|| qeqY| jdd	g D ]\}}||
v r|

||d}||v r|| qqMg }|D ]}	| jdd
g D ]\}}|t|||	 qq|| t|dkr|S t|dkr|S |gS )a  
        Lemmatize a verb.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        PronType=Prsr   r   rI   r   	voc_alt_1NrC   	voc_alt_2rM   )lemmatize_verb_pronr*   r2   r3   r4   rO   rP   rQ   rT   	enumeratereplacerS   )r:   rG   r=   r   rA   rV   rW   rX   rY   rZ   r\   icharvoc_alt_lemmar[   r   r   r   lemmatize_verbG  s\   

 

	
z SpanishLemmatizer.lemmatize_verbc              	   C   sl  d}g }|}t ||}|d ur:t|dkr:t |dd d|}|dg| }t ||}|d ur:t|dks| jddg D ]\}	}
t |	|
|}qD| jdd	i |}|d urf|d
 }n| d	|}| 	||dh ||d
 }g }|D ].}| jddi |}|d ur|
|d
  q}| d|}|
| ||||d
  q}|d d| gS )Nz^(.*?)([mts]e|l[aeo]s?|n?os)$r      rI   r   r   rM   r   r&   r   rq   ro    )rO   rl   rT   rP   rF   r2   r3   r4   r6   rz   rQ   rp   join)r:   rG   r=   r   rA   	pron_pattpronsr&   mrX   rY   r>   
verb_lemmapron_lemmasro   r   r   r   rt     s8   
z%SpanishLemmatizer.lemmatize_verb_pron)__name__
__module____qualname____doc__classmethodr*   r   r   r   r   rB   r   r6   r]   r`   rf   rg   rn   rp   rz   rt   __classcell__r   r   r   r   r      s    (-
0

5
0

7
Er   )
rO   typingr   r   r   pipeliner   tokensr   r   r   r   r   r   <module>   s
    