o
    i9                     @   s<  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZmZmZmZ d dlmZmZ ddlmZ dd	lmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ dZ0dZ1e 2e1d Z3G dd de/Z4dd Z5dS )    N)Counter)islice)AnyCallableDictIterableListOptionalTuplecast)ConfigModelNumpyOpsSequenceCategoricalCrossentropy)Floats2dInts2d   )util)Errors)Language)Doc)Examplevalidate_examplesvalidate_get_examples)Vocab   )	EditTrees)validate_edit_tree)lemmatizer_score)TrainablePipe   z
[model]
@architectures = "spacy.Tagger.v2"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
modelc                   @   s  e Zd ZdZ	d?ddddedded	ed
edee de	de
de	dee fddZdee dee deeee f fddZdee dee fddZdd Zdd Zdd Zdee fd d!Zedee	d"f fd#d$Zede
fd%d&Zedefd'd(Zd)d)d*d+eg ee f d,ee  d-ee fd.d/Z!e" d0d1d2Z#e" d0d3d4Z$e" fd5d6Z%e" fd7d8Z&d-efd9d:Z'd+eg ee f fd;d<Z(d@d=d>Z)d)S )AEditTreeLemmatizerzK
    Lemmatizer that lemmatizes each word using a predicted edit tree.
    trainable_lemmatizerorth   Fr   )backoffmin_tree_freq	overwritetop_kscorervocabr!   namer&   r'   r(   r)   r*   c          	      C   sZ   || _ || _|| _|| _|| _|| _|| _t| j j| _	i | _
dg i| _|| _t | _dS )a  
        Construct an edit tree lemmatizer.

        backoff (Optional[str]): backoff to use when the predicted edit trees
            are not applicable. Must be an attribute of Token or None (leave the
            lemma unset).
        min_tree_freq (int): prune trees that are applied less than this
            frequency in the training data.
        overwrite (bool): overwrite existing lemma annotations.
        top_k (int): try to apply at most the k most probable edit trees.
        labelsN)r+   r!   r,   r&   r'   r(   r)   r   stringstrees
tree2labelcfgr*   r   	numpy_ops)	selfr+   r!   r,   r&   r'   r(   r)   r*    r4   W/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/edit_tree_lemmatizer.py__init__1   s   
zEditTreeLemmatizer.__init__examplesscoresreturnc                 C   s   t |d tddd}g }|D ]8}g }t|j|jdddD ]#\}}|d u s*|dkr-d}	n| j|j|}
| j	|
d	}	|
|	 q|
| q|||\}}| jjj|rattjj| jd
t||fS )NEditTreeLemmatizer.get_lossF)	normalizemissing_valueLEMMAT)	as_string r   r,   )r   r   zip	predictedget_alignedr/   addtextr0   getappendr!   opsxpisnan
ValueErrorr   E910formatr,   float)r3   r7   r8   	loss_functruthseg	eg_truthsrC   
gold_lemmalabeltree_idd_scoreslossr4   r4   r5   get_lossW   s$   
r:   docsc                    s   j dkr	j}nj tkrj}nj}tt|}tdd |D s?tjd   fdd|D }t||ks=J |S j	
|}t||ksMJ |||}t||ksZJ |S )Nr   c                 s   s    | ]}t |V  qd S N)len).0docr4   r4   r5   	<genexpr>   s    z-EditTreeLemmatizer.predict.<locals>.<genexpr>r-   c                    s   g | ]
}j jd  qS )r   )r!   rI   alloc2i)r]   _n_labelsr3   r4   r5   
<listcomp>   s    z.EditTreeLemmatizer.predict.<locals>.<listcomp>)r)   _scores2guesses_top_k_equals_1TOP_K_GUARDRAIL_scores2guesses_top_k_greater_1_scores2guesses_top_k_guardrailr\   listanyr1   r!   predict)r3   rZ   scores2guessesn_docsguessesr8   r4   rb   r5   rk   r   s    


zEditTreeLemmatizer.predictc                 C   s   g }t ||D ]A\}}|jdd}| j|}g }t|D ]"\}}	| jd ||  }
| j|
|	jd ur:|	|
 q|	d q|	t
| q|S )Nr   )axisr-   r;   )rB   argmaxr2   asarray	enumerater1   r/   applyrF   rH   nparray)r3   rZ   r8   rn   r^   
doc_scoresdoc_guessesdoc_compat_guessesitokenrV   r4   r4   r5   re      s   z1EditTreeLemmatizer._scores2guesses_top_k_equals_1c                 C   s   g }t | jt| j}t||D ]T\}}| j|}g }t|D ];\}}	t|D ]-}
t	|| 
 }| jd | }| j||	jd urJ||  nttjj |||f< q(|d q |t| q|S )Nr-   r;   )minr)   r\   r-   rB   r2   rq   rr   rangeintrp   r1   r/   rs   rF   rH   rt   finfofloat32ru   )r3   rZ   r8   rn   r)   r^   rv   rx   ry   rz   ra   	candidatecandidate_tree_idr4   r4   r5   rg      s"   

z2EditTreeLemmatizer._scores2guesses_top_k_greater_1c                 C   s   g }t ||D ]O\}}t|dd | j d df }| j|}g }t ||D ]%\}}	d}
|	D ]}| jd | }| j||j	d urG|}
 nq0|
|
 q(|
t| q|S )N.r   r;   r-   )rB   rt   argsortr)   r2   rq   r1   r/   rs   rF   rH   ru   )r3   rZ   r8   rn   r^   rv   rw   rx   rz   
candidatesrV   r   r   r4   r4   r5   rh      s     z2EditTreeLemmatizer._scores2guesses_top_k_guardrailc           	      C   s   t |D ]H\}}|| }t|dr| }t |D ]2\}}| js'|| jdkrK|dkr<| jd ur;t|| | j|| _q| j||| j	}||| _
qqd S )NrG   r   r;   )rr   hasattrrG   r(   lemmar&   getattrr/   rs   rF   lemma_)	r3   rZ   batch_tree_idsry   r^   doc_tree_idsjrV   r   r4   r4   r5   set_annotations   s   


z"EditTreeLemmatizer.set_annotations.c                 C   s   t | jd S )z4Returns the labels currently added to the component.r-   )tupler1   r3   r4   r4   r5   r-      s   zEditTreeLemmatizer.labelsc                 C   s   dS )NTr4   r   r4   r4   r5   hide_labels   s   zEditTreeLemmatizer.hide_labelsc                 C   sz   g }t t| jD ](}| j| }d|v r| jj|d  |d< d|v r,| jj|d  |d< || q	t|t| jd dS )Norigsubstr-   )r/   r-   )	r|   r\   r/   r+   r.   rH   dictr   r1   )r3   r/   rV   treer4   r4   r5   
label_data   s   
zEditTreeLemmatizer.label_dataN)nlpr-   get_examplesr   r-   c          	         s  t |d |d u r| | n| | g }g }t| dD ]@}||j g }|jD ]!}|jdkr5d  n| |j	|j
 | fdd| jd D  q+tt|}|| jjj|dd q|   t|dkssJ tjj| jd	t|dksJ tjj| jd	| jj||d
 d S )NEditTreeLemmatizer.initialize
   r   c                    s   g | ]
}| kr
d ndqS )g      ?g        r4   )r]   rU   
gold_labelr4   r5   rd     s    z1EditTreeLemmatizer.initialize.<locals>.<listcomp>r-   r   )dtyperA   )XY)r   _labels_from_data_add_labelsr   rH   x	referencer   _pair2labelrF   r   r1   r   r   r!   rI   rq   _require_labelsr\   r   E923rN   r,   
initialize)	r3   r   r   r-   
doc_samplelabel_sampleexamplegold_labelsrz   r4   r   r5   r      s0   





  r   excludec                   sB   fddfdd fddfddd}t ||  S )Nc                        j t| S r[   )r1   updatesrsly
json_loadsbr   r4   r5   <lambda>      z/EditTreeLemmatizer.from_bytes.<locals>.<lambda>c                        j | S r[   )r!   
from_bytesr   r   r4   r5   r         c                       j j|  dS Nr   )r+   r   r   r   r3   r4   r5   r         c                    r   r[   )r/   r   r   r   r4   r5   r     r   r1   r!   r+   r/   )r   r   )r3   
bytes_datar   deserializersr4   r   r5   r     s   


zEditTreeLemmatizer.from_bytesc                   s<   fddfdd fddfddd}t | S )Nc                      s   t  jS r[   )r   
json_dumpsr1   r4   r   r4   r5   r   $  r   z-EditTreeLemmatizer.to_bytes.<locals>.<lambda>c                      
    j  S r[   )r!   to_bytesr4   r   r4   r5   r   %     
 c                      s   j j dS r   )r+   r   r4   r   r4   r5   r   &      c                      r   r[   )r/   r   r4   r   r4   r5   r   '  r   r   )r   r   )r3   r   serializersr4   r   r5   r   "  s   


zEditTreeLemmatizer.to_bytesc                    sL   t |}fddfdd fddfddd}t ||  d S )Nc                    s   t |  jS r[   )r   
write_jsonr1   pr   r4   r5   r   /  r   z,EditTreeLemmatizer.to_disk.<locals>.<lambda>c                    r   r[   )r!   to_diskr   r   r4   r5   r   0  r   c                    r   r   )r+   r   r   r   r4   r5   r   1  r   c                    r   r[   )r/   r   r   r   r4   r5   r   2  r   r   )r   ensure_pathr   )r3   pathr   r   r4   r   r5   r   ,  s   



zEditTreeLemmatizer.to_diskc                    sF   fdd}fdd| fddfddd}t ||  S )Nc                    s`   z"t | d} j|  W d    W d S 1 sw   Y  W d S  ty/   ttjd w )Nrb)openr!   r   readAttributeErrorrL   r   E149)r   mfiler   r4   r5   
load_model7  s   &z0EditTreeLemmatizer.from_disk.<locals>.load_modelc                    r   r[   )r1   r   r   	read_jsonr   r   r4   r5   r   ?  r   z.EditTreeLemmatizer.from_disk.<locals>.<lambda>c                    r   r   )r+   	from_diskr   r   r4   r5   r   A  r   c                    r   r[   )r/   r   r   r   r4   r5   r   B  r   r   )r   r   )r3   r   r   r   r   r4   r   r5   r   6  s   

zEditTreeLemmatizer.from_diskc                 C   s   d|vrt tjjddd|vrt tjjddt|d | jd< g }|d D ];}t|}|r=t tjjd|dt	|}d|v rP| j
j|d |d< d|v r_| j
j|d |d< || q)| j| t| jD ]	\}}|| j|< qpd S )Nr-   rA   r/   
)errorsr   r   )rL   r   E857rN   ri   r1   r   E1026joinr   r+   r.   rE   rH   r/   	from_jsonrr   r-   r0   )r3   r-   r/   r   r   rU   r4   r4   r5   r   H  s(   zEditTreeLemmatizer._add_labelsc                 C   s   t  }t|j}t }i }| D ]%}|jD ]}|jdkr4||j|j}||  d7  < |j|jf||< qq|	 D ]\}}	|	| j
krQ|| \}
}| j|
|dd q:d S )Nr   r   T)	add_label)r   r   r.   r   r   r   rE   rF   r   itemsr'   r   )r3   r   r+   r/   
tree_freqs
repr_pairsr   rz   rV   freqformr   r4   r4   r5   r   b  s$   




z$EditTreeLemmatizer._labels_from_datac                 C   sN   | j ||}|| jvr"|sdS t| jd | j|< | jd | | j| S )z
        Look up the edit tree identifier for a form/label pair. If the edit
        tree is unknown and "add_label" is set, the edit tree will be added to
        the labels.
        Nr-   )r/   rE   r0   r\   r1   rH   )r3   r   r   r   rV   r4   r4   r5   r   w  s   

zEditTreeLemmatizer._pair2label)r#   )F)*__name__
__module____qualname____doc__r   r   r   strr	   r}   boolr   r6   r   r   r   r   r
   rO   rY   r   r   rk   re   rg   rh   r   propertyr-   r   r   r   r   r   r   r   r   r   r   r   r   r   r4   r4   r4   r5   r"   ,   sx    	

&

*

r"   c                 C   s,   | dkrt d}|jS tdt d|  )Nmake_edit_tree_lemmatizerzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler   r   r   )r,   moduler4   r4   r5   __getattr__  s   
r   )6r   syscollectionsr   	itertoolsr   typingr   r   r   r   r   r	   r
   r   numpyrt   r   	thinc.apir   r   r   r   thinc.typesr   r   r@   r   r   r   languager   tokensr   trainingr   r   r   r+   r   _edit_tree_internals.edit_treesr   _edit_tree_internals.schemasr   
lemmatizerr   trainable_piper   rf   default_model_configfrom_str"DEFAULT_EDIT_TREE_LEMMATIZER_MODELr"   r   r4   r4   r4   r5   <module>   s4    (  ^