o
    
iI                     @   sD  d dl mZmZmZmZmZmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZmZmZ d dlmZ d dlZd dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 dZ3dd Z4G dd de!Z5dS )    )OptionalIterableCallableDictUnionListAny)Floats2d)Path)isliceN)CosineDistanceModel	Optimizer)set_dropout_rate)KnowledgeBase	Candidate)empty_kb)DocSpan)deserialize_config)TrainablePipe)Language)Vocab)Examplevalidate_examplesvalidate_get_examples)ErrorsWarnings)SimpleFrozenList)util)ScorerTc                 K   s   t j| fdtjgi|S )Nnegative_labels)r    score_linksEntityLinker_v1NIL)exampleskwargs r'   Y/home/ubuntu/.local/lib/python3.10/site-packages/spacy_legacy/components/entity_linker.pyentity_linker_score   s   r)   c                   @   s  e Zd ZdZdZ	d?eeddedede	de
e	 d	ed
edededeeege
e f dedee ddfddZdeegef fddZd@ddZddddeg e
e f dee deeegef  fddZddddd e
e d!ed"ee d#eee	ef  dee	ef f
d$d%Zd e
e d&efd'd(Zd)e
e dee	 fd*d+Z d)e
e d,ee	 ddfd-d.Z!e" d/d0d1Z#e" d/d2d3Z$e% d/d4e&e	e'f d5e
e	 ddfd6d7Z(e% d/d4e&e	e'f d5e
e	 dd fd8d9Z)ddd:d;d<Z*d=d> Z+dS )Ar#   z^Pipeline component for named entity linking.

    DOCS: https://spacy.io/api/entitylinker
    r$   entity_linker)	overwritescorervocabmodelnamelabels_discardn_sents
incl_priorincl_contextentity_vector_lengthget_candidatesr+   r,   returnNc                C   sd   || _ || _|| _t|| _|| _|| _|| _|	| _d|
i| _	t
dd| _t|| j | _|| _dS )a  Initialize an entity linker.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
        n_sents (int): The number of neighbouring sentences to take into account.
        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
        incl_context (bool): Whether or not to include the local context in the model.
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        DOCS: https://spacy.io/api/entitylinker#init
        r+   F)	normalizeN)r-   r.   r/   listr0   r1   r2   r3   r5   cfgr   distancer   kbr,   )selfr-   r.   r/   r0   r1   r2   r3   r4   r5   r+   r,   r'   r'   r(   __init__+   s   


zEntityLinker_v1.__init__	kb_loaderc                 C   s.   t |sttjjt|d|| j| _dS )ziDefine the KB of this pipe by providing a function that will
        create it using this object's vocab.)arg_typeN)callable
ValueErrorr   E885formattyper-   r;   )r<   r>   r'   r'   r(   set_kbY   s   zEntityLinker_v1.set_kbc                 C   sD   | j d u rttjj| jdt| j dkr ttjj| jdd S )Nr/   r   )r;   rA   r   E1018rC   r/   lenE139r<   r'   r'   r(   validate_kba   s
   
zEntityLinker_v1.validate_kb)nlpr>   get_examplesrL   c                C   s   t |d |dur| | |   | jj}g }g }t| dD ]}||j || jj	
| q t|dksCJ tjj| jdt|dksSJ tjj| jd| jj|| jj	j|ddd dS )	a  Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
            Note that providing this argument, will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.

        DOCS: https://spacy.io/api/entitylinker#initialize
        EntityLinker_v1.initializeN
   r   rF   float32)dtype)XY)r   rE   rK   r;   r4   r   appendxr.   opsalloc1frH   r   E923rC   r/   
initializeasarray)r<   rM   rL   r>   nO
doc_samplevector_sampleexampler'   r'   r(   rY   h   s   

  
rN           )dropsgdlossesr%   r`   ra   rb   c             
   C   s|  |    |du r
i }|| jd |s|S t|d g }|D ]b}dd |jjD }|jddd}|jjD ]K}	||	j }
|
rz|	|	j
}W n tyR   ttjdw td	|| j }tt|d
 || j }|| j}|| j}|j||  }|| q4qt| j| |sttjjdd |S | j|\}}| j||d\}}|| |dur|  | || j  |7  < |S )a.  Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/entitylinker#update
        Nr_   EntityLinker_v1.updatec                 S      g | ]}|qS r'   r'   .0sr'   r'   r(   
<listcomp>       z*EntityLinker_v1.update.<locals>.<listcomp>	ENT_KB_IDT	as_stringr      zEntity LinkerrF   )sentence_encodingsr%   )!rK   
setdefaultr/   r   	referencesentsget_alignedentsstartindexsentAttributeErrorRuntimeErrorr   E030maxr1   minrH   end	predictedas_docrT   r   r.   warningswarnr   W093rC   begin_updateget_lossfinish_update)r<   r%   r`   ra   rb   sentence_docseg	sentenceskb_idsentkb_id
sent_indexstart_sentenceend_sentencestart_token	end_tokensent_docrn   
bp_contextlossd_scoresr'   r'   r(   update   sP   






rc   rn   c                 C   s   t |d g }|D ]"}|jddd}|jjD ]}||j }|r*| j|}|| qq	| jj	
|}|j|jkrEtjjddd}	t|	| j||}
| j||}|t| }t||
fS )NEntityLinker_v1.get_lossrj   Trk   r   zgold entities do not match upmethodmsg)r   rr   rp   rs   rt   r;   
get_vectorrT   r.   rV   	asarray2fshaper   E147rC   rx   r:   get_gradr   rH   float)r<   r%   rn   entity_encodingsr   r   r   r   entity_encodingerr	gradientsr   r'   r'   r(   r      s*   


r   docsc                 C   sx  |    d}g }|s|S t|tr|g}t|D ]\}}dd |jD }t|dkr&|jD ]}|j}||}	|	dks?J t	d|	| j
 }
tt|d |	| j
 }||
 j}|| j}|||  }| jjj}| jr| j|gd }|j}|j|}|d7 }|j| jv r|| j q/t| | j|}|s|| j q/t|dkr||d j q/t | |!dd |D }| j"s|!dd |D }|}| jr|!dd |D }|jj|dd}t|t|krt#t$j%j&d	d
d|'||||  }|j(|j(krt)t$j*|| ||  }|+ , }|| }||j q/qt||ks:t$j%j&d	dd}t#||S )ap  Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
        no prediction.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS (List[str]): The models prediction for each document.

        DOCS: https://spacy.io/api/entitylinker#predict
        r   c                 S   rd   r'   r'   re   r'   r'   r(   rh      ri   z+EntityLinker_v1.predict.<locals>.<listcomp>rm   c                 S      g | ]}|j qS r'   )
prior_probrf   cr'   r'   r(   rh         c                 S   s   g | ]}d qS )r_   r'   )rf   _r'   r'   r(   rh     ri   c                 S   r   r'   )entity_vectorr   r'   r'   r(   rh     r   )axispredictzvectors not of equal lengthr   z$result variables not of equal length)-rK   
isinstancer   	enumeraterq   rH   rs   rv   ru   rz   r1   r{   rt   r|   r~   r.   rV   xpr3   r   Tlinalgnormlabel_r0   rT   r$   r8   r5   r;   entity_randomshufflerZ   r2   rx   r   r   rC   dotr   rA   E161argmaxitem)r<   r   entity_countfinal_kb_idsidocr   r   rv   r   r   r   r   r   r   r   sentence_encodingsentence_encoding_tsentence_norm
candidatesprior_probsscoresr   entity_normsims
best_indexbest_candidater   r'   r'   r(   r      s   









zEntityLinker_v1.predictr   c           
      C   s   t dd |D }|t |krttjj|t |dd}| jd }|D ]}|jD ]}|| }|d7 }|D ]}	|	jdks>|rA||	_q5q)q$dS )a  Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.

        DOCS: https://spacy.io/api/entitylinker#set_annotations
        c                 S   s   g | ]
}|j D ]}|qqS r'   )rs   )rf   r   r   r'   r'   r(   rh   ?  s    z3EntityLinker_v1.set_annotations.<locals>.<listcomp>)rs   idsr   r+   rm   N)	rH   rA   r   E148rC   r9   rs   	ent_kb_id
ent_kb_id_)
r<   r   r   
count_entsr   r+   r   r   r   tokenr'   r'   r(   set_annotations7  s    

zEntityLinker_v1.set_annotationsexcludec                   sf      i }tdrjdurfdd|d<  fdd|d< jj|d< jj|d< t| S )	zSerialize the pipe to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/entitylinker#to_bytes
        r9   Nc                      s   t  jS N)srsly
json_dumpsr9   r'   rJ   r'   r(   <lambda>W      z*EntityLinker_v1.to_bytes.<locals>.<lambda>c                      s   j j dS Nr   )r-   to_bytesr'   r   r<   r'   r(   r   X      r-   r;   r.   )_validate_serialization_attrshasattrr9   r;   r   r.   r   )r<   r   	serializer'   r   r(   r   L  s   zEntityLinker_v1.to_bytesc                   sx      fdd}i }tdrjdurfdd|d<  fdd|d< fd	d|d
< ||d< t||  S )zLoad the pipe from a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (TrainablePipe): The loaded object.

        DOCS: https://spacy.io/api/entitylinker#from_bytes
        c                    s.   z	 j |  W d S  ty   ttjd w r   )r.   
from_bytesrw   rA   r   E149brJ   r'   r(   
load_modelg  s
   z.EntityLinker_v1.from_bytes.<locals>.load_modelr9   Nc                    s    j t| S r   )r9   r   r   
json_loadsr   rJ   r'   r(   r   o  r   z,EntityLinker_v1.from_bytes.<locals>.<lambda>c                       j j|  dS r   )r-   r   r   r   r'   r(   r   p  ri   r-   c                        j | S r   )r;   r   r   rJ   r'   r(   r   q  r   r;   r.   )r   r   r9   r   r   )r<   
bytes_datar   r   deserializer'   r   r(   r   ]  s   zEntityLinker_v1.from_bytespathr   c                   sX   i } fdd|d< fdd|d< fdd|d< fdd|d	< t ||  d
S )zSerialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/entitylinker#to_disk
        c                    r   r   )r-   to_diskpr   r'   r(   r     ri   z)EntityLinker_v1.to_disk.<locals>.<lambda>r-   c                    s   t |  jS r   )r   
write_jsonr9   r   rJ   r'   r(   r     r   r9   c                    r   r   )r;   r   r   rJ   r'   r(   r     r   r;   c                    r   r   )r.   r   r   rJ   r'   r(   r     r   r.   N)r   r   )r<   r   r   r   r'   r   r(   r   v  s   
zEntityLinker_v1.to_diskc                   s\   fdd}i }fdd|d<  fdd|d< fdd|d	< ||d
< t ||  S )aN  Load the pipe from disk. Modifies the object in place and returns it.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (EntityLinker): The modified EntityLinker object.

        DOCS: https://spacy.io/api/entitylinker#from_disk
        c                    s`   z"|  d} j|  W d    W d S 1 sw   Y  W d S  ty/   ttjd w )Nrb)openr.   r   readrw   rA   r   r   )r   infilerJ   r'   r(   r     s   &z-EntityLinker_v1.from_disk.<locals>.load_modelc                    s    j t| S r   )r9   r   r   r   rJ   r'   r(   r     ri   z+EntityLinker_v1.from_disk.<locals>.<lambda>r9   c                    r   r   )r-   	from_diskr   r   r'   r(   r     ri   r-   c                    r   r   )r;   r   r   rJ   r'   r(   r     r   r;   r.   )r   r   )r<   r   r   r   r   r'   r   r(   r     s   zEntityLinker_v1.from_disk)ra   rb   c                K      t r   NotImplementedError)r<   r%   ra   rb   configr'   r'   r(   rehearse     zEntityLinker_v1.rehearsec                 C   r   r   r   )r<   labelr'   r'   r(   	add_label  r   zEntityLinker_v1.add_label)r*   )r6   N),__name__
__module____qualname____doc__r$   BACKWARD_OVERWRITEr)   r   r   strr   intboolr   r   r   r   r   r=   rE   rK   r   r   rY   r   r   r   r   r	   r   r   r   r   r   tupler   r   r   r   r
   r   r   r   r   r'   r'   r'   r(   r#   #   s    	

.

'

AV



r#   )6typingr   r   r   r   r   r   r   thinc.typesr	   pathlibr
   	itertoolsr   r   r   	thinc.apir   r   r   r   r   spacy.kbr   r   spacy.mlr   spacy.tokensr   r   spacy.pipeline.piper   spacy.pipeline.trainable_piper   spacy.languager   spacy.vocabr   spacy.trainingr   r   r   spacy.errorsr   r   
spacy.utilr   spacyr   spacy.scorerr    r   r)   r#   r'   r'   r'   r(   <module>   s0   $