o
    i}^                     @   sl  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlZd dlmZmZmZmZmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddlm*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 dZ4dZ5e 6e5d Z7dd Z8dd Z9G dd de3Z:dd Z;dS )     N)islice)Path)AnyCallableDictIterableListOptionalUnion)ConfigCosineDistanceModel	Optimizerset_dropout_rate)Floats2d   )util)Errors)	CandidateKnowledgeBase)Language)Scorer)DocSpan)Examplevalidate_examplesvalidate_get_examples)SimpleFrozenListregistry)Vocab   )EntityLinker_v1)deserialize_config)TrainablePipeTz
[model]
@architectures = "spacy.EntityLinker.v2"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
modelc                 K   s   t j| fdtjgi|S )Nnegative_labels)r   score_linksEntityLinkerNIL)exampleskwargs r+   P/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/entity_linker.pyentity_linker_score-   s   r-   c                   C   s   t S N)r-   r+   r+   r+   r,   make_entity_linker_scorer1      r/   c                $   @   sN  e Zd ZdZdZ	dHeedddedede	d	e
e	 d
ededededeeege
e f deee
e ge
e
e  f deeegef dedee dededee ddf"ddZde
e de
e fddZdeegef fddZdIdd Zddd!d"eg e
e f d#ee deeegef  fd$d%Zd&d' Zd(ddd)de
e d*ed+ee d,eee	ef  dee	ef f
d-d.Zde
e d/efd0d1Zd2e
e  de!e	 fd3d4Z"d2e
e  d5e!e	 ddfd6d7Z#e$ d8d9d:Z%e$ d8d;d<Z&e' d8d=e(e	e)f d>e
e	 ddfd?d@Z*e' d8d=e(e	e)f d>e
e	 dd fdAdBZ+dddCdDdEZ,dFdG Z-dS )Jr'   z^Pipeline component for named entity linking.

    DOCS: https://spacy.io/api/entitylinker
    r(   entity_linkerN)	overwritescorer	thresholdvocabr$   namelabels_discardn_sents
incl_priorincl_contextentity_vector_lengthget_candidatesget_candidates_batchgenerate_empty_kbr2   r3   use_gold_entscandidates_batch_sizer4   returnc                   s   |durd|  krdksn t tjjdd|d|_|_|_t|_|_	|_
|_|	_|
_d|i_tdd_|j|_|_|_|_|dk r[t tjdtt f fd	d
}|_dS )aU  Initialize an entity linker.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
        n_sents (int): The number of neighbouring sentences to take into account.
        incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
        incl_context (bool): Whether or not to include the local context in the model.
        entity_vector_length (int): Size of encoding vectors in the KB.
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        get_candidates_batch (
            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
            Iterable[Candidate]]
            ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
        scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
        use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
            component must provide entity annotations.
        candidates_batch_size (int): Size of batches for entity candidate generation.
        threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
            threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
        DOCS: https://spacy.io/api/entitylinker#init
        Nr   r    )range_start	range_endvaluer2   F)	normalizer)   c                    sf    s S j s | fi |S | } dd | D }t| |D ]\}}||_q# | fi |S )Nc                 s   s    | ]}|j V  qd S r.   	predicted.0egr+   r+   r,   	<genexpr>   s    zFEntityLinker.__init__.<locals>._score_with_ents_set.<locals>.<genexpr>)r?   _ensure_entspipeziprG   )r)   r*   docsrJ   docr3   selfr+   r,   _score_with_ents_set   s   
z3EntityLinker.__init__.<locals>._score_with_ents_set)
ValueErrorr   E1043formatr5   r$   r6   listr7   r8   r9   r:   r<   r=   cfgr   distancekbr?   r@   r4   E1044r   r   r3   )rR   r5   r$   r6   r7   r8   r9   r:   r;   r<   r=   r>   r2   r3   r?   r@   r4   rS   r+   rQ   r,   __init__=   s6   1



zEntityLinker.__init__r)   c                 C   sB   | j s|S g }|D ]}| \}}| }||j_|| q	|S )zLIf use_gold_ents is true, set the gold entities to (a copy of) eg.predicted.)r?   get_aligned_ents_and_nercopyrG   entsappend)rR   r)   new_examplesrJ   r_   _new_egr+   r+   r,   rL      s   zEntityLinker._ensure_ents	kb_loaderc                 C   s.   t |sttjjt|d|| j| _dS )ziDefine the KB of this pipe by providing a function that will
        create it using this object's vocab.)arg_typeN)callablerT   r   E885rV   typer5   rZ   )rR   rd   r+   r+   r,   set_kb   s   zEntityLinker.set_kbc                 C   sP   | j d u rttjj| jdt| j dr$| j  r&ttjj| jdd S d S )Nr6   is_empty)	rZ   rT   r   E1018rV   r6   hasattrrk   E139rR   r+   r+   r,   validate_kb   s
   
zEntityLinker.validate_kb)nlprd   get_examplesrq   c                C   s  t |d |dur| | |   | jj}g }g }| t| d}|D ]}|j}	||	 || j	j
| q%t|dksJJ tjj| jdt|dksZJ tjj| jdtdd |D }
|
sv|d }	|	dd }d	|_|f|	_| j	j|| j	j
j|d
dd |
sg |	_dS dS )a  Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
            instance. Note that providing this argument will overwrite all data accumulated in the current KB.
            Use this only when loading a KB as-such from file.

        DOCS: https://spacy.io/api/entitylinker#initialize
        EntityLinker.initializeN
   r   rj   c                 S      g | ]}|j qS r+   r_   )rI   rP   r+   r+   r,   
<listcomp>       z+EntityLinker.initialize.<locals>.<listcomp>r    XXXfloat32dtype)XY)r   ri   rp   rZ   r;   rL   r   xr`   r$   opsalloc1flenr   E923rV   r6   anylabel_r_   
initializeasarray)rR   rr   rq   rd   nO
doc_samplevector_sampler)   rJ   rP   has_annotationsentr+   r+   r,   r      s4   


  
rs   c                 C   s:   |D ]}|j jD ]}t| | j|}|r  dS qqdS )zCheck if a batch contains a learnable example.

        If one isn't present, then the update step needs to be skipped.
        TF)rG   r_   rW   r<   rZ   )rR   r)   rJ   r   
candidatesr+   r+   r,   batch_has_learnable_example   s   z(EntityLinker.batch_has_learnable_example        )dropsgdlossesr   r   r   c          
      C   s   |    |du r
i }|| jd |s|S | |}t|d | |s&|S t| j| dd |D }| j|\}}| j	||d\}}	||	 |durQ| 
| || j  |7  < |S )a.  Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/entitylinker#update
        Nr   EntityLinker.updatec                 S   ru   r+   rF   rH   r+   r+   r,   rw     rx   z'EntityLinker.update.<locals>.<listcomp>)sentence_encodingsr)   )rp   
setdefaultr6   rL   r   r   r   r$   begin_updateget_lossfinish_update)
rR   r)   r   r   r   rO   r   
bp_contextlossd_scoresr+   r+   r,   update   s*   




r   r   c                 C   s  t |d g }d}g }|D ]+}|jddd}| D ]}||j }	|	r3| j|	}
||
 || |d7 }qq| jjj	|dd}|| }|sT| jjj
|j }d|fS |j|jkrftjjd	d
d}t|| j||}| jjj
|j }|||< | j||}|t| }t||fS )NEntityLinker.get_lossr   	ENT_KB_IDT)	as_stringr    rz   r{   r   zgold entities do not match upmethodmsg)r   get_alignedget_matching_entsstartrZ   
get_vectorr`   r$   r   	asarray2falloc2fshaper   E147rV   RuntimeErrorrY   get_gradr   r   float)rR   r)   r   entity_encodingseidx	keep_entsrJ   kb_idsr   kb_identity_encodingselected_encodingsouterr	gradientsr   r+   r+   r,   r   ,  s<   




r   rO   c              	      sv     d}g }jjj}|s|S t|tr|g}t|D ]\}}t|dkr)qdd |jD }t	dt|j
jD ]h}|j
||j    fddt	t D }	tjdkrjj fdd|	D n	 fdd|	D }
t D ])\}}t|dsJ t|j}||d ||d	 f}|d |d   krdksJ  J jrtd|d j }tt|d |d j }|| j}|| j}|||  }j|gd }|j}|j|}|d7 }|jjv r|j qyt|
| }|s
|j qyt|dkr j d
u r ||d j! qyt"#| |$dd |D }j%s=|$dd |D }|}jr|$dd |D }|jj|dd}t|t|krht&t'j(j)ddd|*||||  }|j+|j+kr~t,t'j-|| ||  }|j d
u s| j kr||. /  j!nt0j qyq;qt||kst'j(j)ddd}t&||S )ap  Apply the pipeline's model to a batch of docs, without modifying them.
        Returns the KB IDs for each entity in each doc, including NIL if there is
        no prediction.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS (List[str]): The models prediction for each document.

        DOCS: https://spacy.io/api/entitylinker#predict
        r   c                 S   s   g | ]}|qS r+   r+   )rI   sr+   r+   r,   rw   i      z(EntityLinker.predict.<locals>.<listcomp>c                    s    g | ]} | j jvr|qS r+   )r   r7   rI   idx	ent_batchrR   r+   r,   rw   p  s
    r    c                    s   g | ]} | qS r+   r+   r   )r   r+   r,   rw   x  s    c                    s   g | ]} j | qS r+   )r<   rZ   r   r   r+   r,   rw   {  s    sentsNc                 S   ru   r+   )
prior_probrI   cr+   r+   r,   rw     rx   c                 S   s   g | ]}d qS )r   r+   )rI   rb   r+   r+   r,   rw     r   c                 S   ru   r+   )entity_vectorr   r+   r+   r,   rw     rx   )axispredictzvectors not of equal lengthr   z$result variables not of equal length)1rp   r$   r   xp
isinstancer   	enumerater   r   ranger_   r@   rW   r=   rZ   rm   indexr:   maxr8   minr   endas_docr   Tlinalgnormr   r7   r`   r(   r4   entity_randomshuffler   r9   r   r   r   rV   dotr   rT   E161argmaxitemr'   )rR   rO   entity_countfinal_kb_idsr   irP   	sentencesent_idxvalid_ent_idxbatch_candidatesjr   r   sent_indicesstart_sentenceend_sentencestart_token	end_tokensent_docsentence_encodingsentence_encoding_tsentence_normr   prior_probsscoresr   entity_normsimsr   r+   r   r,   r   T  s   





$




[zEntityLinker.predictr   c           
      C   s   t dd |D }|t |krttjj|t |dd}| jd }|D ]}|jD ]}|| }|d7 }|D ]}	|	jdks>|rA||	_q5q)q$dS )a  Modify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.

        DOCS: https://spacy.io/api/entitylinker#set_annotations
        c                 S   s   g | ]
}|j D ]}|qqS r+   rv   )rI   rP   r   r+   r+   r,   rw     s    z0EntityLinker.set_annotations.<locals>.<listcomp>)r_   idsr   r2   r    N)	r   rT   r   E148rV   rX   r_   	ent_kb_id
ent_kb_id_)
rR   rO   r   
count_entsr   r2   rP   r   r   tokenr+   r+   r,   set_annotations  s    

zEntityLinker.set_annotationsexcludec                   sf      i }tdrjdurfdd|d<  fdd|d< jj|d< jj|d< t| S )	zSerialize the pipe to a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/entitylinker#to_bytes
        rX   Nc                      s   t  jS r.   )srsly
json_dumpsrX   r+   ro   r+   r,   <lambda>      z'EntityLinker.to_bytes.<locals>.<lambda>c                      s   j j dS Nr   )r5   to_bytesr+   r   rR   r+   r,   r         r5   rZ   r$   )_validate_serialization_attrsrm   rX   rZ   r   r$   r   )rR   r   	serializer+   r   r,   r     s   zEntityLinker.to_bytesc                   sx      fdd}i }tdrjdurfdd|d<  fdd|d< fd	d|d
< ||d< t||  S )zLoad the pipe from a bytestring.

        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (TrainablePipe): The loaded object.

        DOCS: https://spacy.io/api/entitylinker#from_bytes
        c                    s.   z	 j |  W d S  ty   ttjd w r.   )r$   
from_bytesAttributeErrorrT   r   E149bro   r+   r,   
load_model  s
   z+EntityLinker.from_bytes.<locals>.load_modelrX   Nc                    s    j t| S r.   )rX   r   r   
json_loadsr  ro   r+   r,   r     rx   z)EntityLinker.from_bytes.<locals>.<lambda>c                       j j|  dS r   )r5   r  r  r   r+   r,   r     r   r5   c                        j | S r.   )rZ   r  r  ro   r+   r,   r     r   rZ   r$   )r   rm   rX   r   r  )rR   
bytes_datar   r  deserializer+   r   r,   r    s   zEntityLinker.from_bytespathr   c                   sX   i } fdd|d< fdd|d< fdd|d< fdd|d	< t ||  d
S )zSerialize the pipe to disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.

        DOCS: https://spacy.io/api/entitylinker#to_disk
        c                    r  r   )r5   to_diskpr   r+   r,   r     r   z&EntityLinker.to_disk.<locals>.<lambda>r5   c                    s   t |  jS r.   )r   
write_jsonrX   r  ro   r+   r,   r     r   rX   c                    r	  r.   )rZ   r  r  ro   r+   r,   r     r   rZ   c                    r	  r.   )r$   r  r  ro   r+   r,   r     r   r$   N)r   r  )rR   r  r   r   r+   r   r,   r    s   
zEntityLinker.to_diskc                   s\   fdd}i }fdd|d<  fdd|d< fdd|d	< ||d
< t ||  S )aN  Load the pipe from disk. Modifies the object in place and returns it.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (EntityLinker): The modified EntityLinker object.

        DOCS: https://spacy.io/api/entitylinker#from_disk
        c                    s`   z"|  d} j|  W d    W d S 1 sw   Y  W d S  ty/   ttjd w )Nrb)openr$   r  readr  rT   r   r  )r  infilero   r+   r,   r  *  s   &z*EntityLinker.from_disk.<locals>.load_modelc                    s    j t| S r.   )rX   r   r"   r  ro   r+   r,   r   2  r   z(EntityLinker.from_disk.<locals>.<lambda>rX   c                    r  r   )r5   	from_diskr  r   r+   r,   r   3  r   r5   c                    r	  r.   )rZ   r  r  ro   r+   r,   r   4  r   rZ   r$   )r   r  )rR   r  r   r  r  r+   r   r,   r    s   zEntityLinker.from_disk)r   r   c                K      t r.   NotImplementedError)rR   r)   r   r   configr+   r+   r,   rehearse9  r0   zEntityLinker.rehearsec                 C   r  r.   r  )rR   labelr+   r+   r,   	add_label<  r0   zEntityLinker.add_label)r1   )rA   N).__name__
__module____qualname____doc__r(   BACKWARD_OVERWRITEr-   r   r   strr   intboolr   r   r   r   r	   r   r\   r   rL   ri   rp   r   r   r   r   r   r   r   r   r   r   r   r   tupler   r  r   r
   r   r  r  r  r  r+   r+   r+   r,   r'   5   s    	

`

6

/(z



r'   c                 C   s,   | dkrt d}|jS tdt d|  )Nmake_entity_linkerzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler&  r  r  )r6   moduler+   r+   r,   __getattr__A  s   
r*  )<r'  r   sys	itertoolsr   pathlibr   typingr   r   r   r   r   r	   r
   r   	thinc.apir   r   r   r   r   thinc.typesr    r   errorsr   rZ   r   r   languager   r3   r   tokensr   r   trainingr   r   r   r   r   r5   r   legacy.entity_linkerr!   rM   r"   trainable_piper#   r!  default_model_configfrom_strDEFAULT_NEL_MODELr-   r/   r'   r*  r+   r+   r+   r,   <module>   s@    $    