o
    ia                  	   @   s6  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlmZmZmZmZmZmZ d dlmZmZmZmZ ddlmZmZ dd	l m!Z! dd
l"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 dZ3dZ4dZ5e 6e3d Z7e 6e4d Z8eG dd deZ9ddde
e' dee: dee defddZ;ddde
e' de<dee defd d!Z=dee: de9fd"d#Z>d$e:d%e:de9fd&d'Z?de<de9fd(d)Z@d*e
e+ de	e<ef fd+d,ZAd-d. ZBeG d/d0 d0ZCG d1d2 d2e2ZDd3d4 ZEdS )5    N)	dataclass)partial)	AnyCallableDictIterableListOptionalTupleUnioncast)ConfigModelOps	Optimizerget_current_opsset_dropout_rate)Floats2dInts1dInts2dRagged   )Protocolruntime_checkable)Errors)Language)Scorer)DocSpan	SpanGroup)Examplevalidate_examples)registry)Vocab   )TrainablePipea4  
[model]
@architectures = "spacy.SpanCategorizer.v1"
scorer = {"@layers": "spacy.LinearLogistic.v1"}

[model.reducer]
@layers = spacy.mean_max_reducer.v1
hidden_size = 128

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 1000, 2500, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
a&  
[model]
@architectures = "spacy.SpanCategorizer.v1"
scorer = {"@layers": "Softmax.v2"}

[model.reducer]
@layers = spacy.mean_max_reducer.v1
hidden_size = 128

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 96
rows = [5000, 1000, 2500, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
scmodelc                   @   s0   e Zd Zdddee dee defddZdS )	SuggesterNopsdocsr*   returnc                C      d S N )selfr+   r*   r/   r/   J/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/spancat.py__call__S   s    zSuggester.__call__)	__name__
__module____qualname__r   r   r	   r   r   r2   r/   r/   r/   r1   r(   Q   s    (r(   r)   r+   sizesr*   r,   c             	   C   s  |d u rt  }g }g }| D ]U}|jjt|dd}|d}d}|D ]:}|t|krL|d t||d   }	||j|	|	| f ||d jd 7 }|r\|d jdks\J |d jq"|| q|	|}
t|dkrxt
|j||
}nt
|jjddd|
}|jjdksJ |S )	Nidtype)r$   r   r$   r:   r   r   r   )r   xparangelenreshapeappendhstackshapendim	asarray1ir   vstackzerosdataXd)r+   r6   r*   spanslengthsdocstartslengthsizestarts_sizelengths_arrayoutputr/   r/   r1   ngram_suggesterV   s.   

rQ   	spans_keyc          
      C   s   |d u rt  }g }g }| D ]#}d}|j| r+|j| D ]}||j|jg |d7 }q|| qtt|j|dd}t|dkrMt	|j|dd|}	|	S t	|j
jddd|}	|	S )Nr   r$   r7   r8   r;   )r   rH   r@   startendr   r   asarrayr>   r   r<   rF   )
r+   rR   r*   rH   rI   rJ   rL   spanrO   rP   r/   r/   r1   preset_spans_suggesters   s"   

rW   c                 C      t t| dS )zSuggest all spans of the given lengths. Spans are returned as a ragged
    array of integers. The array has two columns, indicating the start and end
    position.r6   )r   rQ   rY   r/   r/   r1   build_ngram_suggester   s   rZ   min_sizemax_sizec                 C   s   t t| |d }t|S )zSuggest all spans of the given lengths between a given min and max value - both inclusive.
    Spans are returned as a ragged array of integers. The array has two columns,
    indicating the start and end position.r$   )listrangerZ   )r[   r\   r6   r/   r/   r1   build_ngram_range_suggester   s   r_   c                 C   rX   )zSuggest all spans that are already stored in doc.spans[spans_key].
    This is useful when an upstream component is used to set the spans
    on the Doc such as a SpanRuler or SpanFinder.rR   )r   rW   r`   r/   r/   r1   build_preset_spans_suggester   s   ra   examplesc                    sn   t |}d |d |d    |dd |d fdd |d	fd
d tj| fi |S )Nspans_rR   attrallow_overlapTgetterc                    s   | j |t d  g S r.   )rH   getr>   )rJ   key)attr_prefixr/   r1   <lambda>   s    zspancat_score.<locals>.<lambda>has_annotationc                    s
    | j v S r.   )rH   )rJ   )rh   r/   r1   rj      s   
 )dict
setdefaultr   score_spans)rb   kwargsr/   )ri   rh   r1   spancat_score   s   rp   c                   C   s   t S r.   )rp   r/   r/   r/   r1   make_spancat_scorer   s   rq   c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	
_Intervalsz:
    Helper class to avoid storing overlapping spans.
    c                 C   s   t  | _d S r.   )setrangesr0   r/   r/   r1   __init__   s   z_Intervals.__init__c                 C   s    t ||D ]}| j| qd S r.   )r^   rt   add)r0   r7   jer/   r/   r1   rw      s   z_Intervals.addc                 C   s,   |\}}t ||D ]
}|| jv r dS q	dS )NTF)r^   rt   )r0   rangr7   rx   ry   r/   r/   r1   __contains__   s   
z_Intervals.__contains__N)r3   r4   r5   __doc__rv   rw   r{   r/   r/   r/   r1   rr      s
    rr   c                   @   st  e Zd ZdZ	dPdddddded	d
edeeee	 e
f ef dededededee dee dee dee dee ddfddZedefddZdQddZdedefddZedee fdd Zedee fd!d"Zedeeef fd#d$Zedefd%d&Zedeedf fd'd(Zd)ee	 fd*d+Z d,d-d)ee	 d.eddfd/d0Z!d)ee	 ddfd1d2Z"d3ddd4d5ee# d6ed7ee$ d8eeeef  deeef f
d9d:Z%d5ee# d;ee
ef deeef fd<d=Z&ddd>d?eg ee# f d@ee' dAeee  ddfdBdCZ(d5ee# fdDdEZ)dFe#fdGdHZ*dIe	dJe+dKede,fdLdMZ-	dRdIe	dJe+dKedede,f
dNdOZ.dS )SSpanCategorizerz_Pipeline component to label spans of text.

    DOCS: https://spacy.io/api/spancategorizer
    spancatFrH         ?TNg      ?)add_negative_labelrR   negative_weightre   max_positive	thresholdscorervocabr'   	suggesternamer   rR   r   re   r   r   r   r,   c                C   sj   g ||
|	||d| _ || _|| _|| _|| _|| _|| _|s/|	dur1|	dkr3ttj	j
|	ddS dS dS )a}  Initialize the multi-label or multi-class span categorizer.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
            For multi-class classification (single label per span) we recommend
            using a Softmax classifier as a the final layer, while for multi-label
            classification (multiple possible labels per span) we recommend Logistic.
        suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
            Spans are returned as a ragged array with two integer columns, for the
            start and end positions.
        name (str): The component instance name, used to add entries to the
            losses during training.
        spans_key (str): Key of the Doc.spans dict to save the spans under.
            During initialization and training, the component will look for
            spans on the reference document under the same key. Defaults to
            `"spans"`.
        add_negative_label (bool): Learn to predict a special 'negative_label'
            when a Span is not annotated.
        threshold (Optional[float]): Minimum probability to consider a prediction
            positive. Defaults to 0.5. Spans with a positive prediction will be saved
            on the Doc.
        max_positive (Optional[int]): Maximum number of labels to consider
            positive per span. Defaults to None, indicating no limit.
        negative_weight (float): Multiplier for the loss terms.
            Can be used to downweight the negative samples if there are too many
            when add_negative_label is True. Otherwise its unused.
        allow_overlap (bool): If True the data is assumed to contain overlapping spans.
            Otherwise it produces non-overlapping spans greedily prioritizing
            higher assigned label scores. Only used when max_positive is 1.
        scorer (Optional[Callable]): The scoring method. Defaults to
            Scorer.score_spans for the Doc.spans[spans_key] with overlapping
            spans allowed.

        DOCS: https://spacy.io/api/spancategorizer#init
        )labelsrR   r   r   r   re   Nr$   )r   )cfgr   r   r'   r   r   r   
ValueErrorr   E1051format)r0   r   r'   r   r   r   rR   r   re   r   r   r   r/   r/   r1   rv      s    3zSpanCategorizer.__init__c                 C      t | jd S )zKey of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
        rR   )strr   ru   r/   r/   r1   rh        zSpanCategorizer.keyc                 C   s   d}| j dr| j d}n| j dr'| j ddr'| j dd}|durB|| jkrD| jsFttj	j
| j| j dddS dS dS )z<Raise an error if the component can not add any more labels.NnOoutput_layer)r   r   )r'   has_dimget_dimhas_refget_ref	_n_labelsis_resizabler   r   E922r   r   )r0   r   r/   r/   r1   _allow_extra_label  s    z"SpanCategorizer._allow_extra_labellabelc                 C   sL   t |ts
ttj|| jv rdS |   | jd | | j	j
| dS )zAdd a new label to the pipe.

        label (str): The label to add.
        RETURNS (int): 0 if label is already present, otherwise 1.

        DOCS: https://spacy.io/api/spancategorizer#add_label
        r   r   r$   )
isinstancer   r   r   E187r   r   r   r@   r   stringsrw   )r0   r   r/   r/   r1   	add_label'  s   


zSpanCategorizer.add_labelc                 C   r   )zRETURNS (Tuple[str]): The labels currently added to the component.

        DOCS: https://spacy.io/api/spancategorizer#labels
        r   )tupler   ru   r/   r/   r1   r   8  r   zSpanCategorizer.labelsc                 C   s
   t | jS )zRETURNS (List[str]): Information about the component's labels.

        DOCS: https://spacy.io/api/spancategorizer#label_data
        )r]   r   ru   r/   r/   r1   
label_data@  s   
zSpanCategorizer.label_datac                 C   s   dd t | jD S )z(RETURNS (Dict[str, int]): The label map.c                 S   s   i | ]\}}||qS r/   r/   ).0r7   r   r/   r/   r1   
<dictcomp>K  s    z.SpanCategorizer._label_map.<locals>.<dictcomp>)	enumerater   ru   r/   r/   r1   
_label_mapH  s   zSpanCategorizer._label_mapc                 C   s   | j r
t| jd S t| jS )z RETURNS (int): Number of labels.r$   )r   r>   r   ru   r/   r/   r1   r   M  s   
zSpanCategorizer._n_labelsc                 C   s   | j rt| jS dS )z8RETURNS (Union[int, None]): Index of the negative label.N)r   r>   r   ru   r/   r/   r1   _negative_label_iU  s   
z!SpanCategorizer._negative_label_ir+   c                 C   sP   | j || jjd}|j dkr| jjdd}||fS | j||f}||fS )zApply the pipeline's model to a batch of docs, without modifying them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The models prediction for each document.

        DOCS: https://spacy.io/api/spancategorizer#predict
        r)   r   )r   r'   r*   rI   sumalloc2fpredict)r0   r+   indicesscoresr/   r/   r1   r   ]  s   zSpanCategorizer.predict
candidates)candidates_keyr   c                C   s`   | j || jjd}t||D ]\}}g |j|< |jD ]}|j| ||d |d   qqdS )ao  Use the spancat suggester to add a list of span candidates to a list of docs.
        This method is intended to be used for debugging purposes.

        docs (Iterable[Doc]): The documents to modify.
        candidates_key (str): Key of the Doc.spans dict to save the candidate spans under.

        DOCS: https://spacy.io/api/spancategorizer#set_candidates
        r)   r   r$   N)r   r'   r*   ziprH   rG   r@   )r0   r+   r   suggester_outputr   rJ   indexr/   r/   r1   set_candidatesl  s   

"zSpanCategorizer.set_candidatesc           
   
   C   s   |\}}d}t |D ]I\}}|| j}tt| jd }	| jd dkr8| ||||||j|   |	|j| j< n| 	||||||j|   |j| j< ||j| 7 }q
dS )a  Modify a batch of Doc objects, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        scores: The scores to set, produced by SpanCategorizer.predict.

        DOCS: https://spacy.io/api/spancategorizer#set_annotations
        r   re   r   r$   N)
r   rG   r   boolr   _make_span_group_singlelabelrI   rH   rh   _make_span_group_multilabel)
r0   r+   indices_scoresr   r   offsetr7   rJ   	indices_ire   r/   r/   r1   set_annotations~  s&   
zSpanCategorizer.set_annotations        )dropsgdlossesrb   r   r   r   c                C   s   |du ri }| | jd t|d | | tdd |D s"|S dd |D }| j|| jjd}|j	 d	kr;|S t
| j| | j||f\}}| |||f\}	}
||
 |durb| | || j  |	7  < |S )
a1  Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/spancategorizer#update
        Nr   SpanCategorizer.updatec                 s   s$    | ]}|j rt|j nd V  qdS )r   N)	predictedr>   r   egr/   r/   r1   	<genexpr>  s   " z)SpanCategorizer.update.<locals>.<genexpr>c                 S      g | ]}|j qS r/   )r   r   r/   r/   r1   
<listcomp>      z*SpanCategorizer.update.<locals>.<listcomp>r)   r   )rm   r   r!   _validate_categoriesanyr   r'   r*   rI   r   r   begin_updateget_lossfinish_update)r0   rb   r   r   r   r+   rH   r   backprop_scoreslossd_scoresr/   r/   r1   update  s&   


r   spans_scoresc                 C   s  |\}}t | jj|j| jj|j}tj|j|j	d}| j
r)t|jd }d}| j}t|D ]\\}	}
i }||	 j}t|j|	 D ]}t||df }t||df }|| |||f< qD| |
D ]"}|j|jf}||v r|| }||j }d|||f< | j
rd||< qd||j|	 7 }q2| jjj|dd}| j
rt|d }d||| jf< || }| j
rtt| jd }|dkr||  |9  < t|d  }||fS )	ak  Find the loss and gradient of loss for the batch of documents and
        their predicted scores.

        examples (Iterable[Examples]): The batch of examples.
        spans_scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, float]): The loss and the gradient.

        DOCS: https://spacy.io/api/spancategorizer#get_loss
        r8   r   r$   r   r   fr   r   )r   r'   r*   to_numpydatarI   numpyrF   rB   r9   r   onesr   r   rG   r^   int_get_aligned_spansrS   rT   label_rU   nonzeror   r   floatr   r   )r0   rb   r   rH   r   targetnegative_spansr   	label_mapr7   r   spans_indexspans_irx   rS   rT   	gold_spanrh   rowknegative_samplesr   
neg_weightr   r/   r/   r1   r     sJ   

zSpanCategorizer.get_loss)nlpr   get_examplesr   r   c                C   s   g }|dur|D ]}|  | q| D ]#}|du r+|jj| jg D ]}|  |j q"t|dk r6|| q|   |redd |D }t	dgd|}	| j
j|	jjd | j}
| j
j||	f|
d dS | j
  dS )	a|  Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Optional[Language]): The current nlp object the component is part of.
        labels (Optional[List[str]]): The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

        DOCS: https://spacy.io/api/spancategorizer#initialize
        N
   c                 S   r   r/   )xr   r/   r/   r1   r      r   z.SpanCategorizer.initialize.<locals>.<listcomp>r$   rY   r   )XY)r   	referencerH   rg   rh   r   r>   r@   _require_labelsrZ   r'   r*   r   rG   rB   r   
initialize)r0   r   r   r   subbatchr   r   rV   r+   rH   r   r/   r/   r1   r     s$   

zSpanCategorizer.initializec                 C   r-   r.   r/   )r0   rb   r/   r/   r1   r   '  s   z$SpanCategorizer._validate_categoriesr   c                 C   s   |j |jj| jg ddS )NT)re   )get_aligned_spans_y2xr   rH   rg   rh   )r0   r   r/   r/   r1   r   +  s   z"SpanCategorizer._get_aligned_spansrJ   r   r   c              
   C   s  t || jd}|jdkr|S | jj|}| jj|}| jd }| jd }||k}|dur~t|ts5J | j	r_t
|dd| jf }t
j |dd| jf< |d  }	||dd| jf< n|d  }	|	dd|df }
t|
D ]
\}}d|||f< qsg }t|jd D ]7}||df }||df }t|| D ]"\}}|r|| jkr|t|||| j| d	 ||||f  qqt
||jd
< |S )z5Find the top-k labels for each span (k=max_positive).r   r   r   r   Nr:   Fr$   r   r   )r   rh   rM   r'   r*   r   r   r   r   r   r   copyr   infargsortr   r^   rB   r@   r   r   arrayattrs)r0   rJ   r   r   rH   r   r   keepsnegative_scoresrankedspan_filterr7   r   attrs_scoresrS   rT   rx   keepr/   r/   r1   r   0  s@   



z+SpanCategorizer._make_span_group_multilabelc              	   C   s  |j dkrt|| jdS | jj|}| jj|}|jdd}tj|t	|ddd}tj
|jtd}| jr@t||| jk}| jd }|durSt|||k }|sm| d  }	||	 }||	 }||	 }||	 }t }
t|| jd}g }t|jd D ];}|| sq|| }||df }||df }|s||f|
v rq|
|| |||  |t|||| j| d	 qt||jd
< |S )z$Find the argmax label for each span.r   r   r$   )axisr8   r   Nr:   r   r   )rM   r   rh   r'   r*   r   argmaxr   take_along_axisexpand_dimsr   rB   r   r   logical_andr   r   squeezer   rr   r^   rw   r@   r   r   r   r   )r0   rJ   r   r   re   r   argmax_scoresr   r   sort_idxseenrH   r   r7   r   rS   rT   r/   r/   r1   r   Y  sJ   
	
z,SpanCategorizer._make_span_group_singlelabel)r~   )r,   N)T)/r3   r4   r5   r|   rp   r#   r   r
   r   r   r   r   r(   r   r   r	   r   r   r   rv   propertyrh   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r/   r/   r/   r1   r}      s    
	

C

 

)


B

&
.r}   c                 C   sD   | dkrt d}|jS | dkrt d}|jS tdt d|  )Nmake_spancatzspacy.pipeline.factoriesmake_spancat_singlelabelzmodule z has no attribute )	importlibimport_moduler  r  AttributeErrorr3   )r   moduler/   r/   r1   __getattr__  s   

r
  )Fr  sysdataclassesr   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   r   	thinc.apir   r   r   r   r   r   thinc.typesr   r   r   r   compatr   r   errorsr   languager   r   r   tokensr   r   r   trainingr    r!   utilr"   r   r#   trainable_piper%   spancat_default_config"spancat_singlelabel_default_configDEFAULT_SPANS_KEYfrom_strDEFAULT_SPANCAT_MODEL!DEFAULT_SPANCAT_SINGLELABEL_MODELr(   r   rQ   r   rW   rZ   r_   ra   rp   rq   rr   r}   r
  r/   r/   r/   r1   <module>   sx    , 

   N