o
    iS)                     @   s,  d dl Z d dlZd dlmZmZmZmZmZmZm	Z	 d dl
mZmZmZmZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! dZ"e #e"d Z$dd Z%dee dee&ef fddZ'dede	e(e(f fddZ)G dd de!Z*dd Z+dS )    N)AnyCallableDictIterableListOptionalTuple)ConfigModel	Optimizerset_dropout_rate)Floats2d   )Errors)Language)Scorer)DocSpan)Example)registry   )DEFAULT_SPANS_KEY)TrainablePipea  
[model]
@architectures = "spacy.SpanFinder.v1"

[model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 1000, 2500, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
modelc                   C   s   t S N)span_finder_score r   r   N/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/span_finder.pymake_span_finder_scorer.   s   r   examplesreturnc                    s   t |}d |d |d    |d fdd |dfdd |d	d
 |dd tj| fi |}||d  dd  |S )Nspans_	spans_keyattrgetterc                    s   | j |t d  g S r   )spansgetlen)dockey)attr_prefixr   r   <lambda>8   s    z#span_finder_score.<locals>.<lambda>has_annotationc                    s
    | j v S r   )r%   )r(   )r)   r   r   r+   :   s   
 allow_overlapTlabeledF	_per_type)dict
setdefaultr   score_spanspop)r   kwargsscoresr   )r*   r)   r   r   2   s   r   spanc                 C   s(   | d j }| d j t| d  }||fS )Nr   )idxr'   )r6   startendr   r   r   _char_indicesB   s   
r;   c                   @   s0  e Zd ZdZ	d'edddeddedeee	 e
f ded	ed
edee dee dee ddfddZdee	 fddZdee	 de
ddfddZdddddee dedee deeeef  deeef f
ddZdeee
f fdd Zdee
e
f fd!d"Zdd#d$eg ee f dee ddfd%d&ZdS )(
SpanFinderzUPipeline that learns span boundaries.

    DOCS: https://spacy.io/api/spanfinder
    span_finderg      ?N)r"   	threshold
max_length
min_lengthscorernlpr   namer"   r>   r?   r@   rA   r    c          	      C   sb   |j | _ |dur|dk s|dur|dk rttjj||d|| _|| _|| _||||d| _dS )a  Initialize the span finder.
        model (thinc.api.Model): The Thinc Model powering the pipeline
            component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        threshold (float): Minimum probability to consider a prediction
            positive.
        scorer (Optional[Callable]): The scoring method.
        spans_key (str): Key of the doc.spans dict to save the spans under.
            During initialization and training, the component will look for
            spans on the reference document under the same key.
        max_length (Optional[int]): Maximum length of the produced spans,
            defaults to None meaning unlimited length.
        min_length (Optional[int]): Minimum length of the produced spans,
            defaults to None meaning shortest span length is 1.

        DOCS: https://spacy.io/api/spanfinder#init
        Nr   )r@   r?   )r@   r?   r>   r"   )	vocab
ValueErrorr   E1053formatr   rC   rA   cfg)	selfrB   r   rC   r"   r>   r?   r@   rA   r   r   r   __init__N   s   zSpanFinder.__init__docsc                 C   s   | j |}|S )a  Apply the pipeline's model to a batch of docs, without modifying
        them.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: The models prediction for each document.

        DOCS: https://spacy.io/api/spanfinder#predict
        )r   predict)rI   rK   r5   r   r   r   rL   }   s   	zSpanFinder.predictr5   c              	   C   s&  d}t |D ]\}}g |j| jd < g }g }|||t|  }t||D ]"\}	}
|
d | jd kr8||	j |
d | jd krG||	j q%|D ]?}|D ]:}|d | }|dk r[qN| jd du si| jd |kr| jd du sw|| jd kr|j| jd  |||d   qNqJ|t|7 }qdS )a  Modify a batch of Doc objects, using pre-computed scores.
        docs (Iterable[Doc]): The documents to modify.
        scores: The scores to set, produced by SpanFinder predict method.

        DOCS: https://spacy.io/api/spanfinder#set_annotations
        r   r"   r>   r   r@   Nr?   )	enumerater%   rH   r'   zipappendi)rI   rK   r5   offsetrP   r(   startsends
doc_scorestokentoken_scorer9   r:   span_lengthr   r   r   set_annotations   s4   "zSpanFinder.set_annotations        )dropsgdlossesr   rZ   r[   r\   c          
      C   s   |du ri }| | jd dd |D }t| j| | j|\}}| ||\}}	||	 |dur7| | || j  |7  < |S )a?  Learn from a batch of documents and gold-standard information,
        updating the pipe's model. Delegates to predict and get_loss.
        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (Optional[thinc.api.Optimizer]): The optimizer.
        losses (Optional[Dict[str, float]]): Optional record of the loss during
            training. Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/spanfinder#update
        NrY   c                 S      g | ]}|j qS r   )	predicted.0egr   r   r   
<listcomp>       z%SpanFinder.update.<locals>.<listcomp>)r1   rC   r   r   begin_updateget_lossfinish_update)
rI   r   rZ   r[   r\   r^   r5   backprop_scoreslossd_scoresr   r   r   update   s   
zSpanFinder.updatec                 C   sF   |  || jj\}}|| jj| }||9 }t|d  }||fS )ab  Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
        examples (Iterable[Examples]): The batch of examples.
        scores: Scores representing the model's predictions.
        RETURNS (Tuple[float, Floats2d]): The loss and the gradient.

        DOCS: https://spacy.io/api/spanfinder#get_loss
        r   )_get_aligned_truth_scoresr   ops	asarray2ffloatsum)rI   r   r5   truthsmasksri   rh   r   r   r   re      s
   	zSpanFinder.get_lossc                 C   sZ  g }g }|D ]}|j j|jjkrttjjddt|j}|j	j
|dfdd}|j	j|dfdd}| jd |jjv r|jj| jd  D ]I}	t|	\}
}|jj|
|dd}t|\}}||
k}||k}|rod	||d
 jd
f< n	d
||d
 jd
f< |rd	||d jd	f< qDd
||d jd	f< qD|| || q|j	j|d
d}|j	j|d
d}||fS )z\Align scores of the predictions to the references for calculating
        the loss.
        r=   )	componentr   float32)dtyper"   expand)alignment_moder   r   r7   )axis)xtextyrE   r   E1054rG   r'   r^   xpzerosonesrH   	referencer%   r;   	char_spanrP   rO   concatenate)rI   r   rl   rp   rq   ra   n_tokenstruthmaskr6   ref_start_charref_end_char	pred_spanpred_start_charpred_end_charstart_match	end_matchr   r   r   rk      s8   

z$SpanFinder._get_aligned_truth_scores)rB   get_examplesc                C   sn   g }| D ]}t |dk r|| q|r0dd |D }| || jj\}}| jj||d dS | j  dS )a  Initialize the pipe for training, using a representative set
        of data examples.
        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Optional[Language]): The current nlp object the component is part
            of.

        DOCS: https://spacy.io/api/spanfinder#initialize
        
   c                 S   r]   r   )r   r_   r   r   r   rb     rc   z)SpanFinder.initialize.<locals>.<listcomp>)XYN)r'   rO   rk   r   rl   
initialize)rI   r   rB   subbatchra   rK   r   _r   r   r   r      s   

zSpanFinder.initialize)r=   )__name__
__module____qualname____doc__r   r   r   r
   r   r   r   strrn   r   intr   rJ   rL   rX   r   r   r   rj   r   re   rk   r   r   r   r   r   r<   H   sl    		

/'

 'r<   c                 C   s,   | dkrt d}|jS tdt d|  )Nmake_span_finderzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler   AttributeErrorr   )rC   moduler   r   r   __getattr__  s   
r   ),r   systypingr   r   r   r   r   r   r   	thinc.apir	   r
   r   r   thinc.typesr   errorsr   languager   rA   r   tokensr   r   trainingr   utilr   spancatr   trainable_piper   span_finder_default_configfrom_strDEFAULT_SPAN_FINDER_MODELr   r   r   r   r;   r<   r   r   r   r   r   <module>   s*    $ U