o
    i1J                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZmZ d dlZddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddlm'Z'm(Z(m)Z) ddl*m+Z+ e
e,ee,ee
e,ef  f f Z-dZ.dee$ dee$ dee$ fddZ/dd Z0dee$ dee$ dee$ fddZ1dd Z2e.ddee& de
e,ef fdd Z3e.fd!e,fd"d#Z4G d$d% d%e+Z5d&d' Z6dS )(    N)partial)Path)AnyCallableDictIterableListOptionalSequenceSetTupleUnioncast   )util)ErrorsWarnings)Language)MatcherPhraseMatcher)levenshtein_compare)Scorer)DocSpan)Example)SimpleFrozenListensure_pathregistry   )Piperulerentitiesspansreturnc                    s   dd }t ||dd}t| } g }t |D ]*}|j|j tfdd|D r@||  fdd| D } t  q| | S )	a  Merge entities and spans into one list without overlaps by allowing
    spans to overwrite any entities that they overlap with. Intended to
    replicate the overwrite_ents=True behavior from the EntityRuler.

    entities (Iterable[Span]): The entities, already filtered for overlaps.
    spans (Iterable[Span]): The spans to merge, may contain overlaps.
    RETURNS (List[Span]): Filtered list of non-overlapping spans.
    c                 S      | j | j | j fS Nendstartspan r+   M/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/span_ruler.py<lambda>0       z,prioritize_new_ents_filter.<locals>.<lambda>Tkeyreversec                 3       | ]}|j  vV  qd S r%   i.0tokenseen_tokensr+   r,   	<genexpr>8       z-prioritize_new_ents_filter.<locals>.<genexpr>c                    s$   g | ]}|j  k r|jks|qS r+   )r(   r'   )r6   er&   r+   r,   
<listcomp>:   s   $ z.prioritize_new_ents_filter.<locals>.<listcomp>)	sortedlistsetr(   r'   allappendupdaterange)r!   r"   get_sort_keynew_entitiesr*   r+   )r'   r9   r(   r,   prioritize_new_ents_filter%   s   
rG   c                   C      t S r%   )rG   r+   r+   r+   r,   make_prioritize_new_ents_filter?      rI   c                    s   dd }t ||dd}t| } g }t   jdd | D   |D ] }|j}|j}t fdd|D r@||  t|| q | | S )a  Merge entities and spans into one list without overlaps by prioritizing
    existing entities. Intended to replicate the overwrite_ents=False behavior
    from the EntityRuler.

    entities (Iterable[Span]): The entities, already filtered for overlaps.
    spans (Iterable[Span]): The spans to merge, may contain overlaps.
    RETURNS (List[Span]): Filtered list of non-overlapping spans.
    c                 S   r$   r%   r&   r)   r+   r+   r,   r-   N   r.   z1prioritize_existing_ents_filter.<locals>.<lambda>Tr/   c                 s   s    | ]
}t |j|jV  qd S r%   )rD   r(   r'   )r6   entr+   r+   r,   r:   S   s    z2prioritize_existing_ents_filter.<locals>.<genexpr>c                 3   r2   r%   r3   r5   r8   r+   r,   r:   W   r;   )	r>   r?   r@   rC   r(   r'   rA   rB   rD   )r!   r"   rE   rF   r*   r(   r'   r+   r8   r,   prioritize_existing_ents_filterC   s   
rL   c                   C   rH   r%   )rL   r+   r+   r+   r,   "make_preserve_existing_ents_filter]   rJ   rM   	spans_keyexamplesc                   sr   t |}d |d    |dd |dd |d fdd |d	fd
d tj| fi |S )Nspans_attrallow_overlapTlabeledgetterc                    s   | j |t d  g S r%   )r"   getlen)docr0   )attr_prefixr+   r,   r-   j       z1overlapping_labeled_spans_score.<locals>.<lambda>has_annotationc                    s
    | j v S r%   )r"   )rX   rN   r+   r,   r-   l   s   
 )dict
setdefaultr   score_spans)rP   rO   kwargsr+   )rY   rO   r,   overlapping_labeled_spans_scorea   s   r`   rO   c                 C   s   t t| dS )NrN   )r   r`   rN   r+   r+   r,   %make_overlapping_labeled_spans_scorerp   s   ra   c                   @   sv  e Zd ZdZ	dFeddejdeddee	edd	de
ded	ee d
eeee ee gee f  dedeee ee gee f deeeef  dedededee ddfddZdefddZdedefddZedee fddZdedefddZdefdd Zd!d" Zedeed#f fd$d%Zedeed#f fd&d'Zddd(d)eg ee f dee
 d*ee e!  fd+d,Z"ede#e! fd-d.Z$d*e#e! ddfd/d0Z%dGd1d2Z&deddfd3d4Z'd5eddfd6d7Z(dGd8d9Z)e* d:d;e+d<ee dd fd=d>Z,e* d:d<ee de+fd?d@Z-e* d:dAeee.f d<ee dd fdBdCZ/e* d:dAeee.f d<ee ddfdDdEZ0dS )H	SpanRulerzThe SpanRuler lets you add spans to the `Doc.spans` using token-based
    rules or exact phrase matches.

    DOCS: https://spacy.io/api/spanruler
    USAGE: https://spacy.io/usage/rule-based-matching#spanruler
    
span_rulerNFrN   )	rO   spans_filterannotate_entsents_filterphrase_matcher_attrmatcher_fuzzy_comparevalidate	overwritescorernlpnamerO   rd   re   rf   rg   rh   ri   rj   rk   r#   c       	         C   sT   || _ || _|| _|| _|| _|	| _|
| _|| _|| _|| _	|| _
i | _|   dS )a  Initialize the span ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        name (str): Instance name of the current pipeline component. Typically
            passed in automatically from the factory when the component is
            added. Used to disable the current span ruler while creating
            phrase patterns with the nlp object.
        spans_key (Optional[str]): The spans key to save the spans under. If
            `None`, no spans are saved. Defaults to "ruler".
        spans_filter (Optional[Callable[[Iterable[Span], Iterable[Span]], List[Span]]):
            The optional method to filter spans before they are assigned to
            doc.spans. Defaults to `None`.
        annotate_ents (bool): Whether to save spans to doc.ents. Defaults to
            `False`.
        ents_filter (Callable[[Iterable[Span], Iterable[Span]], List[Span]]):
            The method to filter spans before they are assigned to doc.ents.
            Defaults to `util.filter_chain_spans`.
        phrase_matcher_attr (Optional[Union[int, str]]): Token attribute to
            match on, passed to the internal PhraseMatcher as `attr`. Defaults
            to `None`.
        matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
            internal Matcher. Defaults to
            spacy.matcher.levenshtein.levenshtein_compare.
        validate (bool): Whether patterns should be validated, passed to
            Matcher and PhraseMatcher as `validate`.
        overwrite (bool): Whether to remove any existing spans under this spans
            key if `spans_key` is set, and/or to remove any ents under `doc.ents` if
            `annotate_ents` is set. Defaults to `True`.
        scorer (Optional[Callable]): The scoring method. Defaults to
            spacy.pipeline.span_ruler.overlapping_labeled_spans_score.

        DOCS: https://spacy.io/api/spanruler#init
        N)rl   rm   rO   re   rg   ri   rj   rd   rf   rk   rh   _match_label_id_mapclear)selfrl   rm   rO   rd   re   rf   rg   rh   ri   rj   rk   r+   r+   r,   __init__|   s   :zSpanRuler.__init__c                 C   s
   t | jS )z1The number of all labels added to the span ruler.)rW   	_patternsrp   r+   r+   r,   __len__   s   
zSpanRuler.__len__labelc                 C   s&   | j  D ]}|d |kr dS qdS )z+Whether a label is present in the patterns.ru   TF)rn   values)rp   ru   label_idr+   r+   r,   __contains__   s
   zSpanRuler.__contains__c                 C      | j S )z2Key of the doc.spans dict to save the spans under.rN   rs   r+   r+   r,   r0      s   zSpanRuler.keyrX   c              
   C   s\   |   }z| |}| || |W S  ty- } z|| j| |g|W  Y d}~S d}~ww )zFind matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.

        DOCS: https://spacy.io/api/spanruler#call
        N)get_error_handlermatchset_annotations	Exceptionrm   )rp   rX   error_handlermatchesr<   r+   r+   r,   __call__   s   
zSpanRuler.__call__c                    s      t ' tjddd ttttttf  t	 t
  }W d    n1 s2w   Y  t fdd|D }tt|S )Nignorez\[W036)messagec                 3   sD    | ]\}}}||krt  ||j| d  j| d dV  qdS )ru   id)ru   span_idN)r   rn   )r6   m_idr(   r'   rX   rp   r+   r,   r:      s    z"SpanRuler.match.<locals>.<genexpr>)_require_patternswarningscatch_warningsfilterwarningsr   r   r   intr?   matcherphrase_matcherr@   r>   )rp   rX   r   deduplicated_matchesr+   r   r,   r{      s   
zSpanRuler.matchc                 C   s   | j r(g }| j |jv r| js|j| j  }|| jr| ||n| ||j| j < | jrPg }| js5t|j}| ||}zt	||_W dS  t
yO   t
tjw dS )zModify the document in placeN)r0   r"   rj   extendrd   re   r?   entsrf   r>   
ValueErrorr   E854)rp   rX   r   r"   r+   r+   r,   r|      s&   

zSpanRuler.set_annotations.c                 C   s   t ttdd | jD S )zAll labels present in the match patterns.

        RETURNS (set): The string labels.

        DOCS: https://spacy.io/api/spanruler#labels
        c                 S   s   g | ]	}t t|d  qS ru   )r   strr6   pr+   r+   r,   r=     rZ   z$SpanRuler.labels.<locals>.<listcomp>tupler>   r@   rr   rs   r+   r+   r,   labels  s   zSpanRuler.labelsc                 C   s&   t ttdd | jD tdg S )zAll IDs present in the match patterns.

        RETURNS (set): The string IDs.

        DOCS: https://spacy.io/api/spanruler#ids
        c                 S   s   g | ]
}t t|d qS r   )r   r   rV   r   r+   r+   r,   r=   "      z!SpanRuler.ids.<locals>.<listcomp>Nr   rs   r+   r+   r,   ids  s    zSpanRuler.ids)rl   patternsget_examplesr   c                C   s   |    |r| | dS dS )a  Initialize the pipe for training.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        patterns (Optional[Iterable[PatternType]]): The list of patterns.

        DOCS: https://spacy.io/api/spanruler#initialize
        N)ro   add_patterns)rp   r   rl   r   r+   r+   r,   
initialize%  s   zSpanRuler.initializec                 C   ry   )zGet all patterns that were added to the span ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.

        DOCS: https://spacy.io/api/spanruler#patterns
        )rr   rs   r+   r+   r,   r   9  s   zSpanRuler.patternsc                 C   s  z&d}t | jjD ]\}\}}| |kr|} nq	dd | jj|d D }W n ty1   g }Y nw | jj|d g }g }|D ]]}	tt|	d }
tt|	dd}t	|
|f}|
|d	| j
| jjj|< t|	d
 trz|| ||	d
  nt|	d
 tr| j||	d
 g nttjj|	d
 d| j|	 q@t|| j|D ]\}}| j||g qW d   dS 1 sw   Y  dS )a  Add patterns to the span ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'ORG', 'pattern': 'Apple', 'id': 'apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.

        DOCS: https://spacy.io/api/spanruler#add_patterns
        c                 S   s   g | ]}|qS r+   r+   )r6   piper+   r+   r,   r=   W      z*SpanRuler.add_patterns.<locals>.<listcomp>N)disableru   r    )ru   r   pattern)r   )	enumeraterl   pipeline
pipe_namesr   select_pipesr   r   rV   reprrn   vocabstringsas_int
isinstancerB   r?   r   addr   E097formatrr   zipr   r   )rp   r   current_indexr4   rm   r   subsequent_pipesphrase_pattern_labelsphrase_pattern_textsentryp_labelp_idru   r   r+   r+   r,   r   C  sF   

"zSpanRuler.add_patternsc                 C   s:   g | _ t| jj| j| jd| _t| jj| j| jd| _	dS )zfReset all patterns.

        RETURNS: None
        DOCS: https://spacy.io/api/spanruler#clear
        )ri   fuzzy_compare)rR   ri   N)
rr   r   rl   r   ri   rh   r   r   rg   r   rs   r+   r+   r,   ro   s  s   zSpanRuler.clearc                    s    | vrt tjjd | jd fdd| jD | _| jD ])}| j| d  krG| jjj	
|}|| jv r<| j| || jv rG| j| qdS )zRemove a pattern by its label.

        label (str): Label of the pattern to be removed.
        RETURNS: None
        DOCS: https://spacy.io/api/spanruler#remove
        ru   	attr_typeru   	componentc                    s   g | ]
}|d   kr|qS r   r+   r   r   r+   r,   r=     r   z$SpanRuler.remove.<locals>.<listcomp>N)r   r   E1024r   rm   rr   rn   rl   r   r   	as_stringr   remover   )rp   ru   m_labelm_label_strr+   r   r,   r     s   


zSpanRuler.remove
pattern_idc                    s   t | } fdd| jD | _|t | kr!ttjjd | jd| jD ])}| j| d  krM| jj	j
|}|| jv rB| j| || jv rM| j| q$dS )zRemove a pattern by its pattern ID.

        pattern_id (str): ID of the pattern to be removed.
        RETURNS: None
        DOCS: https://spacy.io/api/spanruler#remove_by_id
        c                    s   g | ]}| d  kr|qS r   )rV   r   r   r+   r,   r=     s    z*SpanRuler.remove_by_id.<locals>.<listcomp>IDr   r   N)rW   rr   r   r   r   r   rm   rn   rl   r   r   r   r   r   r   )rp   r   orig_lenr   r   r+   r   r,   remove_by_id  s"   


zSpanRuler.remove_by_idc                 C   s*   t | dkrttjj| jd dS dS )z:Raise a warning if this component has no patterns defined.r   )rm   N)rW   r   warnr   W036r   rm   rs   r+   r+   r,   r     s   zSpanRuler._require_patterns)exclude
bytes_datar   c                   s*       d fddi}t|||  S )zLoad the span ruler from a bytestring.

        bytes_data (bytes): The bytestring to load.
        RETURNS (SpanRuler): The loaded span ruler.

        DOCS: https://spacy.io/api/spanruler#from_bytes
        r   c                         t| S r%   )r   srsly
json_loads)brs   r+   r,   r-     r   z&SpanRuler.from_bytes.<locals>.<lambda>)ro   r   
from_bytes)rp   r   r   deserializersr+   rs   r,   r     s
   
zSpanRuler.from_bytesc                   s   d fddi}t ||S )zSerialize the span ruler to a bytestring.

        RETURNS (bytes): The serialized patterns.

        DOCS: https://spacy.io/api/spanruler#to_bytes
        r   c                      s   t  jS r%   )r   
json_dumpsr   r+   rs   r+   r,   r-     s    z$SpanRuler.to_bytes.<locals>.<lambda>)r   to_bytes)rp   r   serializersr+   rs   r,   r     s   zSpanRuler.to_bytespathc                   s2       t|}d fddi}t||i   S )zLoad the span ruler from a directory.

        path (Union[str, Path]): A path to a directory.
        RETURNS (SpanRuler): The loaded span ruler.

        DOCS: https://spacy.io/api/spanruler#from_disk
        r   c                    r   r%   )r   r   
read_jsonlr   rs   r+   r,   r-     r   z%SpanRuler.from_disk.<locals>.<lambda>)ro   r   r   	from_disk)rp   r   r   r   r+   rs   r,   r     s   
zSpanRuler.from_diskc                   s*   t |}d fddi}t||i  dS )zSave the span ruler patterns to a directory.

        path (Union[str, Path]): A path to a directory.

        DOCS: https://spacy.io/api/spanruler#to_disk
        r   c                    s   t |  jS r%   )r   write_jsonlr   r   rs   r+   r,   r-     s    z#SpanRuler.to_disk.<locals>.<lambda>N)r   r   to_disk)rp   r   r   r   r+   rs   r,   r     s   	zSpanRuler.to_disk)rc   )r#   N)1__name__
__module____qualname____doc__DEFAULT_SPANS_KEYr   filter_chain_spansr   r   r`   r   r   r	   r   r   r   boolr   r   rq   rt   rx   propertyr0   r   r   r{   r|   r   r   r   r   r
   PatternTyper   r   r   r   ro   r   r   r   r   bytesr   r   r   r   r   r+   r+   r+   r,   rb   t   s    
	

H	

	
0




rb   c                 C   sD   | dkrt d}|jS | dkrt d}|jS tdt d|  )Nmake_span_rulerzspacy.pipeline.factoriesmake_entity_rulerzmodule z has no attribute )	importlibimport_moduler   make_future_entity_rulerAttributeErrorr   )rm   moduler+   r+   r,   __getattr__  s   

r   )7r   sysr   	functoolsr   pathlibr   typingr   r   r   r   r   r	   r
   r   r   r   r   r   r   r   errorsr   r   languager   r   r   r   matcher.levenshteinr   rk   r   tokensr   r   trainingr   r   r   r   r   r   r   r   r   rG   rI   rL   rM   r`   ra   rb   r   r+   r+   r+   r,   <module>   s^    4 



   