o
    i                     @   s  d dl Z d dlZd dlZd dlmZmZ d dlZddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ ejd	g d
dddedefdd	Zejdg ddddefddZejddgddddededefddZG dd dZG dd dZdd ZdS )    N)AnyDict   )util)Warnings)Language)Matcher)Docmerge_noun_chunks)	token.depz	token.tagz	token.posT)requiresretokenizesdocreturnc                 C   sh   |  ds| S |  }| jD ]}|jj|jjd}|j||d qW d   | S 1 s-w   Y  | S )zMerge noun chunks into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.

    DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks
    DEP)tagdepattrsN)has_annotation
retokenizenoun_chunksrootr   r   merge)r   retokenizernpr    r   L/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/functions.pyr
      s   



merge_entities)zdoc.entsztoken.ent_iobztoken.ent_typec                 C   s^   |   !}| jD ]}|jj|jj|jd}|j||d qW d   | S 1 s(w   Y  | S )zMerge entities into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.

    DOCS: https://spacy.io/api/pipeline-functions#merge_entities
    )r   r   ent_typer   N)r   entsr   r   r   labelr   )r   r   entr   r   r   r   r   %   s   


merge_subtokensr   subtokr!   c                    s   t  j}|d|ddgg | }t fdd|D }  }|D ]}|| q'W d    S 1 s:w   Y   S )zMerge subtokens into a single token.

    doc (Doc): The Doc object.
    label (str): The subtoken dependency label.
    RETURNS (Doc): The Doc object with merged subtokens.

    DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
    SUBTOK+)r   opc                    s"   g | ]\}}} ||d   qS )   r   ).0_startendr   r   r   
<listcomp>G   s   " z#merge_subtokens.<locals>.<listcomp>N)r   vocabaddr   filter_spansr   r   )r   r!   mergermatchesspansr   spanr   r-   r   r#   9   s   


c                   @   s   e Zd ZddedefddZdedefdd	Zdeee	f fd
dZ
i fdeee	f ddfddZdd Zdd Zdd Zdd ZdS )TokenSplitterr   
min_lengthsplit_lengthc                 C   s   || _ || _d S Nr7   r8   )selfr7   r8   r   r   r   __init__O   s   
zTokenSplitter.__init__r   r   c              	   C   s   | j dkr^| jdkr^| H}|D ]<}t|j| j krMg }g }i }tdt|j| jD ]}||j||| j   |||| j f q+||||| qW d    |S 1 sYw   Y  |S )Nr   )r7   r8   r   lentextrangeappendsplit)r;   r   r   torthsheadsr   ir   r   r   __call__S   s"   



zTokenSplitter.__call__c                 C   s   | j | jdS )Nr:   r:   r;   r   r   r   _get_configa   s   zTokenSplitter._get_configconfigNc                 C   s    | dd| _| dd| _d S )Nr7   r   r8   )getr7   r8   )r;   rI   r   r   r   _set_configg   s   zTokenSplitter._set_configc                       d fddi}t |g S )Ncfgc                      s   t   S r9   )srsly
json_dumpsrH   r   rG   r   r   <lambda>m       z(TokenSplitter.to_bytes.<locals>.<lambda>r   to_bytesr;   kwargsserializersr   rG   r   rS   k      zTokenSplitter.to_bytesc                    "   d fddi}t ||g   S )NrM   c                         t| S r9   )rK   rN   
json_loadsbrG   r   r   rP   s       z*TokenSplitter.from_bytes.<locals>.<lambda>r   
from_bytesr;   datarU   deserializersr   rG   r   r_   q      zTokenSplitter.from_bytesc                    (   t |}d fddi}t ||g S )NrM   c                    s   t |   S r9   )rN   
write_jsonrH   prG   r   r   rP   {   r]   z'TokenSplitter.to_disk.<locals>.<lambda>r   ensure_pathto_diskr;   pathrU   rV   r   rG   r   rj   x      
zTokenSplitter.to_diskc                    ,   t |}d fddi}t ||g  d S )NrM   c                    rY   r9   )rK   rN   	read_jsonrf   rG   r   r   rP      r]   z)TokenSplitter.from_disk.<locals>.<lambda>r   ri   	from_diskrk   r   rG   r   rq         
zTokenSplitter.from_disk)r   r   )__name__
__module____qualname__intr<   r	   rF   r   strr   rH   rK   rS   r_   rj   rq   r   r   r   r   r6   N   s    r6   c                   @   s^   e Zd Zdddeeef defddZdedefd	d
Z	dd Z
dd Zdd Zdd ZdS )
DocCleanerT)silentr   ry   c                C   s   t ||d| _d S )N)r   ry   )dictrM   )r;   r   ry   r   r   r   r<      s   zDocCleaner.__init__r   r   c           
      C   s   | j d }| j d }| D ]M\}}|}|d}d}|d d D ]}	t||	r.t||	}q!d}|s<ttjj	|d q!|s[t||d rOt
||d | q|s[ttjj	|d q|S )Nr   ry   .FT)attr)rM   itemsrA   hasattrgetattrwarningswarnr   W116formatsetattr)
r;   r   r   ry   r}   valueobjpartsskippartr   r   r   rF      s(   



zDocCleaner.__call__c                    rL   )NrM   c                      s   t  jS r9   )rN   rO   rM   r   rG   r   r   rP      s    z%DocCleaner.to_bytes.<locals>.<lambda>rR   rT   r   rG   r   rS      rW   zDocCleaner.to_bytesc                    rX   )NrM   c                        j t| S r9   )rM   updaterN   rZ   r[   rG   r   r   rP          z'DocCleaner.from_bytes.<locals>.<lambda>r^   r`   r   rG   r   r_      rc   zDocCleaner.from_bytesc                    rd   )NrM   c                    s   t |  jS r9   )rN   re   rM   rf   rG   r   r   rP      rQ   z$DocCleaner.to_disk.<locals>.<lambda>rh   rk   r   rG   r   rj      rm   zDocCleaner.to_diskc                    rn   )NrM   c                    r   r9   )rM   r   rN   ro   rf   rG   r   r   rP      r   z&DocCleaner.from_disk.<locals>.<lambda>rp   rk   r   rG   r   rq      rr   zDocCleaner.from_diskN)rs   rt   ru   r   rw   r   boolr<   r	   rF   rS   r_   rj   rq   r   r   r   r   rx      s     rx   c                 C   sD   | dkrt d}|jS | dkrt d}|jS tdt d|  )Nmake_doc_cleanerzspacy.pipeline.factoriesmake_token_splitterzmodule z has no attribute )	importlibimport_moduler   r   AttributeErrorrs   )namemoduler   r   r   __getattr__   s   

r   )r$   )r   sysr   typingr   r   rN    r   errorsr   languager   matcherr   tokensr	   	componentr
   r   rw   r#   r6   rx   r   r   r   r   r   <module>   s6    97