o
    i&                 	   @   sz  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'Z'd dl(Z(d d	l)m*Z* d d
l+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1 ddl2m3Z3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@ ddlAmBZBmCZC ddlDmEZE ddlFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZO ddlPmQZQ ddlRmSZS ddlTmUZU ddlVmWZW ddlXmYZYmZZZ ddl[m\Z\m]Z] ddl5m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZf ddlgmhZh ddlimjZjmkZk eeUgeUf Zleemjnd Zoe5peoZqeemjnd  Zre#d!ZsG d"d# d#Ztd$ed%geSf fd&d'Zud(d) ZvG d*d% d%Zwd+exd$exfd,d-ZyeG d.d/ d/ZzG d0d1 d1e{Z|d2eeY d$eeY fd3d4Z}d5ee$e~eUef esgeUf d6eed7eeU f  d8e"exexexf d$dfd9d:ZG d;d< d<ZG d=d> d>Ze ZdS )?    N)	ExitStackcontextmanager)deepcopy)	dataclass)chaincycle)Path)default_timer)AnyCallableDictIterableIteratorListNoReturnOptionalPatternSequenceSetTupleTypeVarUnioncastoverload)Pool)ConfigCupyOps	Optimizerget_current_ops)convert_recursive   )abouttyutil)Literal)ErrorsWarnings)GIT_VERSION)TOKENIZER_INFIXESTOKENIZER_PREFIXESTOKENIZER_SUFFIXES)BASE_EXCEPTIONS	URL_MATCH)load_lookups)analyze_pipesprint_pipe_analysisvalidate_attrs)ConfigSchemaConfigSchemaInitConfigSchemaNlpConfigSchemaPretrainvalidate_init_settings)Scorer	Tokenizer)Doc)
Underscore)Examplevalidate_examples)init_tok2vec
init_vocab)	_DEFAULT_EMPTY_PIPESCONFIG_SECTION_ORDERSimpleFrozenDictSimpleFrozenList_pipecombine_score_weightsraise_errorregistrywarn_if_jupyter_cupy)BaseVectors)Vocabcreate_vocabzdefault_config.cfgzdefault_config_pretraining.cfg_AnyContextc                   @   s   e Zd ZU dZeedZeed< eZ	e
eee f ed< eZeeeeef   ed< eZeeeeef   ed< eZeeeeef   ed< dZee ed	< eZee ed
< i Ze
eef ed< i Ze
eeegef f ed< e  Z!e"e ed< ddddZ#dS )BaseDefaultszLanguage data defaults, available via Language.Defaults. Can be
    overwritten by language subclasses by defining their own subclasses of
    Language.Defaults.
    section_orderconfigtokenizer_exceptionsprefixessuffixesinfixesNtoken_match	url_matchsyntax_iteratorslex_attr_getters
stop_wordsltrT)	directionhas_casehas_letters)$__name__
__module____qualname____doc__r   r@   rO   __annotations__r+   rP   r   strr   dictr)   rQ   r   r   r   r   r*   rR   r(   rS   rT   r   r,   rU   rV   rW   intr
   setrX   r   writing_system rg   rg   B/home/ubuntu/.local/lib/python3.10/site-packages/spacy/language.pyrL   X   s   
 rL   returnLanguagec                  C   s   dddt fdd} | S )zRegistered function to create a tokenizer. Returns a factory that takes
    the nlp object and returns a Tokenizer instance using the language detaults.
    nlprj   ri   c              	   S   sx   | j j}| j j}| j j}|rt|jnd }|rt|jnd }|r(t|j	nd }t
| j| j j|||| j j| j jdS )N)rulesprefix_searchsuffix_searchinfix_finditerrT   rU   )DefaultsrQ   rR   rS   r#   compile_prefix_regexsearchcompile_suffix_regexcompile_infix_regexfinditerr8   vocabrP   rT   rU   )rk   rQ   rR   rS   rm   rn   ro   rg   rg   rh   tokenizer_factoryp   s   z+create_tokenizer.<locals>.tokenizer_factoryr7   )rw   rg   rg   rh   create_tokenizerk   s   rx   c                 C   s   t jd| t| |d}|S )Nz+Loading lookups from spacy-lookups-data: %s)langtables)r#   loggerdebugr-   )ry   rz   lookupsrg   rg   rh   load_lookups_data   s   r~   c                       s  e Zd ZU dZeZdZee e	d< e
ZeejdZi Zeedf e	d< 	ddi ddd	d
deeef dedeeef deed geegef f  deedgef  deddfddZ fddZedd Zedeeef fddZejdeeef ddfddZede fddZ!e!jde ddfddZ!ede"e fd d!Z#ede"e fd"d#Z$ede"e%ee&f  fd$d%Z'ede"e fd&d'Z(ede"e%ee&f  fd(d)Z)ede"e fd*d+Z*edeeef fd,d-Z+edeee"e f fd.d/Z,e-d0edefd1d2Z.e-d0edefd3d4Z/e-d0eddfd5d6Z0e-d0eddddfd7d8Z1d0eddfd9d:Z2d0ede fd;d<Z3e-e e4 e4 d=e dd>d0ed?eeef d@e5e dAe5e dBedCeeee6 f dDee defdEdFZ7e-e4 e4 d=ddGd0ed@e5e dAe5e dBedDee& dedHef fdIdJZ8g dKd=dLdMe"e dNedeeeef  fdOdPZ9d0ede&fdQdRZ:	de dddSdTed0ee dUeeef dVee  dWede&fdXdYZ;dZed[d d0ede%e&ef fd\d]Z<	dddddde ddd^dTed0ee d_eeeef  d`eeeef  daee dbee d[ed  dUeeef dVee  dWede&fdcddZ=				dd_eeeef  d`eeeef  daee dbee def
dedfZ>d0edefdgdhZ?e ddid0edTedUeeef dWede&f
djdkZ@dledmeddfdndoZAd0ede%ee&f fdpdqZBd0eddfdrdsZCd0eddfdtduZDe4 ddvdweeef dxe5e dyeeeeeef f  defdzd{ZEdd}d~ZFddddxeeee5e f  deeee5e f  dd|fddZGdwedefddZHdeeeeIf defddZJdeeeeIf deKdefddZL	ddddde4 e4 dde5eM dee de6deeN deeee6f  dyeeeeeef f  de5e de5e fddZOddde4 dde5eM deeN deeee6f  dyeeeeeef f  de5e deee6f fddZP	ddddeeg e5eM f  deeN deNfddZQ	ddddeeg e5eM f  deeN deNfddZRdddeeN deNfddZSdeee&e"e eTgeUf fddZVddddd=dde5eM dee deeW dyeeeeeef f  deeeef  dedeeef fddZXdd ZYeZdee[ fddZ\e]dHdHdHdHdHdde5eeef  de^d= dee dxe5e dyeeeeeef f  dede_e fddZ`e]dHdHdHdHdHdde5e%eeef eKf  de^d dee dxe5e dyeeeeeef f  dede_e%eeKf  fddZ`d=de4 ddddee5eeef  e5e%eeef eKf  f dedee dxe5e dyeeeeeef f  dedee_e e_e%eeKf  f fddZ`dxe5e fddZade5eeef  de5edHe_e f  dedede_e f
ddZbdddZce-i fdededede ddddUeeeef e f deeef dxeee5e f deee5e f deee5e f deeef dedWedd fddZededede5e ddfddǄZfeZddeeg de_eg fddʄZhe4 d˜deeeif de5e ddfdd΄Zjekdxeee5e f deee5e f de5e de%edHf fddфZle4 e dҜdeeeif de5e deeef dd fddՄZme4 d˜de5e deIfddׄZne4 d˜deIde5e dd fddڄZo  ZpS )rj   a  A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.

    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.

    DOCS: https://spacy.io/api/language
    Nry   errorFactoryMeta_factory_metaTi@B i  )
max_lengthmetarx   create_vectors
batch_sizerv   r   r   rx   r   rI   r   ri   c                K   sp  ddl m} |  tjj  t| j| _	t
|| _d| _d| _i | _i | _t|ts<|dur<ttjj|ttd|du rj|di d}	t| j| j|	d}|sdd| j	d	 d i}
t|
d }|||_n| jr|jr| j|jkrttjj| j|jd
|| _| jdu r| jj| _g | _t  | _!|| _"|sd| j	d	 d i}t|d }|| | _#|| _$t%| _&dS )a  Initialise a Language object.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
        meta (dict): Custom meta data for the Language class. Is written to by
            models to add model meta data.
        max_length (int): Maximum number of characters in a single text. The
            current models may run out memory on extremely long texts, due to
            large internal allocations. You should segment these texts into
            meaningful units, e.g. paragraphs, subsections etc, before passing
            them to spaCy. Default maximum length is 1,000,000 charas (1mb). As
            a rule of thumb, if all pipeline components are enabled, spaCy's
            default models currently requires roughly 1GB of temporary memory per
            100,000 characters in one text.
        create_tokenizer (Callable): Function that takes the nlp object and
            returns a tokenizer.
        batch_size (int): Default batch size for pipe and evaluate.

        DOCS: https://spacy.io/api/language#init
        r    )register_factoriesNT)rv   
vocab_typevectorsname)vectors_namerk   )rk   rv   	tokenizer)'pipeline.factoriesr   r#   rF   _entry_point_factoriesget_allDEFAULT_CONFIGmergedefault_config_configrc   _meta_path
_optimizer
_pipe_meta_pipe_configs
isinstancerI   
ValueErrorr%   E918formattypegetrJ   ry   rp   resolver   E150rv   _componentsre   	_disabledr   r   r   rE   default_error_handler)selfrv   r   r   rx   r   r   kwargsr   r   vectors_cfgtokenizer_cfgrg   rg   rh   __init__   s@   




zLanguage.__init__c                    s6   t  jdi | t| jj| _| j| jd d< d S )Nrk   ry   rg   )super__init_subclass__r   r   rp   rO   r   ry   )clsr   	__class__rg   rh   r      s   zLanguage.__init_subclass__c                 C   s   | j S N)r   r   rg   rg   rh   path   s   zLanguage.pathc                 C   s0  t tj}| jjr| jd| jj n| jd| j | jdd | jdd | jd| | jdd | jd	d | jd
d | jdd | jdd | jdt | jj	t
| jj| jjj| jjj| jjjd| jd< t| j| jd< t| j| jd< t| j| jd< t| j| jd< | jS )zCustom meta data of the language class. If a model is loaded, this
        includes details from the model's meta.json.

        RETURNS (Dict[str, Any]): The meta.

        DOCS: https://spacy.io/api/language#meta
        ry   r   pipelineversionz0.0.0spacy_versiondescription authoremailurllicensespacy_git_version)widthr   keysr   moder   labels
componentsdisabled)r#   get_minor_version_ranger!   __version__rv   ry   r   
setdefaultr'   vectors_lengthlenr   n_keysr   r   rc   pipe_labelslist
pipe_namescomponent_namesr   )r   r   rg   rg   rh   r      s0   	
zLanguage.metavaluec                 C   
   || _ d S r   )r   r   r   rg   rg   rh   r        
c                 C   s   | j di  | j di  | j| j d d< i }g }| jD ]}| |}| |}d|ji|||< |jr;||j qt	| j| j d d< t	| j
| j d d< || j d< | j d di }t||}|| j d d< t| j szttjj| j d	| j S )
zTrainable config for the current language instance. Includes the
        current pipeline components, as well as default training config.

        RETURNS (thinc.api.Config): The config.

        DOCS: https://spacy.io/api/language#config
        rk   trainingry   factoryr   r   r   score_weightsrO   )r   r   ry   r   get_pipe_metaget_pipe_configr   default_score_weightsappendr   r   r   rD   srslyis_json_serializabler   r%   E961r   )r   r   r   	pipe_name	pipe_metapipe_configprev_weightscombined_score_weightsrg   rg   rh   rO     s*   	




zLanguage.configc                 C   r   r   )r   r   rg   rg   rh   rO   :  r   c                    *    fdd j D }t|tjjdddS )ziGet the names of all disabled components.

        RETURNS (List[str]): The disabled components.
        c                    s   g | ]\}}| j v r|qS rg   r   ).0r   _r   rg   rh   
<listcomp>F      z%Language.disabled.<locals>.<listcomp>r   attrr   r   rB   r%   E926r   r   namesrg   r   rh   r   >     zLanguage.disabledc                 C   s   t | j }t|S )z_Get names of all available factories.

        RETURNS (List[str]): The factory names.
        )r   	factoriesr   rB   r   rg   rg   rh   factory_namesI  s   zLanguage.factory_namesc                 C   s   t | jtjjdddS )zoGet all (name, component) tuples in the pipeline, including the
        currently disabled components.
        r   r   r   )rB   r   r%   r   r   r   rg   rg   rh   r   R  s   zLanguage.componentsc                 C   &   dd | j D }t|tjjdddS )zGet the names of the available pipeline components. Includes all
        active and inactive pipeline components.

        RETURNS (List[str]): List of component name strings, in order.
        c                 S      g | ]\}}|qS rg   rg   r   r   r   rg   rg   rh   r   b      z,Language.component_names.<locals>.<listcomp>r   r   r   r   r   rg   rg   rh   r   [  s   zLanguage.component_namesc                    r   )zThe processing pipeline consisting of (name, component) tuples. The
        components are called on the Doc in order as it passes through the
        pipeline.

        RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
        c                    s"   g | ]\}}| j vr||fqS rg   r   )r   npr   rg   rh   r   m  s   " z%Language.pipeline.<locals>.<listcomp>r   r   r   r   )r   pipesrg   r   rh   r   e  r   zLanguage.pipelinec                 C   r   )zGet names of available active pipeline components.

        RETURNS (List[str]): List of component name strings, in order.
        c                 S   r   rg   rg   r   rg   rg   rh   r   v  r   z'Language.pipe_names.<locals>.<listcomp>r   r   r   )r   rB   r%   r   r   r   rg   rg   rh   r   p  s   zLanguage.pipe_namesc                 C   s,   i }| j D ]\}}| |j||< qt|S )zGet the component factories for the available pipeline components.

        RETURNS (Dict[str, str]): Factory names, keyed by component names.
        )r   r   r   rA   )r   r   r   piperg   rg   rh   pipe_factoriesy  s   zLanguage.pipe_factoriesc                 C   sJ   i }| j D ]\}}t|dr|jdu rqt|dr t|j||< qt|S )zGet the labels set by the pipeline components, if available (if
        the component exposes a labels property and the labels are not
        hidden).

        RETURNS (Dict[str, List[str]]): Labels keyed by component name.
        hide_labelsTr   )r   hasattrr   r   r   rA   )r   r   r   r   rg   rg   rh   r     s   
zLanguage.pipe_labelsr   c                 C   s   |  |}|tjv p|tjv S )z=RETURNS (bool): Whether a factory of that name is registered.)get_factory_namerF   r   r   r   internal_namerg   rg   rh   has_factory  s   
zLanguage.has_factoryc                 C   s   | j du r|S | j  d| S )zGet the internal factory name based on the language subclass.

        name (str): The factory name.
        RETURNS (str): The internal factory name.
        N.)ry   )r   r   rg   rg   rh   r     s   
zLanguage.get_factory_namec                 C   sF   |  |}|| jv r| j| S || jv r| j| S ttjjd|d)zGet the meta information for a given factory name.

        name (str): The component factory name.
        RETURNS (FactoryMeta): The meta for the given factory name.
        r   r   r   )r   r   r   r%   E967r   r   rg   rg   rh   get_factory_meta  s   




zLanguage.get_factory_metac                 C   s   || j | |< dS )zSet the meta information for a given factory name.

        name (str): The component factory name.
        value (FactoryMeta): The meta to set.
        N)r   r   )r   r   r   rg   rg   rh   set_factory_meta  s   zLanguage.set_factory_metac                 C   s(   || j vrttjjd|d| j | S )zGet the meta information for a given component name.

        name (str): The component name.
        RETURNS (FactoryMeta): The meta for the given component name.
        	componentr   )r   r   r%   r   r   r   r   rg   rg   rh   r     s   

zLanguage.get_pipe_metac                 C   s*   || j vrttjj|d| j | }|S )zGet the config used to create a pipeline component.

        name (str): The component name.
        RETURNS (Config): The config used to create the pipeline component.
        r   )r   r   r%   E960r   )r   r   r   rg   rg   rh   r     s   

zLanguage.get_pipe_configF)r   assignsrequiresretokenizesr   funcr   r  r  r  r   r	  c          
         s   t tsttjjdddv rttjjdt ts/tjjdt	d}t|dt
dt
f fd	d
}	|durH|	|S |	S )a~  Register a new pipeline component factory. Can be used as a decorator
        on a function or classmethod, or called as a function with the factory
        provided as the func keyword argument. To create a component and add
        it to the pipeline, you can use nlp.add_pipe(name).

        name (str): The name of the component factory.
        default_config (Dict[str, Any]): Default configuration, describing the
            default values of the factory arguments.
        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        requires (Iterable[str]): Doc/Token attributes required by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        default_score_weights (Dict[str, Optional[float]]): The scores to report during
            training, and their default weight towards the final score used to
            select the best model. Weights should sum to 1.0 per component and
            will be combined and normalized for the whole pipeline. If None,
            the score won't be shown in the logs or be weighted.
        func (Optional[Callable]): Factory function if not used as a decorator.

        DOCS: https://spacy.io/api/language#factory
        r   	decoratorr   r  zdefault configstyler   cfg_typefactory_funcri   c              	      s    }|tjv r#tj|}t| |s#tjj|| d}t	|t
| }d|vs0d|vr9t	tjjdtjj|| d tt tt d}| ttj tjd_| S )N)r   r	  new_funcrk   r   r  )r	  )r   r   r  r  scoresr   r  r   )r   rF   r   r   r#   is_same_funcr%   E004r   r   get_arg_namesE964registerr   r0   r   r   r  rA   r   E957)r  r   existing_funcerr	arg_namesfactory_metar  r   r   r   r   r  r  rg   rh   add_factory  s4   



	z%Language.factory.<locals>.add_factoryN)r   rb   r   r%   E963r   E853rc   E962r   r   )
r   r   r   r  r  r  r   r	  r  r  rg   r  rh   r     s   
#

")zLanguage.factoryr  r  r  r	  .c                   s   durt tsttjjdddv rttjjddur%ntdt	dt
f fdd	}durC|S |S )
a  Register a new pipeline component. Can be used for stateless function
        components that don't require a separate factory. Can be used as a
        decorator on a function or classmethod, or called as a function with the
        factory provided as the func keyword argument. To create a component and
        add it to the pipeline, you can use nlp.add_pipe(name).

        name (str): The name of the component factory.
        assigns (Iterable[str]): Doc/Token attributes assigned by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        requires (Iterable[str]): Doc/Token attributes required by this component,
            e.g. "token.ent_id". Used for pipeline analysis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        func (Optional[Callable[[Doc], Doc]): Factory function if not used as a decorator.

        DOCS: https://spacy.io/api/language#component
        Nr  r
  r   r  component_funcri   c                    s   t trttjjddtdtf fdd}}|t	j
v rAt	j
|}|j}|r7dd |D d nd }t| rA|}j|d	  S )
Nr  r   ri   c                    s    S r   rg   rk   r   r"  rg   rh   r  Q     z?Language.component.<locals>.add_component.<locals>.factory_funcc                 S   s   g | ]}|j qS rg   )cell_contents)r   crg   rg   rh   r   _  s    z=Language.component.<locals>.add_component.<locals>.<listcomp>r   r!  )r   r   r   r%   E965r   rb   PipeCallabler   rF   r   r   __closure__r#   r  r   )r"  r  r   r  closurewrappedr  r   component_namer	  r   r  r  r$  rh   add_componentM  s$   


z)Language.component.<locals>.add_component)r   rb   r   r%   r  r   r  r#   get_object_namer)  r   )r   r   r  r  r  r	  r/  rg   r-  rh   r  +  s   
"zLanguage.component)r  r  r  r  )r   prettyr   r1  c                C   s    t | |d}|rt||d |S )a  Analyze the current pipeline components, print a summary of what
        they assign or require and check that all requirements are met.

        keys (List[str]): The meta values to display in the table. Corresponds
            to values in FactoryMeta, defined by @Language.factory decorator.
        pretty (bool): Pretty-print the results.
        RETURNS (dict): The data.
        )r   )r.   r/   )r   r   r1  analysisrg   rg   rh   r.   p  s   zLanguage.analyze_pipesc                 C   s6   | j D ]\}}||kr|  S qttjj|| jd)zGet a pipeline component for a given component name.

        name (str): Name of pipeline component to get.
        RETURNS (callable): The pipeline component.

        DOCS: https://spacy.io/api/language#get_pipe
        r   opts)r   KeyErrorr%   E001r   r   )r   r   r   r  rg   rg   rh   get_pipe  s
   zLanguage.get_pipe)rO   
raw_configvalidatefactory_namerO   r8  r9  c                C   s^  |dur|n|}t |tstjjd|t|d}t|t|s*ttj	j|d| 
|sFtjj|d| jdt| | jd}t|| |}|jrVt|j|}| |}|tjvrb|}| |d|d	|i}||i}	tj|	|d
}
tjd|	| i|d
d }t|}||d< |d	d |dd |dd |r||}|| j|< |
| S )a  Create a pipeline component. Mostly used internally. To create and
        add a component to the pipeline, you can use nlp.add_pipe.

        factory_name (str): Name of component factory.
        name (Optional[str]): Optional name to assign to component instance.
            Defaults to factory name if not set.
        config (Dict[str, Any]): Config parameters to use for this component.
            Will be merged with default config, if available.
        raw_config (Optional[Config]): Internals: the non-interpolated config.
        validate (bool): Whether to validate the component config against the
            arguments and types expected by the factory.
        RETURNS (Callable[[Doc], Doc]): The pipeline component.

        DOCS: https://spacy.io/api/language#create_pipe
        NrO   r  r   , create_pipe)r   r4  methodry   	lang_coder#  z
@factories)r9  cfgr   rk   r   )r   rc   r%   r   r   r   r   r   r   r   r   E002joinr   r#   r0  ry   r   r   r   r   r   rF   r   r   fillpopr   )r   r:  r   rO   r8  r9  r  r   r   r?  resolvedfilledrg   rg   rh   r<    sD   








zLanguage.create_pipesource_namesourcec                C   s   t |tsttjj|t|d| jj|jjkr#t	
tjj|d ||jvrBttjj||jd  d|jd  d|jd||}t|drO||_|j }t|d | }|| j|< | jj|jjkrx|jjD ]	}| jj| qn||d	 fS )
au  Create a pipeline component by copying it from an existing model.

        source_name (str): Name of the component in the source pipeline.
        source (Language): The source nlp object to copy from.
        name (str): Optional alternative name to use in current pipeline.
        RETURNS (Tuple[Callable[[Doc], Doc], str]): The component and its factory name.
        )r   rG  r  ry   r   r   r;  )r   modelr4  r   r   )r   rj   r   r%   E945r   r   rv   r   warningswarnr&   W113r   r5  E944r   rA  r7  r   r   rO   interpolater#   copy_configr   stringsadd)r   rF  rG  r   r   source_configr   srg   rg   rh   create_pipe_from_source  s,   






z Language.create_pipe_from_source)beforeafterfirstlastrG  rO   r8  r9  rU  rV  rW  rX  c                C   s   t |tst|}tjj||d}t||dur|n|}|| jv r-ttjj|| jdd|v r>t	
tjj|dd |durM| j|||d\}}n
| j||||	|
d}| ||||}| || j|< | j|||f |   |S )a  Add a component to the processing pipeline. Valid components are
        callables that take a `Doc` object, modify it and return it. Only one
        of before/after/first/last can be set. Default behaviour is "last".

        factory_name (str): Name of the component factory.
        name (str): Name of pipeline component. Overwrites existing
            component.name attribute if available. If no name is set and
            the component exposes no name attribute, component.__name__ is
            used. An error is raised if a name already exists in the pipeline.
        before (Union[str, int]): Name or index of the component to insert new
            component directly before.
        after (Union[str, int]): Name or index of the component to insert new
            component directly after.
        first (bool): If True, insert component first in the pipeline.
        last (bool): If True, insert component last in the pipeline.
        source (Language): Optional loaded nlp object to copy the pipeline
            component from.
        config (Dict[str, Any]): Config parameters to use for this component.
            Will be merged with default config, if available.
        raw_config (Optional[Config]): Internals: the non-interpolated config.
        validate (bool): Whether to validate the component config against the
            arguments and types expected by the factory.
        RETURNS (Callable[[Doc], Doc]): The pipeline component.

        DOCS: https://spacy.io/api/language#add_pipe
        r  r   Nr3  r   )name_in_configr  )r   rO   r8  r9  )r   rb   reprr%   E966r   r   r   E007rJ  rK  r&   W119rC  rT  r<  _get_pipe_indexr   r   r   insert_link_components)r   r:  r   rU  rV  rW  rX  rG  rO   r8  r9  bad_valr  pipe_component
pipe_indexrg   rg   rh   add_pipe   s2   
(
zLanguage.add_pipec                 C   s~  ||||d}t dd ||||fD dkr!ttjj|| jd|s/tdd |||fD s4t| jS |r8dS t	|t
rS|| jvrMttjj|| jd| j|S t	|t
rp|| jvrhttjj|| jd| j|d	 S t|tkr|t| jks|dk rtjjd
|| jd}t||S t|tkr|t| jks|dk rtjjd|| jd}t||d	 S ttjj|| jd)a  Determine where to insert a pipeline component based on the before/
        after/first/last values.

        before (str): Name or index of the component to insert directly before.
        after (str): Name or index of component to insert directly after.
        first (bool): If True, insert component first in the pipeline.
        last (bool): If True, insert component last in the pipeline.
        RETURNS (int): The index of the new pipeline component.
        )rU  rV  rW  rX  c                 s       | ]}|d uV  qd S r   rg   )r   argrg   rg   rh   	<genexpr>W      z+Language._get_pipe_index.<locals>.<genexpr>   )argsr4  c                 s   rf  r   rg   )r   r   rg   rg   rh   rh  [  ri  r   r3  r    rU  )diridxr4  rV  )sumr   r%   E006r   r   anyr   r   r   rb   r6  indexr   rd   E959)r   rU  rV  rW  rX  all_argsr  rg   rg   rh   r_  F  sH   




zLanguage._get_pipe_indexc                 C   s
   || j v S )a   Check if a component name is present in the pipeline. Equivalent to
        `name in nlp.pipe_names`.

        name (str): Name of the component.
        RETURNS (bool): Whether a component of the name exists in the pipeline.

        DOCS: https://spacy.io/api/language#has_pipe
        )r   r  rg   rg   rh   has_pipe}  s   
	zLanguage.has_pipe)rO   r9  c                C   s   || j vrttjj|| jdt|dr#tjjt||d}t|| j 	|}| 
| t| jr:|t| jkrC| j||||dS | j|||||dS )aB  Replace a component in the pipeline.

        name (str): Name of the component to replace.
        factory_name (str): Factory name of replacement component.
        config (Optional[Dict[str, Any]]): Config parameters to use for this
            component. Will be merged with default config, if available.
        validate (bool): Whether to validate the component config against the
            arguments and types expected by the factory.
        RETURNS (Callable[[Doc], Doc]): The new pipeline component.

        DOCS: https://spacy.io/api/language#replace_pipe
        r3  __call__rY  )r   rO   r9  )r   rU  rO   r9  )r   r   r%   r6  r   r   r   E968r[  rq  remove_piper   r   re  )r   r   r:  rO   r9  r  rd  rg   rg   rh   replace_pipe  s$   


zLanguage.replace_pipeold_namenew_namec                 C   s   || j vrttjj|| j d|| j v r ttjj|| j d| j |}|| j| d f| j|< | j	|| j|< | j
	|| j
|< || jd d v r`| jd d 	|}|| jd d |< |   dS )zRename a pipeline component.

        old_name (str): Name of the component to rename.
        new_name (str): New name of the component.

        DOCS: https://spacy.io/api/language#rename_pipe
        r3  r    
initializer   N)r   r   r%   r6  r   r]  rq  r   r   rC  r   r   ra  )r   ry  rz  iinit_cfgrg   rg   rh   rename_pipe  s    

zLanguage.rename_pipec                 C   s   || j vrttjj|| j d| j| j |}| j| | j	| | j
di |d || jd d v rD| jd d | || jv rO| j| |   |S )a  Remove a component from the pipeline.

        name (str): Name of the component to remove.
        RETURNS (Tuple[str, Callable[[Doc], Doc]]): A `(name, component)` tuple of the removed component.

        DOCS: https://spacy.io/api/language#remove_pipe
        r3  _sourced_vectors_hashesNr{  r   )r   r   r%   r6  r   r   rC  rq  r   r   r   r   r   r   r   removera  )r   r   removedrg   rg   rh   rw    s   

zLanguage.remove_pipec                 C   s0   || j vrttjj|| j d| j| dS )a  Disable a pipeline component. The component will still exist on
        the nlp object, but it won't be run as part of the pipeline. Does
        nothing if the component is already disabled.

        name (str): The name of the component to disable.
        r3  N)r   r   r%   r6  r   r   rQ  r  rg   rg   rh   disable_pipe  s   
zLanguage.disable_pipec                 C   s>   || j vrttjj|| j d|| jv r| j| dS dS )zEnable a previously disabled pipeline component so it's run as part
        of the pipeline. Does nothing if the component is already enabled.

        name (str): The name of the component to enable.
        r3  N)r   r   r%   r6  r   r   r   r  r  rg   rg   rh   enable_pipe  s
   

zLanguage.enable_pipe)disablecomponent_cfgtextr  r  c          	      C   s  |  |}|du ri }| jD ]u\}}||v rqt|ds(ttjjt||d| j}t|dr4|	 }z||fi |
|i }W n/ tyX } z
ttjj|d|d}~w tyq } z||||g| W Y d}~nd}~ww t|tsttjj|t|dq|S )a  Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
            the doc will be passed directly to the pipeline, skipping
            `Language.make_doc`.
        disable (List[str]): Names of the pipeline components to disable.
        component_cfg (Dict[str, dict]): An optional dictionary with extra
            keyword arguments for specific components.
        RETURNS (Doc): A container for accessing the annotations.

        DOCS: https://spacy.io/api/language#call
        Nru  rY  get_error_handlerr  )r   returned_type)_ensure_docr   r   r   r%   E003r   r   r   r  r   r5  E109	Exceptionr   r9   E005)	r   r  r  r  docr   procerror_handlererg   rg   rh   ru    s0   



zLanguage.__call__DisabledPipesc                 G   s@   t tjt t|dkrt|d ttfr|d }| j	|dS )aV  Disable one or more pipeline components. If used as a context
        manager, the pipeline will be restored to the initial state at the end
        of the block. Otherwise, a DisabledPipes object is returned, that has
        a `.restore()` method you can use to undo your changes.

        This method has been deprecated since 3.0
        r    r   )r  )
rJ  rK  r&   W096DeprecationWarningr   r   r   tupleselect_pipesr   rg   rg   rh   disable_pipes'  s   zLanguage.disable_pipes)r  enabler  c                   s    du r|du rt tjt|tr|g} durAt tr! g  fddjD }|dur?||kr?t tjj |jd|}|dusGJ fdd|D }t|S )a  Disable one or more pipeline components. If used as a context
        manager, the pipeline will be restored to the initial state at the end
        of the block. Otherwise, a DisabledPipes object is returned, that has
        a `.restore()` method you can use to undo your changes.

        disable (str or iterable): The name(s) of the pipes to disable
        enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled

        DOCS: https://spacy.io/api/language#select_pipes
        Nc                       g | ]}| vr|qS rg   rg   )r   r   r  rg   rh   r   K      z)Language.select_pipes.<locals>.<listcomp>)r  r  r   c                    s   g | ]	}| j vr|qS rg   r   )r   dr   rg   rh   r   W  s    )	r   r%   E991r   rb   r   E992r   r  )r   r  r  
to_disablerg   )r  r   rh   r  4  s$   



zLanguage.select_pipesc                 C   s2   t || jkrttjjt || jd| |S )z{Turn a text into a Doc object.

        text (str): The text to process.
        RETURNS (Doc): The processed doc.
        )lengthr   )r   r   r   r%   E088r   r   )r   r  rg   rg   rh   make_docZ  s
   
zLanguage.make_docdoc_likec                 C   sR   t |tr|S t |tr| |S t |trt| j|S ttj	j
t|d)zCreate a Doc if need be, or raise an error if the input is not
        a Doc, string, or a byte array (generated by Doc.to_bytes()).)r   )r   r9   rb   r  bytesrv   
from_bytesr   r%   E1041r   r   )r   r  rg   rg   rh   r  f  s   



zLanguage._ensure_doccontextc                 C   s   |  |}||_|S )z>Call _ensure_doc to generate a Doc and set its context object.)r  _context)r   r  r  r  rg   rg   rh   _ensure_doc_with_contextq  s   
z!Language._ensure_doc_with_contextg        )dropsgdlossesr  exclude	annotatesexamplesr   r  r  r  r  r  c             	   C   s  |dur	t tj|du ri }t|trt|dkr|S t|d t|}|du r6| jdu r3| 	 | _| j}|du r<i }i }	t
| jD ]%\}
\}}||i  t|| |	|< || d| |	| d| j qC| jD ]V\}}||vrt|dr|j|fd|d||  |dvr||vrt|tjr|jr|jd	vr|| ||v rttd
d |D ||| j|	| d|D ]\}}||_qqlt|S )a<  Update the models in the pipeline.

        examples (Iterable[Example]): A batch of examples
        _: Should not be set - serves to catch backwards-incompatible scripts.
        drop (float): The dropout rate.
        sgd (Optimizer): An optimizer.
        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
            component.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        exclude (Iterable[str]): Names of components that shouldn't be updated.
        annotates (Iterable[str]): Names of components that should set
            annotations on the predicted examples after updating.
        RETURNS (Dict[str, float]): The updated losses dictionary

        DOCS: https://spacy.io/api/language#update
        Nr   Language.updater  r   updater  r  )NF)TFNc                 s       | ]}|j V  qd S r   	predictedr   egrg   rg   rh   rh        z"Language.update.<locals>.<genexpr>)r  r   r   r   )r   r%   E989r   r   r   r<   _copy_examplesr   create_optimizer	enumerater   r   r   r   r   r  r"   TrainableComponentis_trainablerH  finish_updateziprC   r   r  _replace_numpy_floats)r   r  r   r  r  r  r  r  r  pipe_kwargsr|  r   r  r  r  rg   rg   rh   r  y  sZ   







r  )r  r  r  r  c                   s  |du ri }t |trt|dkr|S t|d |du r)| jdu r&|  | _| j}t| j}t| |du r9i }i   fdd}|j	|_	|j
|_
|j|_|D ] \}}	||v s\t|	ds]qOi  |	j|f||d||i  qO  D ]\}
\}}||
|| qt|S )a|  Make a "rehearsal" update to the models in the pipeline, to prevent
        forgetting. Rehearsal updates run an initial copy of the model over some
        data, and update the model so its current predictions are more like the
        initial ones. This is useful for keeping a pretrained model on-track,
        even if you're updating it with a smaller set of examples.

        examples (Iterable[Example]): A batch of `Example` objects.
        sgd (Optional[Optimizer]): An optimizer.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        exclude (Iterable[str]): Names of components that shouldn't be updated.
        RETURNS (dict): Results from the update.

        EXAMPLE:
            >>> raw_text_batches = minibatch(raw_texts)
            >>> for labelled_batch in minibatch(examples):
            >>>     nlp.update(labelled_batch)
            >>>     raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
            >>>     nlp.rehearse(raw_batch)

        DOCS: https://spacy.io/api/language#rehearse
        Nr   Language.rehearsec                    s   ||f | < ||fS r   rg   )keyWdWgradsrg   rh   	get_grads  s   z$Language.rehearse.<locals>.get_gradsrehearser  )r   r   r   r<   r   r  r   randomshuffle
learn_rateb1b2r   r  r   items)r   r  r  r  r  r  r   r  r   r  r  r  r  rg   r  rh   r    s@   





r  r  get_examplesc                C   s   t tjt | j||dS )Nr  )rJ  rK  r&   W089r  r{  )r   r  r  rg   rg   rh   begin_training  s   zLanguage.begin_trainingc                   s   |du rt jd t| jg dd  fdd}t|ds,tjjdt	|d	}t
|| j }tj|d
 td}|d }|durF||  zt| |d |d |d d W n tyh   ttjj|d dw | jjjd dkr|t }| jj| t| jd
rt| jj|d ddd}| jj|fd| i| | jD ]&\}	}
t|
tjr|d |	i }t|
j|d|	d}|
j|fd| i| q|d}|rtj|td}t | || | !  || _"|dur|| _"n
| j"du r| # | _"|d }|dur||  | j"S )a  Initialize the pipe for training, using data examples if available.

        get_examples (Callable[[], Iterable[Example]]): Optional function that
            returns gold-standard Example objects.
        sgd (Optional[Optimizer]): An optimizer to use for updates. If not
            provided, will be created using the .create_optimizer() method.
        RETURNS (thinc.api.Optimizer): The optimizer.

        DOCS: https://spacy.io/api/language#initialize
        NzUNo 'get_examples' callback provided to 'Language.initialize', creating dummy examples)xyz)wordsc                      s   t  i gS r   )r;   	from_dictrg   r  rg   rh   r  #  s   z)Language.initialize.<locals>.get_examplesru  Language.initialize)r=  objr{  )schemabefore_init
vocab_datar}   r   )datar}   r   )r   r    r   )sectionr   rk   r   pretraining
after_init)$r#   r{   r|   r9   rv   r   r%   E930r   r   	TypeErrorrO   rN  rF   r   r2   r>   IOErrorE884r   shaper   to_opsr   r5   r{  r   r   r"   InitializableComponentr   r4   r=   ra  r   r  )r   r  r  r  rO   Ir  opstok_settingsr   r  
p_settingspretrain_cfgPr  rg   r  rh   r{    sp   






r  c                C   sz   t  }| jjjd dkr| jj| | jD ]\}}t|dr%t|j|_	q|dur0|| _
| j
S | j
du r:|  | _
| j
S )a   Continue training a pretrained model.

        Create and return an optimizer, and initialize "rehearsal" for any pipeline
        component that has a .rehearse() method. Rehearsal is used to prevent
        models from "forgetting" their initialized "knowledge". To perform
        rehearsal, collect samples of text you want the models to retain performance
        on, and call nlp.rehearse() with a batch of Example objects.

        RETURNS (Optimizer): The optimizer.

        DOCS: https://spacy.io/api/language#resume_training
        r    _rehearsal_modelN)r   rv   r   r  r  r   r   r   rH  r  r   r  )r   r  r  r   r  rg   rg   rh   resume_trainingY  s   


zLanguage.resume_trainingr  c                 C   s.   || _ | jD ]\}}t|dr|| qdS )a  Set an error handler object for all the components in the pipeline
        that implement a set_error_handler function.

        error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], NoReturn]):
            Function that deals with a failing batch of documents. This callable
            function should take in the component's name, the component itself,
            the offending batch of documents, and the exception that was thrown.
        DOCS: https://spacy.io/api/language#set_error_handler
        set_error_handlerN)r   r   r   r  )r   r  r   r   rg   rg   rh   r  r  s   

zLanguage.set_error_handler)r   scorerr  
scorer_cfgper_componentr  r  r  c                C   s   t |}t|d t|}|du r| j}|du ri }|du r i }|du r5t|}|d|  td
i |}t }|D ]	}	| |	j	j
 q:| jdd |D ||d}
t||
D ]\}	}||	_qVt }|j||d}tdd |D }|||  |d	< t|S )a  Evaluate a model's pipeline components.

        examples (Iterable[Example]): `Example` objects.
        batch_size (Optional[int]): Batch size to use.
        scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
            will be created.
        component_cfg (dict): An optional dictionary with extra keyword
            arguments for specific components.
        scorer_cfg (dict): An optional dictionary with extra keyword arguments
            for the scorer.
        per_component (bool): Whether to return the scores keyed by component
            name. Defaults to False.

        RETURNS (Scorer): The scorer containing the evaluation results.

        DOCS: https://spacy.io/api/language#evaluate
        Language.evaluateNrk   c                 s   r  r   r  r  rg   rg   rh   rh    r  z$Language.evaluate.<locals>.<genexpr>)r   r  )r  c                 s   s    | ]}t |jV  qd S r   )r   r  r  rg   rg   rh   rh        speedrg   )r   r<   r  r   rc   r   r6   timerr  	referencer  r   r  r  scorern  r  )r   r  r   r  r  r  r  r   
start_timer  docsr  end_timeresultsn_wordsrg   rg   rh   evaluate  s8   
r  c                 C   s    d| j d d i}t|d S )zCCreate an optimizer, usually using the [training.optimizer] config.	optimizerr   )rO   rF   r   )r   	subconfigrg   rg   rh   r    s   zLanguage.create_optimizerparamsc              	   #   s~     sdV  dS  fdd| j D }|D ]}zt| W q ty%   Y qw dV  |D ]}zt| W q+ ty<   Y q+w dS )a  Replace weights of models in the pipeline with those provided in the
        params dictionary. Can be used as a contextmanager, in which case,
        models go back to their original weights after the block.

        params (dict): A dictionary of parameters keyed by model ID.

        EXAMPLE:
            >>> with nlp.use_params(optimizer.averages):
            >>>     nlp.to_disk("/tmp/checkpoint")

        DOCS: https://spacy.io/api/language#use_params
        Nc                    s.   g | ]\}}t |d rt |dr| qS )
use_paramsrH  )r   r  )r   r   r   r  rg   rh   r     s    z'Language.use_params.<locals>.<listcomp>)r   nextStopIteration)r   r  contextsr  rg   r  rh   r    s(   

zLanguage.use_params)	as_tuplesr   r  r  	n_processtextsr  r  c                C      d S r   rg   r   r  r  r   r  r  r  rg   rg   rh   r        
zLanguage.pipec                C   r  r   rg   r  rg   rg   rh   r     r  r    c                #   sl   |r7t tttttf tf  |} fdd|D } j|||||d}|D ]}	|	j}
d|	_|	|
fV  q'dS t ttttf  |}|dkrJt	
 }|du rPi }|du rW j}g } jD ]%\}}||v req\||i }|d| tjt||| jd}|| q\|dkr |rttj  ||||}n fd	d|D }|D ]}||}q|D ]}	|	V  qdS )
aX  Process texts as a stream, and yield `Doc` objects in order.

        texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
            process.
        as_tuples (bool): If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        batch_size (Optional[int]): The number of texts to buffer.
        disable (List[str]): Names of the pipeline components to disable.
        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
            arguments for specific components.
        n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
        YIELDS (Doc): Documents in the order of the original text.

        DOCS: https://spacy.io/api/language#pipe
        c                 3   s     | ]\}}  ||V  qd S r   )r  )r   r  r  r   rg   rh   rh    s    
z Language.pipe.<locals>.<genexpr>)r   r  r  r  Nr   )r  r   r   r   r    c                 3   s    | ]}  |V  qd S r   )r  )r   r  r   rg   rh   rh  Q  r  )r   r   r   r   rb   r9   rK   r   r  mp	cpu_countr   r   r   r   	functoolspartialrC   r   r   _has_gpu_modelrJ  rK  r&   W114_multiprocessing_pipe)r   r  r  r   r  r  r  docs_with_contextsr  r  r  r   r   r  r   fr   rg   r   rh   r      sd   


c                 C   s\   | j D ](\}}t|do|j}||v s|sqt|dr+t|jdr+t|jjtr+ dS qdS )Nr  rH  r  TF)r   r   r  rH  r   r  r   )r   r  r   r  r  rg   rg   rh   r  W  s   $zLanguage._has_gpu_modelr   c              
   #   sV   dt tttf  dt ttttf tf  fdd}||}t|\}}dd t	|D }t
dd t	|D  \}	}
t||}t|||d}|  |   fd	dt
||
D }|D ]}|  q_|
D ]}|  qhtd
d t|	D }z{tt
||dD ]>\}\}\}}}|d urtj|}||_|V  n|d urt|}d d d ttjj|d || dkr|  qW |D ]}| t! |  q|	D ]}|  q|D ]}|"  qt#dd |D st$%t&j' d S d S |D ]}| t! |  q|	D ]}|  q|D ]}|"  qt#dd |D s*t$%t&j' w w )Nr  ri   c                 s   sB    | D ]}t |tr| tt|jfV  q|ttd fV  qd S r   )r   r9   to_bytesr   rK   r  )r  r  rg   rg   rh   prepare_inputi  s   
z5Language._multiprocessing_pipe.<locals>.prepare_inputc                 S   s   g | ]}t  qS rg   )r  Queuer   r   rg   rg   rh   r   y  r   z2Language._multiprocessing_pipe.<locals>.<listcomp>c                 S   s   g | ]}t d qS )F)r  Piper  rg   rg   rh   r   |  s    )
chunk_sizec              
      s.   g | ]\}}t jtj ||t fd qS ))targetrk  )r  Process_apply_pipesr  r:   	get_state)r   rchschr   r   rg   rh   r     s    c                 s   s    | ]}|  V  qd S r   )recv)r   r(  rg   rg   rh   rh    s    
z1Language._multiprocessing_pipe.<locals>.<genexpr>r    r   r   c                 s   s    | ]}|j d kV  qdS )r   N)exitcode)r   r  rg   rg   rh   rh    r  )(r   r   rb   r9   r   r  rK   	itertoolsteeranger  r#   	minibatch_Sendersendstartcloser   from_iterabler   r  rv   r  r  r   msgpack_loadsr   r   r%   E871r   stepput_WORK_DONE_SENTINELrA  allrJ  rK  r&   W127)r   r  r   r  r   r  serialized_texts_with_ctx	raw_textstexts_qbytedocs_recv_chbytedocs_send_chbatch_textssenderprocsr  txbyte_tuplesr|  r   byte_docr  
byte_errorr  r   qrrg   r'  rh   r  b  s~   











zLanguage._multiprocessing_pipec                 C   st   | j D ]\}}t|dr||_qt| j D ]"\}\}}t|tjr7i |_| j |d d D ]	\}}|| q-qdS )zmRegister 'listeners' within pipeline components, to allow them to
        effectively share weights.
        r   r    N)	r   r   r   r  r   r"   ListenedToComponentlistener_mapfind_listeners)r   r   r  r|  name1proc1name2proc2rg   rg   rh   ra    s   
zLanguage._link_components)rv   r  r  r  r   	auto_fillr9  rO  c          '   	      sd  |rt | jtd|}d|vrttjj|dd|d vr'ddi|d d< |d d}	|	durJ|	| j	krJttj
j|d d | j	t| d	| j	|d d< t|}|d
i }
|dd}i |d
< |rqtj||td}n|}|
|d
< |
|d
< |dur||d< ||d< tj|d |td}|d }|d }|d }|d }|d }| }|dur|| }t|trt|| r|| urttjjt|dt  |||||d}|dur||}t|| sttjjdt|d|js| n|}|d
i }i }i }d}|d d D ]}||vr d| }ttjj||dt|| }t |d
 | }| vrd|vrJd|vrJtjj||d}t|d|v r`|d}|j |||||d qd|v sgJ |du ru|j!j"ddgd}|d } | |vrtj#| |j!dgd|| < |d |}!d!}"d"|v r||  $  ||  j%D ]\}#}$|!t&|$d#g v r||  '|#|!|d"  d$}"qt()  t(j*d%d&d' |j |!||  |d( W d   n	1 sw   Y  | |vrt+||  j!j,j"dgd|| < d)|j-vri |j-d)< ||  |j-d) |< |"r|| = q|dur|j!.| t|t/r(|g}t|t/r1|g}t t/r: g t0|t0t1krc|d d*g }%t2|%rct3|%4|sct(5t6j7j||%d+ | 8t9h ||d d,g ||d d }&t3 fd-d.|&D |_:|d d/ |_;|r|n||_<|dur||}t|| sttjjd0t|d|S )1a5  Create the nlp object from a loaded config. Will set up the tokenizer
        and language data, add pipeline components etc. If no config is provided,
        the default config of the given language is used.

        config (Dict[str, Any] / Config): The loaded config.
        vocab (Vocab): A Vocab object. If True, a vocab is created.
        disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
            Disabled pipes will be loaded but they won't be run unless you
            explicitly enable them by calling nlp.enable_pipe.
        enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
            pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
        exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
            Excluded components won't be loaded.
        meta (Dict[str, Any]): Meta overrides for nlp.meta.
        auto_fill (bool): Automatically fill in missing values in config based
            on defaults and function argument annotations.
        validate (bool): Validate the component config and arguments against
            the types expected by the factory.
        RETURNS (Language): The initialized Language class.

        DOCS: https://spacy.io/api/language#from_config
        rM   rk   r   r   z@vectorszspacy.Vectors.v1ry   N)bad_lang_coder>  ry   r   r  )r9  r  r   before_creationafter_creationafter_pipeline_creation)r   )rv   rx   r   r   creation)r   r   r   r;  r3  r   rG  )r   rO   )r   rO   r9  r8  r}   rP  r  )rv   r  r  Freplace_listenerslistening_componentsTignorez\[W113\])message)rG  r   r  enabled)r  rZ  r   c                 3   s    | ]	}| vr|V  qd S r   rg   )r   r   rU  rg   rh   rh    s    z'Language.from_config.<locals>.<genexpr>r   pipeline_creation)=r   r   r@   r   r   r%   E985r   r   ry   E958r#   r0  rO  rC  rF   rB  r1   r   r3   r   r   
issubclassE943rG   E942is_interpolatedrN  rA  r   E956E984re  rv   r  
load_modelra  r   getattrrV  rJ  catch_warningsfilterwarningshashr   r   r  rb   idr?   r   re   issubsetrK  r&   W123_resolve_component_statusr   r   r   rO   )'r   rO   rv   r  r  r  r   rO  r9  config_langorig_pipelineorig_pretrainingrE  resolved_nlprx   r   rQ  rR  rS  lang_clsrk   interpolatedr   source_nlpssource_nlp_vectors_hashesvocab_br   r4  pipe_cfgr8  r  r   rH  rF  listeners_replacedr   r  rZ  disabled_pipesrg   rU  rh   from_config  s4  #









	











zLanguage.from_configtok2vec_namer   	listenersc              	   C   s  || j vrtjj|||d| j d}t||| j vr.tjj|||d| j d}t|| |}| |}t|t	j
sJttjj|t|d|j}|j|g }| |}	| j| }
|r
tjd| tt|t|krtjj|||t|d}t||D ]?}zt|
| W n ty   tjj|||d}t|w |d }d|jv r|jd }||d |
d d	 }t|
|| q|D ]E}| }|jd
}|durtt|j}|dkr||}n|dkr||||}n	ttj j|dt!|	j|| |"|| qdS dS )a  Find listener layers (connecting to a token-to-vector embedding
        component) of a given pipeline component model and replace
        them with a standalone copy of the token-to-vector layer. This can be
        useful when training a pipeline with components sourced from an existing
        pipeline: if multiple components (e.g. tagger, parser, NER) listen to
        the same tok2vec component, but some of them are frozen and not updated,
        their performance may degrade significantly as the tok2vec component is
        updated with new data. To prevent this, listeners can be replaced with
        a standalone tok2vec layer that is owned by the component and doesn't
        change if the component isn't updated.

        tok2vec_name (str): Name of the token-to-vector component, typically
            "tok2vec" or "transformer".
        pipe_name (str): Name of pipeline component to replace listeners for.
        listeners (Iterable[str]): The paths to the listeners, relative to the
            component config, e.g. ["model.tok2vec"]. Typically, implementations
            will only connect to one tok2vec component, [model.tok2vec], but in
            theory, custom models can use multiple listeners. The value here can
            either be an empty list to not replace any listeners, or a complete
            (!) list of the paths to all listener layers used by the model.

        DOCS: https://spacy.io/api/language#replace_listeners
        r;  )tok2vecr   unknownr4  )r   r   z%Replacing listeners of component '%s')r   r|  pathsn_listeners)r   r|  r   rH  replace_listener_cfgr|  replace_listenerNr       )
num_params)#r   r%   E889r   rA  r   r7  r   r   r"   rH  E888r   rH  rI  r   r   r#   r{   r|   r   r   E887dot_to_objectr5  E886attrsset_dot_to_objectcopyinspect	signature
parametersE1055replace_model_noderemove_listener)r   rz  r   r{  r  r|  tok2vec_cfgtok2vec_modelpipe_listenersr   rv  listener_path
new_configreplace_funclistener	new_modelreplace_listener_funcr  rg   rg   rh   rV    s   











 zLanguage.replace_listenersmemc              	   c   s    |du rt  }t @}|| j|g}t| jdr(||| j| | jD ]\}}t|dr?|||| q+|V  W d   dS 1 sNw   Y  dS )a  Begin a block where all resources allocated during the block will
        be freed at the end of it. If a resources was created within the
        memory zone block, accessing it outside the block is invalid.
        Behaviour of this invalid access is undefined. Memory zones should
        not be nested.

        The memory zone is helpful for services that need to process large
        volumes of text with a defined memory budget.

        Example
        -------
        >>> with nlp.memory_zone():
        ...     for doc in nlp.pipe(texts):
        ...        process_my_doc(doc)
        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
        Nmemory_zone)	r   r   enter_contextrv   r  r   r   r   r   )r   r  stackr
  r   r   rg   rg   rh   r  0  s   
"zLanguage.memory_zonerU  r   c                   s   t |}i }fdd|d< fdd|d< fdd|d< jD ]\}}| v r+q"t|ds1q"|fd	d||< q" fd
d|d< t ||  dS )aa  Save the current state to a directory.  If a model is loaded, this
        will include the model.

        path (str / Path): Path to a directory, which will be created if
            it doesn't exist.
        exclude (Iterable[str]): Names of components or serialization fields to exclude.

        DOCS: https://spacy.io/api/language#to_disk
        c                        j j| dgdS Nrv   rU  )r   to_diskr   r   rg   rh   <lambda>^      z"Language.to_disk.<locals>.<lambda>r   c                    s   t | t jS r   )r   
write_jsonr  r   r  r   rg   rh   r  a  s    
	meta.jsonc                    s    j | S r   )rO   r  r  r   rg   rh   r  d      
config.cfgr  c                 S      |j | dgdS r  )r  r   r  rg   rg   rh   r  j      c                       j j|  dS NrU  )rv   r  r  r  r   rg   rh   r  k  r  rv   N)r#   ensure_pathr   r   r  )r   r   r  serializersr   r  rg   r  rh   r  P  s   

zLanguage.to_diskr   c                    sn   t | tr| g} | } r3t  tr g h  fdd|D | }tt |@ r3ttjj | dt|S )af  Derives whether (1) `disable` and `enable` values are consistent and (2)
        resolves those to a single set of disabled components. Raises an error in
        case of inconsistency.

        disable (Union[str, Iterable[str]]): Name(s) of component(s) or serialization fields to disable.
        enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable.
        pipe_names (Iterable[str]): Names of all pipeline components.

        RETURNS (Tuple[str, ...]): Names of components to exclude from pipeline w.r.t.
                                   specified includes and excludes.
        c                    r  rg   rg   )r   r   r  rg   rh   r     r  z6Language._resolve_component_status.<locals>.<listcomp>)r  r  )	r   rb   r   re   r   r%   E1042r   r  )r  r  r   r  rg   r  rh   rl  n  s   

z"Language._resolve_component_status)r  	overridesr  c          	         s   dt ddffdd}dt ddf fdd}t|}i }t |d  r/fd	d
|d< ||d< ||d< fdd
|d< jD ]\}}| v rKqBt|dsQqB|fdd
||< qB|d  skd vrkt dg  t||  |_	  S )a  Loads state from a directory. Modifies the object in place and
        returns it. If the saved `Language` object contains a model, the
        model will be loaded.

        path (str / Path): A path to a directory.
        exclude (Iterable[str]): Names of components or serialization fields to exclude.
        RETURNS (Language): The modified `Language` object.

        DOCS: https://spacy.io/api/language#from_disk
        r   ri   Nc                    s>   |   rt| } j| |di d jj_d S d S Nr   r   )	existsr   	read_jsonr   r  r   rv   r   r   )r   r  r   rg   rh   deserialize_meta  s
   
z,Language.from_disk.<locals>.deserialize_metac                    s    |   rjj|  d d S d S r  )r  rv   	from_disk)r   r  rg   rh   deserialize_vocab  s   z-Language.from_disk.<locals>.deserialize_vocabr  c                    s   j j| d dS )NF)rN  r  )rO   r  r  )r  r   rg   rh   r    r  z$Language.from_disk.<locals>.<lambda>r  rv   c                    r  r  )r   r  r  r   rg   rh   r    r  r   r  c                 S   r  r  )r  r  rg   rg   rh   r        )
r   r#   r  r  r   r   r   r  r   ra  )	r   r   r  r  r  r  deserializersr   r  rg   )r  r  r   rh   r    s*   

zLanguage.from_diskc                   s   i } fdd|d< fdd|d< fdd|d< fdd|d	< j D ]\}}| v r/q&t|d
s5q&|fdd||< q&t| S )a  Serialize the current state to a binary string.

        exclude (Iterable[str]): Names of components or serialization fields to exclude.
        RETURNS (bytes): The serialized form of the `Language` object.

        DOCS: https://spacy.io/api/language#to_bytes
        c                      s   j j dS r  )rv   r  rg   r  rg   rh   r        z#Language.to_bytes.<locals>.<lambda>rv   c                      s    j jdgdS r  )r   r  rg   r   rg   rh   r    r  r   c                      s   t t jS r   )r   
json_dumpsr  r   rg   r   rg   rh   r    s    r  c                      s
    j  S r   )rO   r  rg   r   rg   rh   r    s   
 r  r  c                 S   s   | j dgdS r  )r  )r  rg   rg   rh   r    r  )r   r   r#   r  )r   r  r  r   r  rg   r  rh   r    s   
zLanguage.to_bytes
bytes_datac                   s   fdd}i }fdd|d< ||d<  fdd|d< fd	d|d
< j D ]\}}| v r1q(t|ds7q(|fdd||< q(t||    S )a!  Load state from a binary string.

        bytes_data (bytes): The data to load from.
        exclude (Iterable[str]): Names of components or serialization fields to exclude.
        RETURNS (Language): The `Language` object.

        DOCS: https://spacy.io/api/language#from_bytes
        c                    s2   t | } j| |di d jj_d S r  )r   
json_loadsr   r  r   rv   r   r   )br  r   rg   rh   r    s   
z-Language.from_bytes.<locals>.deserialize_metac                    s    j j| ddS )NF)rN  )rO   r  r  r   rg   rh   r    s    z%Language.from_bytes.<locals>.<lambda>r  r  c                    r  r  )rv   r  r  r  rg   rh   r    r  rv   c                    r  r  )r   r  r  r   rg   rh   r    r  r   r  c                 S   r  r  )r  )r  r  rg   rg   rh   r  	  r  )r   r   r#   r  ra  )r   r  r  r  r  r   r  rg   r  rh   r    s   
zLanguage.from_bytes)Tr   )NNNN)ri   r  ri   N)qr]   r^   r_   r`   rL   rp   ry   r   rb   ra   r   r   rA   r%   r  r   r   r   r   rI   boolrd   r
   r   r9   rH   r   r   propertyr   r   setterr   rO   r   r   r   r   r)  r   r   r   r   r   r   classmethodr   r   r   r  r   r   rB   r   floatr   r  r.   r7  r<  rT  re  r_  rt  rx  r~  rw  r  r  ru  r  r  r  r  r  rK   r  r;   r   r  r  r  r{  r  r  r   r  r6   r  r  r   rc   r  r   r$   r   r   r  r  ra  r?   ry  rV  r   r  r   r  staticmethodrl  r  r  r  __classcell__rg   rg   r   rh   rj      s  
 




H
%"
	




	
Y
G


I

*	


H
7

+


+
&
	

O

B

L

	
;#		&	

W

h

	
 h
h 


&


9	meta_dictc                 C   s   t dd dd t| S )Nc                 S   s   t | tjS r   )r   numpyfloatingvrg   rg   rh   r  	  r  z'_replace_numpy_floats.<locals>.<lambda>c                 S   s   t | S r   )r  r  rg   rg   rh   r  	  s    )r   rc   )r  rg   rg   rh   r  
	  s   r  c                   @   s   e Zd ZU dZeed< dZeeee	f  ed< e
 Zee ed< e
 Zee ed< dZeed< e
 Zee ed	< dZeeeee f  ed
< dS )r   a  Dataclass containing information about a component and its defaults
    provided by the @Language.component or @Language.factory decorator. It's
    created whenever a component is defined and stored on the Language class for
    each component instance and factory instance.
    r   Nr   r  r  Fr  r  r   )r]   r^   r_   r`   rb   ra   r   r   r   r
   r  r  r   r  r  r  r  r   r  rg   rg   rg   rh   r   	  s   
  r   c                   @   sD   e Zd ZdZdedee ddfddZdd	 Zd
d Z	dddZ
dS )r  z)Manager for temporary pipeline disabling.rk   r   ri   Nc                 C   s>   || _ || _| jD ]}| j | q	t|  | | j d S r   )rk   r   r  r   r   extend)r   rk   r   r   rg   rg   rh   r   $	  s   

zDisabledPipes.__init__c                 C   s   | S r   rg   r   rg   rg   rh   	__enter__,	  r%  zDisabledPipes.__enter__c                 G   s   |    d S r   )restore)r   rk  rg   rg   rh   __exit__/	  s   zDisabledPipes.__exit__c                 C   sF   | j D ]}|| jjvrttjj|d| j| qg | dd< dS )zARestore the pipeline to its state when DisabledPipes was created.r  N)r   rk   r   r   r%   E008r   r  r  rg   rg   rh   r  2	  s
   
zDisabledPipes.restorer  )r]   r^   r_   r`   rj   r   rb   r   r  r  r  rg   rg   rg   rh   r  !	  s    r  r  c                 C   s   dd | D S )zMake a copy of a batch of examples, copying the predicted Doc as well.
    This is used in contexts where we need to take ownership of the examples
    so that they can be mutated, for instance during Language.evaluate and
    Language.update.
    c                 S   s   g | ]}t |j |jqS rg   )r;   r  r  r  r  rg   rg   rh   r   A	  r   z"_copy_examples.<locals>.<listcomp>rg   )r  rg   rg   rh   r  ;	  s   r  
ensure_docr   .underscore_statec              	      s  t | 	 z>| }t|tr|  |  W dS  fdd|D }|D ]}||}q&dd |D }dgt|t|  }	||	 }
W n! tye   ddt	t
 fg}dgt|d  }	||	 }
Y nw z||
 W n ty   |  |  Y dS w q)	a9  Worker for Language.pipe

    ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
        or raise an error if the input is neither a Doc nor a string.
    pipes (Iterable[Pipe]): The components to apply.
    receiver (multiprocessing.Connection): Pipe to receive text. Usually
        created by `multiprocessing.Pipe()`
    sender (multiprocessing.Connection): Pipe to send doc. Usually created by
        `multiprocessing.Pipe()`
    underscore_state (Tuple[dict, dict, dict]): The data in the Underscore class
        of the parent.
    TNc                 3   s    | ]
\}} ||V  qd S r   rg   )r   r  r  r  rg   rh   rh  b	  s    
z_apply_pipes.<locals>.<genexpr>c                 S   s   g | ]
}|  |jd fqS r   )r  r  )r   r  rg   rg   rh   r   h	  s    z _apply_pipes.<locals>.<listcomp>)NNNr    )r:   
load_stater   r   _WorkDoneSentinelr1  r   r  r   msgpack_dumps	traceback
format_excr/  BrokenPipeError)r  r   receiverr@  r  texts_with_ctxr  r   	byte_docspaddingr  	error_msgrg   r  rh   r#  D	  s<   



r#  c                   @   sH   e Zd ZdZdee deej de	ddfddZ
dd	d
ZdddZdS )r.  zAUtil for sending data to multiprocessing workers in Language.piper  queuesr   ri   Nc                 C   s(   t || _t t|| _|| _d| _d S )Nr   )iterr  r   r  r   count)r   r  r  r   rg   rg   rh   r   	  s   

z_Sender.__init__c                 C   s4   t t| jt| j| jD ]	\}}|| qdS )z1Send chunk_size items from self.data to channels.N)r*  islicer  r  r   r  r   r6  )r   itemrF  rg   rg   rh   r/  	  s
   z_Sender.sendc                 C   s0   |  j d7  _ | j | jkrd| _ |   dS dS )znTell sender that comsumed one item. Data is sent to the workers after
        every chunk_size calls.
        r    r   N)r  r   r/  r   rg   rg   rh   r5  	  s
   z_Sender.stepr  )r]   r^   r_   r`   r   r
   r   r  r  rd   r   r/  r5  rg   rg   rg   rh   r.  }	  s    

r.  c                   @   s   e Zd ZdS )r  N)r]   r^   r_   rg   rg   rg   rh   r  	  s    r  )r  r  r*  multiprocessingr  r  r  rJ  
contextlibr   r   r  r   dataclassesr   r   r   pathlibr   timeitr	   r  typingr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   cymem.cymemr   	thinc.apir   r   r   r   
thinc.utilr   r   r!   r"   r#   compatr$   errorsr%   r&   git_infor'   lang.punctuationr(   r)   r*   lang.tokenizer_exceptionsr+   r,   r}   r-   pipe_analysisr.   r/   r0   schemasr1   r2   r3   r4   r5   r  r6   r   r8   tokensr9   tokens.underscorer:   r   r;   r<   training.initializer=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   r   rH   rv   rI   rJ   r)  __file__parentDEFAULT_CONFIG_PATHload_configr   DEFAULT_CONFIG_PRETRAIN_PATHrK   rL   rx   r~   rj   rc   r  r   r   r  r  rb   r  r#  r.  r  r7  rg   rg   rg   rh   <module>   s    H,
                 	
9
