o
    i                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlmZmZ d dlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlm Z  dZ!e "e!d Z#dZ$dZ%dee dee&ef fddZ'dd Z(G dd de Z)dd Z*dS )    N)islice)AnyCallableDictIterableListOptional)ConfigModel)Floats2d   )Errors)Language)Scorer)Doc)Examplevalidate_get_examples)registry)Vocab   )TextCategorizeraX  
[model]
@architectures = "spacy.TextCatEnsemble.v2"

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 500, 1000, 500]
attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2

[model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
length = 262144
ngram_size = 1
no_output_layer = false
modelzq
[model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
aa  
[model]
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
use_reduce_first = false
use_reduce_last = false
use_reduce_max = false
use_reduce_mean = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
examplesreturnc                 K   s   t j| dfddi|S )Ncatsmulti_labelT)r   
score_cats)r   kwargs r   U/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/textcat_multilabel.pytextcat_multilabel_scoreM   s   r    c                   C   s   t S )N)r    r   r   r   r   make_textcat_multilabel_scorerV   s   r!   c                   @   s   e Zd ZdZ	deddedededede	e
 d	d
fddZedd Zd
d
dde
g ee f de	e de	ee  fddZdee fddZd
S )MultiLabel_TextCategorizerzlPipeline component for multi-label text classification.

    DOCS: https://spacy.io/api/textcategorizer
    textcat_multilabel)scorervocabr   name	thresholdr$   r   Nc                C   s6   || _ || _|| _d| _g |d}t|| _|| _dS )a  Initialize a text categorizer for multi-label classification.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
        threshold (float): Cutoff to consider a prediction "positive".
        scorer (Optional[Callable]): The scoring method.

        DOCS: https://spacy.io/api/textcategorizer#init
        N)labelsr'   )r%   r   r&   _rehearsal_modeldictcfgr$   )selfr%   r   r&   r'   r$   r+   r   r   r   __init__`   s   


z#MultiLabel_TextCategorizer.__init__c                 C   s   dS )NTr   )r,   r   r   r   support_missing_values|   s   z1MultiLabel_TextCategorizer.support_missing_values)nlpr(   get_examplesr/   r(   c                C   s   t |d |du r| D ]}|jjD ]}| | qqn
|D ]}| | qtt| d}| | dd |D }| |\}	}
|   t	|dksUJ t
jj| jdt	|	dkseJ t
jj| jd| jj||	d dS )	a\  Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.
        labels: The labels to add to the component, typically generated by the
            `init labels` command. If no labels are provided, the get_examples
            callback is used to extract the labels from the data.

        DOCS: https://spacy.io/api/textcategorizer#initialize
        %MultiLabel_TextCategorizer.initializeN
   c                 S   s   g | ]}|j qS r   )	reference).0egr   r   r   
<listcomp>   s    z9MultiLabel_TextCategorizer.initialize.<locals>.<listcomp>r   )r&   )XY)r   yr   	add_labellistr   _validate_categories_examples_to_truth_require_labelslenr   E923formatr&   r   
initialize)r,   r0   r/   r(   examplecatlabelsubbatch
doc_samplelabel_sample_r   r   r   rB      s"   


  r1   r   c                 C   sB   |D ]}|j j D ]}|dks|dksttjj|dq
qdS )zThis component allows any type of single- or multi-label annotations.
        This method overwrites the more strict one from 'textcat'.g      ?g        )valN)r3   r   values
ValueErrorr   E851rA   )r,   r   exrJ   r   r   r   r<      s   z/MultiLabel_TextCategorizer._validate_categories)r#   )__name__
__module____qualname____doc__r    r   r
   strfloatr   r   r-   propertyr.   r   r   r   rB   r<   r   r   r   r   r"   Z   s<    	



%r"   c                 C   s,   | dkrt d}|jS tdt d|  )Nmake_multilabel_textcatzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_modulerV   AttributeErrorrO   )r&   moduler   r   r   __getattr__   s   
r[   )+rW   sys	itertoolsr   typingr   r   r   r   r   r   	thinc.apir	   r
   thinc.typesr   errorsr   languager   r$   r   tokensr   trainingr   r   utilr   r%   r   textcatr   multi_label_default_configfrom_strDEFAULT_MULTI_TEXTCAT_MODELmulti_label_bow_configmulti_label_cnn_configrS   r    r!   r"   r[   r   r   r   r   <module>   s,     	V