o
    i4                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ dZe  ed Z!G dd deZ"G dd deZ#de#de$fddZ%dd Z&dd Z'dS )    N)islice)AnyCallableDictIterableListOptionalSequence)ConfigModel	Optimizerset_dropout_rate   )Errors)Language)Doc)Examplevalidate_examplesvalidate_get_examples)Vocab   )TrainablePipez
[model]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
modelc                   @   s(  e Zd ZdZd-dedededdfdd	Zede	d
 fddZ
ede	e fddZdd
deddfddZdd
dedefddZd.ddZdee fddZdee ddfddZdddddee ded ee d!eeeef  fd"d#Zd.d$d%Zdd&d'eg ee f d(ee fd)d*Zd+d, ZdS )/Tok2VecaA  Apply a "token-to-vector" model and set its outputs in the doc.tensor
    attribute. This is mostly useful to share a single subnetwork between multiple
    components, e.g. to have one embedding and CNN network shared between a
    parser, tagger and NER.

    In order to use the `Tok2Vec` predictions, subsequent components should use
    the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
    layer will read data from the `doc.tensor` attribute during prediction.
    During training, the `Tok2Vec` component will save its prediction and backprop
    callback for each batch, so that the subsequent components can backpropagate
    to the shared weights. This implementation is used because it allows us to
    avoid relying on object identity within the models to achieve the parameter
    sharing.
    tok2vecvocabr   namereturnNc                 C   s"   || _ || _|| _i | _i | _dS )a  Initialize a tok2vec component.

        vocab (Vocab): The shared vocabulary.
        model (thinc.api.Model[List[Doc], List[Floats2d]]):
            The Thinc Model powering the pipeline component. It should take
            a list of Doc objects as input, and output a list of 2d float arrays.
        name (str): The component instance name.

        DOCS: https://spacy.io/api/tok2vec#init
        N)r   r   r   listener_mapcfg)selfr   r   r    r!   J/home/ubuntu/.local/lib/python3.10/site-packages/spacy/pipeline/tok2vec.py__init__-   s
   
zTok2Vec.__init__Tok2VecListenerc                    s    fdd j D S )zuRETURNS (List[Tok2VecListener]): The listener models listening to this
        component. Usually internals.
        c                    s    g | ]} j | D ]}|q	qS r!   )r   ).0cmr    r!   r"   
<listcomp>C   s     z%Tok2Vec.listeners.<locals>.<listcomp>)listening_componentsr(   r!   r(   r"   	listeners>   s   zTok2Vec.listenersc                 C   s   t | j S )zoRETURNS (List[str]): The downstream components listening to this
        component. Usually internals.
        )listr   keysr(   r!   r!   r"   r*   E   s   zTok2Vec.listening_componentslistenercomponent_namec                 C   s4   | j |g  || j | vr| j | | dS dS )z=Add a listener for a downstream component. Usually internals.N)r   
setdefaultappendr    r.   r/   r!   r!   r"   add_listenerL   s   zTok2Vec.add_listenerc                 C   sB   || j v r|| j | v r| j | | | j | s| j |= dS dS )z@Remove a listener for a downstream component. Usually internals.TF)r   remover2   r!   r!   r"   remove_listenerR   s   

zTok2Vec.remove_listenerc                 C   sV   d| j f}tt|ddtr'|j D ]}t|tr&|j|v r&| ||j  qdS dS )a  Walk over a model of a processing component, looking for layers that
        are Tok2vecListener subclasses that have an upstream_name that matches
        this component. Listeners can also set their upstream_name attribute to
        the wildcard string '*' to match any `Tok2Vec`.

        You're unlikely to ever need multiple `Tok2Vec` components, so it's
        fine to leave your listeners upstream_name on '*'.
        *r   N)	r   
isinstancegetattrr   r   walkr$   upstream_namer3   )r    	componentnamesnoder!   r!   r"   find_listeners]   s   
	zTok2Vec.find_listenersdocsc                    sB   t dd |D s jd fdd|D S  j|}|S )a?  Apply the pipeline's model to a batch of docs, without modifying them.
        Returns a single tensor for a batch of documents.

        docs (Iterable[Doc]): The documents to predict.
        RETURNS: Vector representations for each token in the documents.

        DOCS: https://spacy.io/api/tok2vec#predict
        c                 s   s    | ]}t |V  qd S N)lenr%   docr!   r!   r"   	<genexpr>u   s    z"Tok2Vec.predict.<locals>.<genexpr>nOc                    s   g | ]} j jd fqS )r   )r   opsallocrB   r    widthr!   r"   r)   x   s    z#Tok2Vec.predict.<locals>.<listcomp>)anyr   get_dimpredict)r    r?   tokvecsr!   rH   r"   rL   l   s
   	zTok2Vec.predictc                 C   s4   t ||D ]\}}|jd t|ksJ ||_qdS )zModify a batch of documents, using pre-computed scores.

        docs (Iterable[Doc]): The documents to modify.
        tokvecses: The tensors to set, produced by Tok2Vec.predict.

        DOCS: https://spacy.io/api/tok2vec#set_annotations
        r   N)zipshaperA   tensor)r    r?   	tokvecsesrC   rM   r!   r!   r"   set_annotations|   s   zTok2Vec.set_annotations        )dropsgdlossesexamplesrT   rU   rV   c          	         s   du ri t |d dd |D }tj| j|\fddD jd fdd  fd	d
}t|}jdd D ]	}|	|  qOjrfjd 	|| S )a  Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

        examples (Iterable[Example]): A batch of Example objects.
        drop (float): The dropout rate.
        sgd (thinc.api.Optimizer): The optimizer.
        losses (Dict[str, float]): Optional record of the loss during training.
            Updated using the component name as the key.
        RETURNS (Dict[str, float]): The updated losses dictionary.

        DOCS: https://spacy.io/api/tok2vec#update
        NTok2Vec.updatec                 S   s   g | ]}|j qS r!   )	predicted)r%   egr!   r!   r"   r)      s    z"Tok2Vec.update.<locals>.<listcomp>c                       g | ]
} j jj|j qS r!   r   rF   alloc2frO   r%   t2vr(   r!   r"   r)          rS   c                    sZ   t t| D ]} |  | | 7  < j  t| | d  7  < qfddD S )zAccumulate tok2vec loss and gradient. This is passed as a callback
            to all but the last listener. Only the last one does the backprop.
            r   c                    r[   r!   r\   r^   r(   r!   r"   r)      r`   z?Tok2Vec.update.<locals>.accumulate_gradient.<locals>.<listcomp>)rangerA   r   floatsum)one_d_tokvecsi)	d_tokvecsrV   r    rM   r!   r"   accumulate_gradient   s   $z+Tok2Vec.update.<locals>.accumulate_gradientc                    s&    |  }dur  |S )z>Callback to actually do the backprop. Passed to last listener.N)finish_update)rd   d_docs)rg   
bp_tokvecsrf   r    rU   r!   r"   backprop   s
   
z Tok2Vec.update.<locals>.backprop)
r   r   r   begin_updater0   r   r$   get_batch_idr+   receive)	r    rW   rT   rU   rV   r?   rk   batch_idr.   r!   )rg   rj   rf   rV   r    rU   rM   r"   update   s    


rX   c                 C   s   d S r@   r!   )r    rW   scoresr!   r!   r"   get_loss      zTok2Vec.get_loss)nlpget_examplesru   c                C   sV   t |d g }t| dD ]}||j q|s"J tjj| jd| jj	|d dS )at  Initialize the pipe for training, using a representative set
        of data examples.

        get_examples (Callable[[], Iterable[Example]]): Function that
            returns a representative sample of gold-standard Example objects.
        nlp (Language): The current nlp object the component is part of.

        DOCS: https://spacy.io/api/tok2vec#initialize
        Tok2Vec.initialize
   r   )XN)
r   r   r1   xr   E923formatr   r   
initialize)r    rv   ru   
doc_sampleexampler!   r!   r"   r~      s   
rw   c                 C   s   t r@   )NotImplementedError)r    labelr!   r!   r"   	add_label   rt   zTok2Vec.add_label)r   )r   N) __name__
__module____qualname____doc__r   r   strr#   propertyr   r+   r*   r3   boolr5   r>   r   r   rL   r	   rR   r   rb   r   r   r   rq   rs   r   r   r~   r   r!   r!   r!   r"   r      s@    


6
r   c                   @   sd   e Zd ZdZdZdededdfddZed	e	e
 defd
dZdeddfddZdefddZdS )r$   a$  A layer that gets fed its answers from an upstream connection,
    for instance from a component earlier in the pipeline.

    The Tok2VecListener layer is used as a sublayer within a component such
    as a parser, NER or text categorizer. Usually you'll have multiple listeners
    connecting to a single upstream Tok2Vec component, that's earlier in the
    pipeline. The Tok2VecListener layers act as proxies, passing the predictions
    from the Tok2Vec component into downstream components, and communicating
    gradients back upstream.
    ztok2vec-listenerr:   rI   r   Nc                 C   s4   t j| | jtd|id || _d| _d| _d| _dS )a  
        upstream_name (str): A string to identify the 'upstream' Tok2Vec component
            to communicate with. The upstream name should either be the wildcard
            string '*', or the name of the `Tok2Vec` component. You'll almost
            never have multiple upstream Tok2Vec components, so the wildcard
            string will almost always be fine.
        width (int):
            The width of the vectors produced by the upstream tok2vec component.
        rE   )r   forwarddimsN)r   r#   r   r   r:   	_batch_id_outputs	_backprop)r    r:   rI   r!   r!   r"   r#      s
   

zTok2VecListener.__init__inputsc                 C   s   t dd |D S )zCalculate a content-sensitive hash of the batch of documents, to check
        whether the next batch of documents is unexpected.
        c                 s   s"    | ]}t d d |D V  qdS )c                 s   s    | ]}|j V  qd S r@   )orth)r%   tokenr!   r!   r"   rD      s    z9Tok2VecListener.get_batch_id.<locals>.<genexpr>.<genexpr>Nrc   rB   r!   r!   r"   rD      s     z/Tok2VecListener.get_batch_id.<locals>.<genexpr>r   )clsr   r!   r!   r"   rn      s   zTok2VecListener.get_batch_idrp   c                 C   s   || _ || _|| _dS )zStore a batch of training predictions and a backprop callback. The
        predictions and callback are produced by the upstream Tok2Vec component,
        and later will be used when the listener's component's model is called.
        N)r   r   r   )r    rp   outputsrk   r!   r!   r"   ro      s   
zTok2VecListener.receivec                 C   sL   | j du r| jdu rttj| |}|| j kr$ttjj|| j ddS )z_Check that the batch of Doc objects matches the ones we have a
        prediction for.
        N)id1id2T)r   r   
ValueErrorr   E954rn   E953r}   )r    r   rp   r!   r!   r"   verify_inputs	  s   


zTok2VecListener.verify_inputs)r   r   r   r   r   r   intr#   classmethodr   r   rn   ro   r   r   r!   r!   r!   r"   r$      s    	r$   is_trainc                 C   s   |r2| j du r'g }|D ]}|jjdkrttjjdd||j q|tfS | 	| | j
| jfS g }| d}|D ]}|jjdkrP|| jt|| q;||j q;|tfS )z7Supply the outputs from the upstream Tok2Vec component.Nr   r   ry   rE   )r   rP   sizer   r   E203r}   r1   _empty_backpropr   r   r   rK   rF   r]   rA   )r   r   r   r   rC   rI   r!   r!   r"   r     s"   

	
r   c                 C   s   g S r@   r!   )dXr!   r!   r"   r   <  rt   r   c                 C   s,   | dkrt d}|jS tdt d|  )Nmake_tok2veczspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler   AttributeErrorr   )r   moduler!   r!   r"   __getattr__A  s   
r   )(r   sys	itertoolsr   typingr   r   r   r   r   r   r	   	thinc.apir
   r   r   r   errorsr   languager   tokensr   trainingr   r   r   r   r   trainable_piper   default_model_configfrom_strDEFAULT_TOK2VEC_MODELr   r$   r   r   r   r   r!   r!   r!   r"   <module>   s&    $ ?<%