o
    i%                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZmZmZmZ d dlmZ d dlmZ dd	lmZ dd
lmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# 					d4dedede
e de
e$ de$de%de%fddZ&deeee#f  de	e fddZ'	d5dedede
e$ de%de$f
d d!Z(ded"ee d#ed$ede)f
d%d&Z*d'd( Z+d)d* Z,G d+d, d,Z-	.d6d/ee)e$f d0e$d1e$de.fd2d3Z/dS )7    N)Counter)Path)CallableIterableListOptionalUnion)ConfigModel	Optimizerfix_random_seedset_dropout_rateset_gpu_allocator)ConfigValidationError)Printer   )Errors)ConfigSchemaPretrain)Doc)dot_to_objectload_model_from_configregistry   )ExampleTFconfig
output_dirresume_pathepoch_resumeuse_gpusilent	skip_lastc              
      sX  t |d}| d d d urt| d d  | d d }|dkr%|r%t| d | d d< t| }	|	j }
tj|
d td	}t	|
|d
 }td
|id
 }|d }t
|	| |d |d urgt |||d}nd} jd }tdd|d r|d| d|d  d n|d|  ddd}|jd#i | d$ fdd	}zyt||d D ]a}t|||	D ]3\}}t|}t ||}|||}|r|j|fi | |d r||d  dkr||dd q|d r||d  dks||d d  kr|| n|| d!_qW |s||d dd" d S d S |s+||d dd" w w )%Nno_printtrainingseedgpu_allocatorr   
initializeinit_tok2vecpretraining)schemacorpusbatcher	optimizer)r    lossi'  )	frequencyn_save_epochz/Pre-training tok2vec layer - starting at epoch z - saving every z epoch)   
   r2         )rr5   r5   r5   r5   )widthsaligns#z# Wordsz
Total LossLosszw/sFc              	      s  |rdnd}  jl |rd }n
d|  | d }|d}| d  W d    n1 s8w   Y  jjj| d}d	 d
}|t	
|d  W d    n1 scw   Y  W d    d S W d    d S 1 s{w   Y  d S )Nz.temp zmodel-last.binmodelz.binwbtok2vec)nr_wordr.   
epoch_lossepochz	log.jsonla
)
use_paramsaveragesopenwriteget_refto_bytesr?   r.   r@   srsly
json_dumps)rA   is_tempis_lastis_temp_str	save_pathfile_logr<   r-   r   tracker K/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/pretrain.py_save_modelF   s&   
"zpretrain.<locals>._save_model
max_epochsn_save_everyT)rL   r           )rM   )r8   )FF)r   r   r   r   r   interpolater   resolver   r   create_pretraining_model_resume_modelattrsProgressTrackerdividerrowrange	enumerateensure_docsmake_updateupdater@   )r   r   r   r   r   r    r!   msg	allocatornlp_configPr+   r,   	objectiverow_settingsrV   rA   batch_idbatchdocsr.   progressrT   rR   rU   pretrain   sf   
	





$
rr   examples_or_docsreturnc                 C   s4   g }| D ]}t |tr|| q||j q|S N)
isinstancer   append	reference)rs   rp   	eg_or_docrT   rT   rU   rd   p   s   
rd   r<   c                 C   s   t |d}|d|  |d}| }| d| W d    n1 s)w   Y  |d u rStdt|}|rNt	|
ddd  d d d	 }nttj|d
|  |S )Nr"   zResume training tok2vec from: rbr>   zmodel\d+\.binr      r   zResuming from epoch: )r   inforF   readrH   
from_bytesresearchstrintgroup
ValueErrorr   E1020)r<   r   r   r    rg   rP   weights_data
model_namerT   rT   rU   r]   z   s   
$
r]   rp   r-   objective_funcc                 C   s:   |  |\}}|| j||\}}|| | | t|S )zPerform an update over a single batch of documents.

    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    RETURNS loss: A float for the loss.
    )begin_updateopsfinish_updatefloat)r<   rp   r-   r   predictionsbackpropr.   	gradientsrT   rT   rU   re      s
   	
re   c                 C   s   | j g d |   W d   n1 sw   Y  t| |}t|jdkr6|jdkr.|jnd}| |j}z|j| dgd W n t	y[   |d }|d	 }t	t
jj||d
w |d }|| j|}|j| dgd t||d  |S )a  Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    )enableNTok2VecListener*r>   zGive it a doc to infer shapes)X	componentlayer)r   r   rl   dropout)select_pipesr'   get_tok2vec_reftype__name__upstream_nameget_piper<   make_docr   r   E874formatvocabr   )ri   pretrain_configr>   original_tok2vecr   r   create_functionr<   rT   rT   rU   r\      s(   

r\   c                 C   sb   |d }|d u rd}d}ddg|dg}t | jd ||d| |j}|d r/||d }|S )Nr   zpTo use pretrained tok2vec weights, [pretraining.component] needs to specify the component that should load them.zcomponent can't be nullr)   )locrg   )r   errorsdescr   )r   r   r   r<   rH   )ri   r   tok2vec_componentr   errr   r   rT   rT   rU   r      s   r   c                   @   s   e Zd ZdddZdd ZdS )r_   @B c                 C   s:   d| _ d| _d| _t | _|| _t | _d| _d| _	d S )NrY   r   )
r.   	prev_lossr?   r   words_per_epochr/   time	last_timelast_updater@   )selfr/   rT   rT   rU   __init__   s   

zProgressTracker.__init__c           	      C   s   |  j |7  _ |  j|7  _tdd |D }| j|  |7  < |  j|7  _| j| j }|| jkre|t | j  }| j| _t | _| j | j	 }|| jt
| j ddt
|ddt|f}t| j | _	|S d S )Nc                 s   s    | ]}t |V  qd S ru   )len).0docrT   rT   rU   	<genexpr>   s    z)ProgressTracker.update.<locals>.<genexpr>r2   )widthr3   )r.   r@   sumr   r?   r   r/   r   r   r   _smart_roundr   r   )	r   rA   r.   rp   words_in_batchwords_since_updatewpsloss_per_wordstatusrT   rT   rU   rf      s(   


zProgressTracker.updateN)r   )r   
__module____qualname__r   rf   rT   rT   rT   rU   r_      s    

r_   r2   r4   figurer   max_decimalc                 C   sR   t tt| }||d  }|dkrtt| S t||}dt| d }||  S )z=Round large numbers as integers, smaller numbers as decimals.r   z%.f)r   r   r   min)r   r   r   n_digits	n_decimal
format_strrT   rT   rU   r      s   
r   )NNr   TF)T)r2   r4   )0r   r   collectionsr   pathlibr   typingr   r   r   r   r   rJ   	thinc.apir	   r
   r   r   r   r   thinc.configr   wasabir   r   r   schemasr   tokensr   utilr   r   r   exampler   r   boolrr   rd   r]   r   re   r\   r   r_   r   r   rT   rT   rT   rU   <module>   s     
"V

%
