o
    i@7                     @   s4  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlZd dlmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z* ddl(m+Z+ ddl,m-Z- e
rddl.m/Z/ dddede0ddfddZ1dddddddee dee dee2 ddf
ddZ3dd ddd!ee2ef d"e4ddfd#d$Z5ddd%ee2ef d&ee2ef de4fd'd(Z6de*j7d)d*ddd+ee d,e0d-e0d!ee2 d.e2d/e2ddfd0d1Z8e*j7d2d+ed3e0d.e2fd4d5Z9d6ee2ef de	fd7d8Z:d9d: Z;dS );    Nislice)Path)IOTYPE_CHECKINGAnyDictOptionalUnion)ConfigConfigValidationErrorfix_random_seedset_gpu_allocator   )ErrorsWarnings)Lookups)ConfigSchemaTraining)	DEFAULT_OOV_PROBOOV_RANKensure_pathget_sourced_components
load_modelload_model_from_configloggerregistryresolve_dot_names)Mode)Vectors   )get_tok2vec_ref)Language)use_gpuconfigr#   returnr!   c                   s  | }|  } d| d vrttjjddd| d vr$ttjjdd| d d d ur4t| d d  | d d }|dkrD|rDt| t| }t|dd	t	
d
 j  } tj| d td}|d |d g}t|d tsttjjdt|d ddt|d tsttjjdt|d ddt| |\}|d }|d   fdd|D }	t	
dj |	r؈j|	d t	
d|	 j|d W d    n1 sw   Y    jg  |	d8 |d dkrdt	d jfdd |d njfd!d |d t	
d"j W d    n	1 s"w   Y  jD ]H\}
}t|d#g D ]<}|jvr>q4| v rS|
 vrSt	tjj|
|d$ | vro|
 v ro|
|d% vrot	tj j|
|d$ q4q*S )&Nseedtrainingz[training] seed)valuegpu_allocatorz[training] gpu_allocatorr   T)	auto_fillzSet up nlp object from config)schematrain_corpus
dev_corpusztraining.train_corpus)fieldtype)descztraining.dev_corpus	optimizerfrozen_componentsc                    s   g | ]}| vr|qS  r3   ).0p)r2   r3   M/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/initialize.py
<listcomp>I   s    zinit_nlp.<locals>.<listcomp>zPipeline: %s)enablezResuming training for: %s)sgddisable
max_epochsr"   d   zDue to streamed train corpus, using only first %s examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labelsc                      s   t  S Nr   r3   )nlpsample_sizer,   r3   r6   <lambda>\   s    zinit_nlp.<locals>.<lambda>c                      s    S r>   r3   r3   )r?   r,   r3   r6   rA   _   s    z#Initialized pipeline components: %slistening_components)namelistenerannotating_components)!interpolate
ValueErrorr   E1015formatr   r   r   r   r   infor$   r   resolver   
isinstancestrr   E897r/   r   
pipe_namesselect_pipesresume_training_link_componentsdebug
initializepipelinegetattrwarningr   W087W086)r$   r#   
raw_config	allocatorsourcedT	dot_namesr-   r1   resume_componentsrC   procrD   r3   )r2   r?   r@   r,   r6   init_nlp$   s   

ra   )datalookupsvectorsr?   rb   rc   rd   c                C   s^  |r|| j _tdd|j t|}|d uret|}| j D ]}t	|_
q |D ]}d|v r/q(| j |d  }|jdi | q(t| j rQtdd | j D d }nt}| j jd|i td	t| j  td
 |d uryt| | td| | jdi }	t|	dkrt| j jjdgd}
|	 D ]\}}|
|krttjj|d qtd d S )NzAdded vocab lookups: %sz, settingsorthc                 s   s    | ]}|j V  qd S r>   )prob)r4   lexr3   r3   r6   	<genexpr>   s    zinit_vocab.<locals>.<genexpr>r   oov_probz%Added %d lexical entries to the vocabzCreated vocabularyzAdded vectors: %s_sourced_vectors_hashesr   strings)excluderC   z Finished initializing nlp objectr3   )vocabrc   r   rJ   jointablesr   srsly
read_jsonlr   rank	set_attrslenminr   cfgupdateload_vectors_into_modelmetapophashrd   to_bytesitemswarningswarnr   W113rI   )r?   rb   rc   rd   	data_path	lex_attrslexemeattrsrj   sourced_vectors_hashesvectors_hashsourced_componentsourced_vectors_hashr3   r3   r6   
init_vocabr   s<   




r   T)add_stringsrC   r   c          
   
   C   s   zdg}|s| d t|| j|d}W n ty2 } zd| }d}tj|||d}|dd}~ww t|jj dkrE|jjjt	j
ksV|jjjd dkr`|jjjt	j
kr`ttjj|d	 | jD ]}	| jjj|	jt|	_qcdS )
zHLoad word vectors from an installed model or path into a model instance.rc   rl   )ro   rm   z$Config validation error for vectors zThis typically means that there's a problem in the config.cfg included with the packaged vectors. Make sure that the vectors package you're loading is compatible with the current version of spaCy.)titler0   Nr   rn   )appendr   ro   r   
from_errorrv   rd   keysmodeVectorsModefloretshaper   rW   r   W112rI   key2rowgetrf   r   rt   )
r?   rC   r   rm   vectors_nlper   r0   errrh   r3   r3   r6   rz      s*   


rz   pretrain_configinit_configc                 C   s   |}|}d }t |d }|d urA| s(d| }ddg|dg}t| j|d|d}	|	 }W d    n1 s<w   Y  |d urWt| |}
|
| t	d| dS d	S )
Ninit_tok2veczcan't find pretrained tok2vec: rT   )locmsg)r$   errorsrbz!Loaded pretrained weights from %sTF)
r   existsr   r$   openreadr    
from_bytesr   rJ   )r?   r   r   PIweights_datar   r   r   file_layerr3   r3   r6   r      s$   



r   ORTH)rC   r   attrvectors_loctruncatepruner   r   c                C   s  t |}|rE|jd drE|dkrtdt| jjt|	dd| j_
| jD ]}|jr>|jtkr>| jj
j|j|jd q)| j  n_|r^td| t|||d	\}}	}
td
| nd\}}	|	d urz|tjkrz|	D ]}|| jvry| j|  qm|d ur|tjkrtd| jj||d|
| j_
nt| jj||	|d| j_
| j  |d u r| jd  d| jd  d| jj
_n|| jj
_| jj
j| jd d< |dkr|tjkr| j| d S d S d S )Nr"   z.npzr   z@ORTH is the only attribute supported for vectors in .npz format.r   )rl   rb   )rowzReading vectors from %sr   zLoaded vectors from %s)NN)rl   rb   r   )rl   rb   r   r   lang_rC   z.vectorsrd   r   r3   )r   partsendswithrG   r   ro   rl   numpyloadr   rd   rt   r   addrf   deduplicate_vectorsr   rJ   read_vectorsr   r   r{   rC   prune_vectors)r?   r   r   r   rC   r   r   rh   vectors_datavector_keysfloret_settingswordr3   r3   r6   convert_vectors   sf   







$
r   r   truncate_vectorsc                C   sr  t | }t| }tdd |d d D }i }|tjkrPt|dkr(tddt|d t|d t|d t|d	 |d
 |d d}|dkrOtt	j
nt|dksXJ |dkrb||d f}tj|dd}g }ttj|d dD ]?\}	}
|
 }
|
d|jd }|d}t||jd krtt	jj|	| dtj|dd||	< || |	|d kr nqt|||fS )Nc                 s       | ]}t |V  qd S r>   intr4   sizer3   r3   r6   ri         zread_vectors.<locals>.<genexpr>r      z^Invalid header for floret vectors. Expected: bucket dim minn maxn hash_count hash_seed BOW EOWr                  )r   minnmaxn
hash_count	hash_seedboweowr   f)r   dtyper:    r   )line_numr   )r   )ensure_shapenextsplittupler   r   rv   rG   r   r   E860r   zeros	enumeratetqdmrstriprsplitr   r|   E094rI   asarrayr   )r   r   r   r   header_partsr   r   r   vectors_keysilinepiecesr   r3   r3   r6   r     sJ   




	



r   r   c                 C   s   t | } tt| rtt| dS | jd dr(dd tt| dD S | jd drIt	t| }|
 }||d }d	d |D S | jdd
dS )z%Handle .gz, .tar.gz or unzipped fileszr:gzr"   gzc                 s       | ]}| d V  qdS utf8Ndecoder4   r   r3   r3   r6   ri   D      zopen_file.<locals>.<genexpr>rzipr   c                 s   r   r   r   r   r3   r3   r6   ri   I  r   r   )encoding)r   tarfile
is_tarfilerM   r   r   r   gzipzipfileZipFilenamelist)r   zip_filenamesr   r3   r3   r6   	open_file>  s   r   c                 c   s    t | }t|}ztdd | dd D }W n ty%   d}Y nw |dur3|V  |E dH  n(t| d }d}|D ]}|d7 }q?| d| V  t | }|E dH  |  |  dS )zEnsure that the first line of the data is the vectors shape.
    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.
    c                 s   r   r>   r   r   r3   r3   r6   ri   V  r   zensure_shape.<locals>.<genexpr>Nr   r   r   )r   r   r   r   rG   rv   close)r   lines
first_liner   widthlengthr   lines2r3   r3   r6   r   N  s(   "

r   )<r   r   r   r   	itertoolsr   pathlibr   typingr   r   r   r   r	   r
   r   rr   r   	thinc.apir   r   r   r   r   r   r   rc   r   schemasr   utilr   r   r   r   r   r   r   r   r   rd   r   r   r   pretrainr    languager!   r   ra   rM   r   boolrz   r   defaultr   r   r   r   r3   r3   r3   r6   <module>   s     ,Q
)

"


	
A
+