o
    iE                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddlm2Z2m3Z3 ddl4m5Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? dZ@dZAdZBdZCdZDdZEdZFe<jGddddde;jGdddddd e8d!d"ddd#e9dd$d%d&d'd(e9d)d*d+d,d(e9d)d-d.d/d(e9d)d0d1d2d(fd3ejHd4ed5ee d6eId7eId8eIfd9d:ZJi d)d)ddd;d4ed<eeKef d6eId7eId8eId=eIfd>d?ZLd@edAedBdfdCdDZMdEee. dFe
eK dGedHeIdBeeKef f
dIdJZNeddKe	eK dLed) dBeKfdMdNZOedKe	eeKePf  dLed dBeKfdOdNZO	)ddKee	eK e	eeKePf  f dLeIdBeKfdPdNZOddQeePeQf dReIdBeKfdSdTZR	U	Vddee. dWeKdXedY dZeeK dBePf
d[d\ZSdGed]eKdBeeK fd^d_ZTdGedBeeKeeK f fd`daZUdbe
dBeQfdcddZVdeeeKeQf dfedBeQfdgdhZWddieIdBefdjdkZXdledmedBeQfdndoZYdpe
e dKe
eK dBe
e fdqdrZZdEe
e. dseeKef dZeKdBeeKef fdtduZ[dveeKef fdwdxZ\eFfdyedBeePeQf fdzd{Z]d|eePeQf d}ePdBeePeQf fd~dZ^dS )    N)Counter)Path)AnyDictIterableListOptionalSequenceSetTupleUnioncastoverload)MESSAGESPrintermsg   )util)Literal)Language)
Morphology)MorphologizerSpanCategorizerTrainablePipe)	EditTrees)nonproj)	DELIMITER)ConfigSchemaTraining)Exampleremove_bilu_prefix)get_sourced_components)registryresolve_dot_names)Mode   )ArgOpt_format_numberapp	debug_cliimport_codeparse_config_overridesshow_validation_error2      d   i  Z   dataT)allow_extra_argsignore_unknown_options)context_settings
debug-data)r4   hidden.zPath to config file)helpexists
allow_dashz--code-pathz--codez-czNPath to Python file with additional code (registered functions) to be imported)r7   Fz--ignore-warningsz-IWz+Ignore warnings, only show stats and errorsz	--verbosez-Vz-Print additional information and explanationsz--no-formatz-NFzDon't pretty-print the resultsctxconfig_path	code_pathignore_warningsverbose	no_formatc                 C   s@   | j jdkrtd t| j}t| t|||||dd dS )a  
    Analyze, debug and validate your training and development data. Outputs
    useful stats, and can help you find problems like invalid entity annotations,
    cyclic dependencies, low data labels and more.

    DOCS: https://spacy.io/api/cli#debug-data
    r5   zThe debug-data command is now available via the 'debug data' subcommand (without the hyphen). You can run python -m spacy debug --help for an overview of the other available debugging commands.Fconfig_overridesr=   r>   r?   silentN)commandnamer   warnr+   argsr*   
debug_data)r:   r;   r<   r=   r>   r?   	overrides rI   H/home/ubuntu/.local/lib/python3.10/site-packages/spacy/cli/debug_data.pydebug_data_cli?   s   

rK   r@   rA   rB   c          I         s  t || |d}t| " tj| |d}t|j }tj|d t	d}	W d    n1 s1w   Y  t
|}
|	d   fdd|
D }j}fddjD }|d	 |	d
 |	d g}t||\}fdd |d t}t|}|d t||ddt||dd}t||dd}d }|d }|	d  |d |dj  |dd|  |r|dd|   r|dd   |t| d |t| d t|s|d t||}|r|| d n|d |sFt|tk rFdt| d }t|tk r3|| n|| |jd!t d"t d |d# |d$ d% }|| d&td'  d( d) d*krqd) }|| d+ |d) d*kr|d) }|| d, d' d-}|jd.t|dd/ |d# tjjr
jjj t!j"kr|d0tjj d1jj# d2jjj$ d3jjj% d4	 nG|tjj d5jjj& d6jj# d7 t'd8 ( }|d9)|d:|d%    |jd;)td8 d-dd/|d# n|d< d=|v sd>|v rt*}d}d}|d? |j+|d@dAgddB |jdC|d# d= , D ]\}} |jdD| dt| , dd/ |d# q=fdEdF|- D }!|!, D ]=\}} | , D ]d\}"}#||- v }$|$r|"|| vr|dG|" dH| dI |#t.kr|dJ|" dK| dL|# d  d}|/dM t0||"d=|}%W d    n	1 sw   Y  |%d*kr|dN|" dO d}qn|/dP t1||}&W d    n	1 sw   Y  |dQ| dO |dR t2|& t3dS | }'t4|'t5dT}(|dUt5 dVt6|(-  dW|&dX  dY|&dZ  d[t7|( d\ |jd]t7|' |d# |&d^ t8k rG|d_ n|d` |&da ( })t'|)t9 }*dbd |*d-D }+|jdc)t|+|d# |&dd t:k r{|de n|df |&dg ( },t'|,t9 }-dhd |-d-D }.|jdi)t|.|d# qe|r|jdjt. dk|d# n|dl |r|jdm|d# n|dn do|v rt;dpdq do D }/do }0t<do}1d}d}d}2d}3|dr |t|1 ds |0d3 }4||4 dt |/D ]}"t|"d*kr|du qdvd |0 D }5t|5dd/}5|jdw|5 |d# |1|/ }6|6rA|dxt|6 dy dz rR|dz  d{ d}2|/D ]F}"|0|" t.kr|dJ|" dL|0|"  d  d}|/dM t0||"do}%W d    n	1 sw   Y  |%d*kr|dN|" dO d}qTd| r|d|  d} d}3|s|dl |s|dn |2s|d~ |3s|d |r|jdt. dk|d# |r|jd|d# |2r|d d|v r|d t<d}/|dt|/ ds |jdt|/ |d# |/t;d  }6|6r(|dxt|6 dy t;d t;|d krI|dtd  dt|d  dy t|/dk rU|d d d*ksc|d d*krh|d d d*krt|d |d d*kr|d d|v r|d t<d}/|dt|/ ds |jdt|/ |d# |/t;d  }6|6r|dxt|6 dy t;d t;|d kr|dtd  dt|d  dy d d*ks|d d*kr|d d d*kr|d d*kr|d n|d |d d*kr|d d|v r|d t=d ,  \}7}8|t|7 d t>?|8}9|9|9'  }9|9 t>@|9 ' t>@t|7 }:||: d t<d}1t;|7}/|1|/ }6|6rr|dxt|6 dy td  dd/}5|j|5|d# d|v r|d dd d D }7t<d}1|t|7 d t;|7}/|1|/ }6|6r|dxt|6 dy td  dd/}5|j|5|d# d|v rSd}|d |dd  dd% d  dd d td  };|;dk r|d|;dd dd d D }<dd |d D }=dd |d D }>|d d*kr5|d }?|d|? d |d d*krI|d }?|d|? d |t|= d |t|< d t|d  dd/}5|j|5|d# |d D ]}"|d |" tAkr|dJ|" dL|d |"  d  d}qrg }@d D ]}"d |" tAkrtB|"v r|@C|" dd |"   qt|@d*kr|dt|@ d |jdd|@|d# d}t;|<t;|> r|jddt;|<t;|> |d# t;|>t;|< r
|jddt;|>t;|< |d# |r|jdtA d|d# t|d dkr/|dd|d  d d d*krA|dd  d d d*krS|dd  d d|v 	r,|d¡ d }A|d }B|t|A dĝ |t|B dŝ |B|A }Ct|Cd*krt|Ct|B }D|t|C d|Dd: ddǝ n|dȡ d d*krd }E||E dʝ |d d*kr|d }E||E d˝ d d*kr܈d }E||E d͝ n|dΡ |d d*kr|d }E||E dϝ n|dС d d*k	rd }E||E dҝ n|dӡ |d d*k	r'|d }E||E dԝ n|dա |d֡ |jDtEjF }F|jDtEjG }G|jDtEjH }H|F	rX||F d|Fdk	rRdndٛ dڝ |G	rl||G d|Gdk	rgdndܛ  |H	r||H d|Hdk	r{dndޛ  tIJd d S d S )N)no_printprettyr=   )rH   training)schemafrozen_componentsc                    s   g | ]}| vr|qS rI   rI   ).0p)rP   rI   rJ   
<listcomp>       zdebug_data.<locals>.<listcomp>c                    s   g | ]}  |jqS rI   get_pipe_metafactory)rQ   pipenlprI   rJ   rS      rT   zData file validationtrain_corpus
dev_corpusc                      s    S NrI   rI   )rZ   r[   rI   rJ   <lambda>   s    zdebug_data.<locals>.<lambda>z%Pipeline can be initialized with datazCorpus is loadableT)	make_projFtextszTraining statsz
Language: zTraining pipeline: , z!Components from other pipelines: zFrozen components: z training docsz evaluation docszNo evaluation docsz* training examples also in evaluation dataz/No overlap between training and evaluation dataz0Low number of examples to train a new pipeline ()z!It's recommended to use at least z examples (minimum )showzVocab & Vectorsn_wordsz total word(s) in the data (wordsz unique)n_misaligned_wordsr   z' misaligned tokens in the training dataz" misaligned tokens in the dev data
   z10 most common words: )countszfloret vectors with z
 vectors, z dimensions, -z char n-gram subwordsz
 vectors (z unique keys, z dimensions)words_missing_vectorsz3{} words in training data without vectors ({:.0f}%)r/   z(10 most common words without vectors: {}z&No word vectors present in the packagespancatspancat_singlelabelzSpan Categorizationz	Spans KeyLabels)headerdividerzLabel counts in train data: zKey: c                    s   i | ]	}| d  | qS )rk   rI   )rQ   	spans_key)gold_train_datarI   rJ   
<dictcomp>   s    zdebug_data.<locals>.<dictcomp>zLabel 'z-' is not present in the model labels of key 'z*'. Performance may degrade after training.z"Low number of examples for label 'z
' in key '' (zAnalyzing label distribution...z)No examples for texts WITHOUT new label ''z!Obtaining span characteristics...z$Span characteristics for spans_key 'z8SD = Span Distinctiveness, BD = Boundary Distinctivenessspans_length)	thresholdzOver z % of spans have lengths of 1 -- z (min=
min_lengthz, max=
max_lengthz%). The most common span lengths are: z. If you are using the n-gram suggester, note that omitting infrequent n-gram lengths can greatly improve speed and memory usage.z#Full distribution of span lengths: avg_sdz5Spans may not be distinct from the rest of the corpusz.Spans are distinct from the rest of the corpusp_spansc                 S      g | ]\}}|qS rI   rI   rQ   w_rI   rI   rJ   rS   =      z10 most common span tokens: {}avg_bdz<Boundary tokens are not distinct from the rest of the corpusz8Boundary tokens are distinct from the rest of the corpusp_boundsc                 S   r{   rI   rI   r|   rI   rI   rJ   rS   M  r   z'10 most common span boundary tokens: {}z<To train a new span type, your data should include at least z instances of the new labelz&Good amount of examples for all labelszpTraining data should always include examples of spans in context, as well as examples without a given span type.z5Examples without occurrences available for all labelsnerc                 s   s    | ]	}|d vr|V  qdS )Ori   NNrI   rQ   labelrI   rI   rJ   	<genexpr>j  s    zdebug_data.<locals>.<genexpr>zNamed Entity Recognitionz	 label(s)z) missing value(s) (tokens with '-' label)zEmpty label found in train datac                 S   s    g | ]\}}|d kr||fqS )ri   rI   )rQ   r   countrI   rI   rJ   rS   {  s
    zLabels in train data: z|Some model labels are not present in the train data. The model performance may be degraded for these labels after training: .ws_entsz  invalid whitespace entity spansboundary_cross_entsz, entity span(s) crossing sentence boundariesz<No entities consisting of or starting/ending with whitespacez(No entities crossing sentence boundariesz>To train a new entity type, your data should include at least zuTraining data should always include examples of entities in context, as well as examples without a given entity type.z`Entity spans consisting of or starting/ending with whitespace characters are considered invalid.textcatz'Text Classification (Exclusive Classes)zText Classification: zLabels: catszWPotential train/dev mismatch: the train and dev labels are not the same. Train labels: z. Dev labels: r   zThe model does not have enough labels. 'textcat' requires at least two labels due to mutually-exclusive classes, e.g. LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary classification task.n_cats_bad_valueszMUnsupported values for cats: the supported values are 1.0/True and 0.0/False.n_cats_multilabelzThe train data contains instances without mutually-exclusive classes. Use the component 'textcat_multilabel' instead of 'textcat'.zThe dev data contains instances without mutually-exclusive classes. Use the component 'textcat_multilabel' instead of 'textcat'.textcat_multilabelz Text Classification (Multilabel)zPotential train/dev mismatch: the train data contains instances without mutually-exclusive classes while the dev data contains only instances with mutually-exclusive classes.zThe train data contains only instances with mutually-exclusive classes. You can potentially use the component 'textcat' instead of 'textcat_multilabel'.zTrain/dev mismatch: the dev data contains instances without mutually-exclusive classes while the train data contains only instances with mutually-exclusive classes.taggerzPart-of-speech Taggingtagsz label(s) in train dataz  is the normalised label entropymorphologizerzMorphologizer (POS+Morph)c                 S      g | ]}|qS rI   rI   r   rI   rI   rJ   rS   5      morphsparserzDependency ParsingzFound n_sentsz' sentence(s) with an average length of z.1fz words.g?zThe training data contains z.2fz sentences per document. When there are very few documents containing more than one sentence, the parser will not learn how to segment longer texts into sentences.c                 S   r   rI   rI   r   rI   rI   rJ   rS   Z  r   depsc                 S   r   rI   rI   r   rI   rI   rJ   rS   [  s    c                 S   r   rI   rI   r   rI   rI   rJ   rS   ^  r   	n_nonprojz  nonprojective train sentence(s)z nonprojective dev sentence(s)z% label(s) in projectivized train dataz: zLow number of examples for z label(s) in the projectivized dependency trees used for training. You may want to projectivize labels such as punct before training in order to improve parser performance.z3Projectivized labels with low numbers of examples: z7The following labels were found only in the train data:z5The following labels were found only in the dev data:z5To train a parser, your data should include at least z instances of each label.rootsr$   zMultiple root labels (zq) found in training data. spaCy's parser uses a single root label ROOT so this distinction will not be available.z. nonprojective projectivized train sentence(s)n_cyclesz, projectivized train sentence(s) with cyclestrainable_lemmatizerzTrainable Lemmatizerlemmatizer_treesz. lemmatizer trees generated from training dataz) lemmatizer trees generated from dev dataz lemmatizer trees (z7% of dev trees) were found exclusively in the dev data.z/All trees in dev data present in training data.n_low_cardinality_lemmasz) training docs with 0 or 1 unique lemmas.z$ dev docs with 0 or 1 unique lemmas.no_lemma_annotationsz) training docs with no lemma annotations.z)All training docs have lemma annotations.z$ dev docs with no lemma annotations.z$All dev docs have lemma annotations.partial_lemma_annotationsz. training docs with partial lemma annotations.z2All training docs have complete lemma annotations.z) dev docs with partial lemma annotations.z-All dev docs have complete lemma annotations.Summary checkchecksz passedwarningwarningserrorerrors)Kr   r,   r   load_configload_model_from_configconfiginterpolater!   resolver   r    
pipe_namesro   r"   
initializegoodlist_compile_goldtextlangjoinlenfailintersectionrE   BLANK_MODEL_THRESHOLDBLANK_MODEL_MIN_THRESHOLDinfomost_common_format_labelsvocabvectorsmodeVectorsModefloretvectors_lengthminnmaxnn_keyssumvaluesformat_get_labels_from_spancattableitemskeysNEW_LABEL_THRESHOLDloading_get_examples_without_label_get_span_characteristics_print_span_characteristics_get_spans_length_freq_dist_filter_spans_length_freq_dist SPAN_LENGTH_THRESHOLD_PERCENTAGEmax_format_freqsSPAN_DISTINCT_THRESHOLDr   BOUNDARY_DISTINCT_THRESHOLDset_get_labels_from_modelzipnumpyarraylog2DEP_LABEL_THRESHOLDr   appendrh   r   GOODWARNFAILsysexit)Ir;   rA   r=   r>   r?   rB   r   cfgr   Tsourced_componentsresume_componentspipelinefactory_names	dot_namesr\   train_datasetdev_datasetgold_train_unpreprocessed_datagold_dev_datatrain_texts	dev_textsoverlapr   rd   n_misalignedmost_common_wordsn_missing_vectorsmodel_labels_spancathas_low_data_warninghas_no_neg_warningrp   data_labelsdata_labels_in_componentr   r   spans_key_in_modelneg_docsspan_characteristics_span_freqs_filtered_span_freqsrz   all_span_tokensmost_common_spansr   all_span_bound_tokensmost_common_boundslabelslabel_countsmodel_labelshas_ws_ents_errorhas_boundary_cross_ents_warningmissing_valueslabels_with_countsmissing_labels
label_listrh   rR   norm_entropysents_per_doclabels_trainlabels_train_unpreprocessed
labels_devr   rare_projectivized_labelstrees_train	trees_devdev_not_trainpctngood_countswarn_countsfail_countsrI   )rP   rq   rZ   r[   rJ   rG   j   s,  	


















































"
















$""rG   	file_pathr   returnc                 C   s   | j d }| jdkr2|d| d t| }W d    n1 s#w   Y  |d|  |S | jdkr_|d| d t| }W d    n1 sPw   Y  |d|  |S |jd| j dd	d
 d S )Nz.jsonzLoading z...zLoaded z.jsonlzCan't load file extension zExpected .json or .jsonlr$   )exits)partssuffixr   srsly	read_jsonr   
read_jsonlr   )r  r   	file_namer1   rI   rI   rJ   
_load_file  s$   




r(  examplesr   rZ   r_   c           $      C   s  i dt  dt  dt  dt  dt  dt  dt  dt d	t d
t dt dddddddddt  ddddddt t dddd	}d|v rYt|jj}| D ]p}|j}|j}dd |D }	|d |	 |d  t	|	7  < |j
}
|D ]}|j rq|
jj|j dkr|d  d7  < q|d |j t	|jjrdd |D D ]}|jj| |jjvr|d |g qd|v r'| }t| D ]R\}}|d u rq|dr|| jr|d  d7  < |drt|}|d |  d7  < || r|dr|d  d7  < q|dkr&|d d  d7  < qd|v s1d|v r^t|jj D ]#}||d vrJt  |d |< t|jj| D ]\}}|jd u r^qR|d | |j  d7  < qR||d	 vr{t |d	 |< |j| D ],}|jd u rq|j|d	 | vrg |d	 | |j< |d	 | |j t	| q||d
 vrt |d
 |< |j| D ]"}|j|d
 | vrg |d
 | |j< |d
 | |j | qd}||d vrt |d |< |j| D ]c}|j|d | vrg g d|d | |j< t|D ]E}|j|d  }|dkr6|d | |j d  |||d   |j |d  }|t	|krX|d | |j d! ||d |  qqq9d"|v shd#|v r|d |j! t"d$d% |j!# D r|d&  d7  < t|j!# $ddkr|d'  d7  < d(|v r|j%d)d*d+}|d d,d |D  d-|v r
|j%d.d*d+}|j%d/d*d+}t&||D ]?\}}|d u s|d u rڐq|d0kr|d0krqt'(|}|r||t)j*< |jjj|jjj+| }|d |g qd1|v rc|j,|d2\}}|d d3d |D  tt&||D ]\}\}} | |krE|d |g |d  d7  < q*t-.|rU|d4  d7  < t-/|rc|d5  d7  < d|v rt0d6d% |D r{|d7  d7  < q[t"d8d% |D r|d9  d7  < t }!|D ]#}|j1dkr|!|j1 ||j|j2}"|3|"}#|d: |# qt	|!d;k rt	|dkr|d<  d7  < q[|S )=Nr   r   r   r   r   re   r   rk   ru   spans_per_typesb_per_typer   r   r   rd   rf   rj   r   )	r   r   r   r   r`   r   r   r   r   r   c                 S      g | ]}|j qS rI   r   rQ   xrI   rI   rJ   rS   6      z!_compile_gold.<locals>.<listcomp>r$   r`   c                 S   r,  rI   r-  )rQ   trI   rI   rJ   rS   A  r0  )B-U-L-)r2  r3  )zI-r4  ri   rl   )startendr5  r6  r   r   c                 s   s    | ]}|d vV  qdS ))r   r$   NrI   )rQ   valrI   rI   rJ   r         z _compile_gold.<locals>.<genexpr>r   r   r   TAGT)	as_stringc                 S      g | ]}|d ur|qS r]   rI   r.  rI   rI   rJ   rS     rT   r   POSMORPH r   )projectivizec                 S   r;  r]   rI   r.  rI   rI   rJ   rS     rT   r   r   c                 s       | ]}|j d kV  qdS r   NlemmarQ   tokenrI   rI   rJ   r         r   c                 s   r@  rA  rB  rD  rI   rI   rJ   r     rF  r   r   r   r   )4r   dictr   r   r   strings	reference	predictedupdater   	alignmentorth_isspacex2ylengthsiaddr   r   get_aligned_sent_starts	enumerateget_aligned_ner
startswithis_spacer   r   spansr   label_r   ranger5  r6  r   anyr   r   get_alignedr   r   feats_to_dictr   POS_FEAT
morphologyget_aligned_parser   is_nonproj_treecontains_cycleallrC  lemma_tree_to_str)$r)  r   rZ   r_   r1   treeseggolddocvalid_wordsalignrE  wordsent_startsrQ  r   combined_labelrp   spanwindow_sizeoffsetsb_start_idx
sb_end_idxr   pos_tagsr   posmorph
label_dictaligned_headsaligned_depsdephead	lemma_settree_idtree_strrI   rI   rJ   r     sf  	




 








r   r  rh   c                 C      d S r]   rI   r  rh   rI   rI   rJ   r     s   r   c                 C   r  r]   rI   r  rI   rI   rJ   r     s   c                 C   sH   |rd dd tttttf  | D S d dd ttt | D S )Nra   c                 S   s"   g | ]\}}d | d| dqS )rt   rs   rb   rI   rQ   lcrI   rI   rJ   rS     s   " z"_format_labels.<locals>.<listcomp>c                 S   s   g | ]}d | d qS )rt   rI   rQ   r  rI   rI   rJ   rS     rT   )r   r   r   r   strintr  rI   rI   rJ   r     s
   freqssortc                 C   sL   |r
t t|  } dd |  D }ddd tttttf  |D S )Nc                 S   s   g | ]
\}}t ||fqS rI   )r  rQ   kvrI   rI   rJ   rS         z!_format_freqs.<locals>.<listcomp>ra   c                 S   s    g | ]\}}| d | dqS )z (z%)rI   r  rI   rI   rJ   rS     s     )	rG  sortedr   r   r   r   r   r  float)r  r  _freqsrI   rI   rJ   r     s   r   r   scr   	component)r   rk   rp   c                 C   sj   d}| D ].}|dkrdd |  D }|dkr*||jjv r(dd |jj| D ng }||vr2|d7 }q|S )Nr   r   c                 S   s   g | ]
}|d vrt |qS )r   )r   r   rI   rI   rJ   rS     s
    z/_get_examples_without_label.<locals>.<listcomp>rk   c                 S   r,  rI   )rY  )rQ   ro  rI   rI   rJ   rS     r0  r$   )rU  rI  rX  )r1   r   r  rp   r   rg  r  rI   rI   rJ   r     s   r   factory_namec                    sN    fddj D }t }|D ]}|}t|tsJ ||j q|S )Nc                    s    g | ]} |j kr|qS rI   rU   rQ   	pipe_namer  rZ   rI   rJ   rS     
    z*_get_labels_from_model.<locals>.<listcomp>)r   r   get_pipe
isinstancer   rK  r  )rZ   r  r   r  r  rX   rI   r  rJ   r     s   
r   c                    sf    fdd j D }i }|D ]"} |}t|tsJ |j|vr't ||j< ||j |j q|S )Nc                    s    g | ]}  |jd v r|qS ))rk   rl   rU   r  rY   rI   rJ   rS     r  z,_get_labels_from_spancat.<locals>.<listcomp>)r   r  r  r   keyr   rK  r  )rZ   r   r  r  rX   rI   rY   rJ   r     s   


r   r  c                 C   s"   t t dd | D t|  S )z Compute geometric mean of a listc                 s   s    | ]}t |V  qd S r]   )mathlog)rQ   rQ  rI   rI   rJ   r   %  rF  z_gmean.<locals>.<genexpr>)r  expfsumr   )r  rI   rI   rJ   _gmean#  s   "r  metricfrequenciesc                    s*   t  fdd|  D }|t    S )Nc                 3   s     | ]\}}| |  V  qd S r]   rI   )rQ   	span_typevaluer  rI   rJ   r   )  s    z_wgt_average.<locals>.<genexpr>)r   r   r   )r  r  totalrI   r  rJ   _wgt_average(  s   r  	normalizec                    st   t  }| D ]}|D ]}|j dddd}||  d7  < q	q|r8t| d t  fdd| D }|S )z2Get the frequency distribution given a set of Docsz``"z''r$           c                    s   i | ]	\}}||  qS rI   rI   r  r  rI   rJ   rr   7  s    z%_get_distribution.<locals>.<dictcomp>)r   r   lowerreplacer   r   r   )docsr  word_countsri  rE  r1  rI   r  rJ   _get_distribution-  s   r  rR   qc                 C   s4   d}|   D ]\}}||t|||   7 }q|S )zHCompute the Kullback-Leibler divergence from two frequency distributionsr  )r   r  r  )rR   r  r  rl  p_wordrI   rI   rJ   _get_kl_divergence;  s   r  	span_datac                    s    fdd|D }t | S )z*Compile into one list for easier reportingc                    s*   i | ]   gt  fd dD  qS )c                 3   s    | ]	}t |  V  qd S r]   )r'   )rQ   dr   rI   rJ   r   F  s    z._format_span_row.<locals>.<dictcomp>.<genexpr>)r   )rQ   r  r  rJ   rr   E  s    z$_format_span_row.<locals>.<dictcomp>)r   r   )r  r  r  rI   r  rJ   _format_span_rowC  s   
r  compiled_goldc                    s*  |d | }dd |d |   D }dd |d |   D }dd |d |  D }d	d |d |  D }td
d | D dd dd |d |   D }dd |d |   D }	 fdd|  D }
 fdd|	  D }|
|||t|t|t|
|t||t||t| ||	dS )zObtain all span characteristicsrk   c                 S      i | ]	\}}|t |qS rI   )r  )rQ   r   r  rI   rI   rJ   rr   R      z-_get_span_characteristics.<locals>.<dictcomp>ru   c                 S   r  rI   r   rQ   r   rX  rI   rI   rJ   rr   V  r  r*  c                 S      g | ]}t |qS rI   )minr  rI   rI   rJ   rS   Z  r   z-_get_span_characteristics.<locals>.<listcomp>c                 S   r  rI   )r   r  rI   rI   rJ   rS   [  r   c                 S   r,  rI   )rI  )rQ   rg  rI   rI   rJ   rS   ^  r0  Tr  c                 S   s   i | ]\}}|t |d dqS )Tr  r  r  rI   rI   rJ   rr   _  s    c                 S   s*   i | ]\}}|t |d  |d  ddqS )r5  r6  Tr  r  )rQ   r   sbrI   rI   rJ   rr   c  s    r+  c                       i | ]
\}}|t | qS rI   r  rQ   r   	freq_distp_corpusrI   rJ   rr   i      
c                    r  rI   r  r  r  rI   rJ   rr   m  r  )sdbdr*  rP  rw   rx   ry   r   
avg_lengthr  rz   r   )r   r   r  r  r   r  r   r   )r)  r  rp   r   span_lengthr*  min_lengthsmax_lengthsrz   r   span_distinctivenesssb_distinctivenessrI   r  rJ   r   L  sF   


r   r   c              	   C   s   d}t dt dd | d D }| d | d | d | d	 g}t|| d d
}| d | d | d g}dgdd |D  dg }tj|||ddgdgt|d   |d dS )z+Print all span characteristics into a table)z	Span TypeLengthSDBDN   c                 s   s    | ]}t |V  qd S r]   r  r   rI   rI   rJ   r     r8  z._print_span_characteristics.<locals>.<genexpr>r  rP  r  r  r*  )r  r  r  ry   r   zWgt. Averagec                 S   s   g | ]
}d  t|dqS )z{:.2f}r   )r   round)rQ   frI   rI   rJ   rS     r  z/_print_span_characteristics.<locals>.<listcomp>ri   Tr  rr$   )footerrn   ro   alignsmax_colN)r   r  r   r   r   )r   headersr  
table_datar   footer_datar  rI   rI   rJ   r     s0   
r   length_dictc                 C   s   g }|   D ]	\}}|| qt }|D ]}||r%||  d7  < qd||< qi }| D ]\}}	|	t| d }
t|
d}
|
||< q0|S )zDGet frequency distribution of spans length under a certain thresholdr$   g      Y@r   )r   extendr   getr   r   r  )r  rv   all_span_lengthsr~   rP  r  rQ  freq_dist_percentager  r   
percentagerI   rI   rJ   r     s   



r   r  rv   c                 C   s<   d}i }|   D ]\}}||kr |S |||< ||7 }q|S )zFilter frequency distribution with respect to a threshold

    We're going to filter all the span lengths that fall
    around a percentage threshold when summed.
    r  )r   )r  rv   r  filtered_freq_distr  distrI   rI   rJ   r     s   
r   )F)T)r   r  )_r  r   collectionsr   pathlibr   typingr   r   r   r   r   r	   r
   r   r   r   r   r   r$  typerwasabir   r   r   r>  r   compatr   languager   r_  r   r   r   r   r   (pipeline._edit_tree_internals.edit_treesr   pipeline._parser_internalsr   "pipeline._parser_internals.nonprojr   schemasr   rN   r   r   training.initializer    r!   r"   r   r#   r   _utilr%   r&   r'   r(   r)   r*   r+   r,   r   r   r   r   r   r   r   rC   ContextboolrK   r  rG   r(  r   r   r  r  r   r   r   r   r  r  r  r  r  r   r   r   r   rI   rI   rI   rJ   <module>   s4   4(
&

     

 <"
 
"	


6$



