o
     ¢i$(  ã                   @   s¦   d dl Z d dlmZ ddlmZmZmZ ddlmZm	Z	 ddl
mZ ddlmZ 			
		
	
ddd„Zdd„ Z	
	
		ddd„Zddd„Z	
	
		
ddd„Zdd„ ZdS )é    N)ÚPrinteré   )ÚDocÚSpanÚToken)Úbiluo_tags_to_spansÚiob_to_biluo)ÚVocabé   )Ún_sents_infoé
   Fc                 k   s|    d}t |d}t||ƒ t| ||||d}	g }
|	D ]}|
 |¡ t|
ƒ| dkr1t |
¡V  g }
q|
r<t |
¡V  dS dS )aJ  
    Convert conllu files into JSON format for use with train cli.
    append_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    Extract NER tags if available and convert them so that they follow
    BILUO and the Wikipedia scheme
    z%^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$)Úno_print)Úappend_morphologyÚner_tag_patternÚner_mapÚmerge_subtokensr   N)r   r   Úread_conllxÚappendÚlenr   Ú	from_docs)Ú
input_dataÚn_sentsr   r   r   r   Ú_ÚMISC_NER_PATTERNÚmsgÚ	sent_docsÚsent_docs_to_mergeÚsent_doc© r   ú\/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/converters/conllu_to_docs.pyÚconllu_to_docs   s*   €

û
€ÿr    c                 C   s¢   |   ¡  d¡D ]G}|  ¡  d¡}|rN|d  d¡r%| d¡ |d  d¡s|D ]&}| d¡}|\
}}}}	}
}}}}}| d¡D ]}t ||¡rL   dS q?q'qdS )	z-
    Check the MISC column for NER tags.
    ú

Ú
r   ú#ú	ú|TF)ÚstripÚsplitÚ
startswithÚpopÚreÚmatch)r   r   ÚsentÚlinesÚlineÚpartsÚid_ÚwordÚlemmaÚposÚtagÚmorphÚheadÚdepÚ_1ÚmiscÚ	misc_partr   r   r   Úhas_ner0   s    
ÿ

ÿÿ€r;   Ú c           
   
   c   s~    t ƒ }t| |ƒ}|  ¡  d¡D ],}| ¡  d¡}|r<|d  d¡r.| d¡ |d  d¡s"t|||||||d}	|	V  qdS )z!Yield docs, one for each sentencer!   r"   r   r#   )r   r   r   Úset_entsN)r	   r;   r&   r'   r(   r)   Úconllu_sentence_to_doc)
r   r   r   r   r   Úvocabr=   r,   r-   Údocr   r   r   r   B   s*   €

ÿù	€òr   c                 C   sæ   g }| D ]!}|  d¡}|\
}}}}	}
}}}}}d|v sd|v r q| |¡ qg }|D ]D}d}|  d¡D ]5}t ||¡}|rh| d¡}| d¡}|rf|rf|d | }|rf| ||¡}|dkr`d}n|d | } nq3| |¡ q*t|ƒS )	a  Find entities in the MISC column according to the pattern and map to
    final entity type with `ner_map` if mapping present. Entity tag is 'O' if
    the pattern is not matched.

    lines (str): CONLL-U lines for one sentences
    tag_pattern (str): Regex pattern for entity tag
    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
    RETURNS (list): List of BILUO entity tags
    r$   ú-Ú.ÚOr%   é   r   r<   )r'   r   r*   r+   ÚgroupÚgetr   )r-   Útag_patternr   Úmiscsr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   ÚiobÚiob_tagr:   Ú	tag_matchÚprefixÚsuffixr   r   r   Úget_entities]   s4   



õrN   c           $         sÄ  t  d¡st jddd t  d¡st jddd t  d¡s$t jddd t  d¡s0t jddd g g g g g g f\}}}	}
}}g g }}d}d}tt|ƒƒD ]À}|| }| d¡}|\
}}}}}}}}}}d	|v riqMd
|v rod}d
|v rƒd}|}| d
¡\}}d|v} qM|r|r| |¡ n| |¡ |r¤||krž| | ¡ n| d¡ nd|v r®| d¡ n| d¡ |r½||kr½d}d}t|ƒd }|dvrÍt|ƒd n|}|dkrÕ|n|}|dkrÝ|nd}|dkrå|nd}|dkrídn|}| |¡ |
 |¡ |	 |¡ | |¡ | |¡ | |¡ qMt| |||	|
||||d	}!tt|!ƒƒD ]#}|| |!| j	_
|| |!| j	_|| |!| j	_|| |!| j	_q!d}"|rVt|||ƒ}"t|!|"ƒ|!_|r^t||!ƒ}!g g g g g g f\}}}	}}}
g g }}t|!ƒD ]O\}}#| |#j	j
¡ | |#j	j¡ | |#j	j¡ | |#j	j¡ |rª|#j	jrª|	 |#jd |#j	j ¡ n|	 |#j¡ |
 |#j¡ | |#jj¡ | |#j¡ qut| |||	|||
||d	‰ |rà‡ fdd„|!jD ƒˆ _ˆ S )aH  Create an Example from the lines for one CoNLL-U sentence, merging
    subtokens and appending morphology to tags if required.

    lines (str): The non-comment lines for a CoNLL-U sentence
    ner_tag_pattern (str): The regex pattern for matching NER in MISC col
    RETURNS (Example): An example containing the annotation
    Úmerged_orthr<   )ÚdefaultÚmerged_lemmaÚmerged_morphÚmerged_spaceafterFr$   rB   rA   TzSpaceAfter=Nor
   )Ú0r   r   ÚrootÚROOT)ÚwordsÚspacesÚtagsr3   ÚdepsÚlemmasÚmorphsÚheadsNÚ__)rW   rX   rY   r\   r[   r3   rZ   r]   c                    s"   g | ]}t ˆ |j|j|jd ‘qS ))Úlabel)r   ÚstartÚendr_   )Ú.0Úent©Údoc_xr   r   Ú
<listcomp>  s    ÿz*conllu_sentence_to_doc.<locals>.<listcomp>)r   Úhas_extensionÚset_extensionÚranger   r'   r   Úintr   r   rO   rR   rQ   rS   rN   r   ÚentsÚmerge_conllu_subtokensÚ	enumerateÚtag_Úpos_r6   ÚiÚdep_)$r?   r-   r   r   r   r   r=   rW   rX   rY   Úposesr\   r[   r]   rZ   Úsubtok_wordÚ	in_subtokrp   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   Úsubtok_startÚ
subtok_endÚsubtok_spaceafterr@   rk   Útr   rd   r   r>   „   sÌ   












÷

÷
ÿr>   c                 C   sº  g }| D ]·}|  d¡}|\
}}}}}	}
}}}}d|v r»|  d¡\}}|t|ƒd t|ƒ… }| |¡ g }i }g }|D ]<}| |j¡ | |j¡ |jjrw|jj  d¡D ]"}|  dd¡\}}||vrgtƒ ||< |  d¡D ]	}||  |¡ qlqTq;| 	¡ D ]\}}|d d 
t|ƒ¡ ||< q||D ]*}|j|j_d 
|¡|j_d 
|¡|_d 
t| ¡ ƒ¡|j_|d	 jr¶d
nd|j_qq| ¡ }|D ]}| |¡ qÃW d   ƒ |S 1 sÖw   Y  |S )Nr$   rA   r
   r%   ú=ú,ú r   éÿÿÿÿTF)r'   rj   r   rn   Úlemma_r   rR   ÚsetÚaddÚitemsÚjoinÚsortedÚorth_rO   rQ   ÚvaluesÚwhitespace_rS   Ú
retokenizeÚmerge)r-   r@   Úsubtok_spansr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   ru   rv   Úsubtok_spanrY   r\   r[   ÚtokenÚfeatureÚfieldr„   ÚvalueÚretokenizerÚspanr   r   r   rl     sR   


ÿ€
ÿ€
ÿ
ÿürl   )r   FNFF)FFr<   N)N)FFNF)r*   Úwasabir   Útokensr   r   r   Útrainingr   r   r?   r	   Úconll_ner_to_docsr   r    r;   r   rN   r>   rl   r   r   r   r   Ú<module>   s4    
ú%
û
+
ù 