o
     ¢i4	  ã                   @   sr   d dl mZ ddlmZ ddlmZmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ ddd„Zdd„ ZdS )é    )ÚPrinteré   )ÚErrors)ÚDocÚSpan)Úiob_to_biluoÚtags_to_entities)Ú	minibatch)ÚVocabé   )Ún_sents_infoé
   Fc                 o   s@    t ƒ }t|d}|dkrt||ƒ t|  d¡||ƒE dH  dS )a   
    Convert IOB files with one sentence per line and tags separated with '|'
    into Doc objects so they can be saved. IOB and IOB2 are accepted.

    Sample formats:

    I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
    I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
    )Úno_printr   Ú
N)r
   r   r   Úread_iobÚsplit)Ú
input_dataÚn_sentsr   ÚargsÚkwargsÚvocabÚmsg© r   úY/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/converters/iob_to_docs.pyÚiob_to_docs   s   €

r   c                 #   sd   t | |dD ]¨}g }g }g }g }g }|D ]b}	|	 ¡ sqdd„ |	 ¡ D ƒ}
t|
d ƒdkr5t|
Ž \}}}nt|
d ƒdkrKt|
Ž \}}dgt|ƒ }nttjƒ‚| |¡ | |¡ | |¡ | |
¡ | 	d¡ | d	d„ |d
d … D ƒ¡ qt
||d‰ t|ƒD ]	\}}|ˆ | _q‚t|ƒD ]	\}}|ˆ | _qt|ƒ}t|ƒ}‡ fdd„|D ƒˆ _ˆ V  qd S )N)Úsizec                 S   s   g | ]}|  d ¡‘qS )ú|)r   )Ú.0Útr   r   r   Ú
<listcomp>(   s    zread_iob.<locals>.<listcomp>r   r   é   ú-Tc                 S   s   g | ]}d ‘qS )Fr   )r   Ú_r   r   r   r   5   s    r   )Úwordsc                    s&   g | ]\}}}t ˆ ||d  |d‘qS )r   )ÚstartÚendÚlabel)r   )r   ÚLÚsÚe©Údocr   r   r   =   s   & )r	   Ústripr   ÚlenÚzipÚ
ValueErrorr   ÚE902ÚextendÚappendr   Ú	enumerateÚtag_Úis_sent_startr   r   Úents)Ú	raw_sentsr   r   ÚgroupÚtokensr#   ÚtagsÚiobÚsent_startsÚlineÚsent_tokensÚ
sent_wordsÚ	sent_tagsÚsent_iobÚiÚtagÚ
sent_startÚbiluoÚentitiesr   r*   r   r      sB   €





ár   N)r   F)Úwasabir   Úerrorsr   r9   r   r   Útrainingr   r   Úutilr	   r   r
   Úconll_ner_to_docsr   r   r   r   r   r   r   Ú<module>   s    
