o
     ¢i!  ã                   @   sz   d dl mZ ddlmZ ddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZ 	
ddd„Zddd„Zdd„ Zdd„ ZdS )é    )ÚPrinteré   )ÚErrors)ÚDocÚSpan)Úiob_to_biluo)Úget_lang_classÚ
load_modelé   )Útags_to_entitiesé
   FNc                 +   st   t |d}d}d| v r|r| d¡ d}|| v r"|r"| d¡ d}d| v r7|| vr7|r7t||ƒ t| ||ƒ} d| vrJ|| v rJ|rJt| dd||d	} d| vrp|| vrp|dkrb|sb| d
|› d¡ nt||ƒ t| ||||d	} d| vry| d¡ || vr‚| d¡ |r‰t|ƒ}ntdƒƒ }|  ¡  |¡D ]¢}	|	 ¡ }	|	sžq•g }
g }g }g }|	 d¡D ]_}| ¡ }|s´q«dd„ | d¡D ƒ}t	t
dd„ |D ƒŽ ƒ}t|ƒdk rÔttjƒ‚t|d ƒ}|
 |d ¡ | dgdg|d   ¡ | t|d ƒ¡ | t|ƒdkr|d ndg| ¡ q«t|j|
d‰ tˆ ƒD ]\}}|| |_|| |_qt|ƒ}‡ fdd„|D ƒˆ _ˆ V  q•dS )a  
    Convert files in the CoNLL-2003 NER format and similar
    whitespace-separated columns into Doc objects.

    The first column is the tokens, the final column is the IOB tags. If an
    additional second column is present, the second column is the tags.

    Sentences are separated with whitespace and documents can be separated
    using the line "-DOCSTART- -X- O O".

    Sample format:

    -DOCSTART- -X- O O

    I O
    like O
    London B-GPE
    and O
    New B-GPE
    York I-GPE
    City I-GPE
    . O

    )Úno_printz-DOCSTART- -X- O Oú

zNSentence boundaries found, automatic sentence segmentation with `-s` disabled.FzNDocument delimiters found, automatic document segmentation with `-n` disabled.r   Ú )ÚmodelÚmsgz4No sentence boundaries found to use with option `-n zD`. Use `-s` to automatically segment sentences or `-n 0` to disable.zJNo sentence boundaries found. Use `-s` to automatically segment sentences.zWNo document delimiters found. Use `-n` to automatically group sentences into documents.Úxxc                 S   s   g | ]
}|  ¡ r|  ¡ ‘qS © )Ústrip©Ú.0Úliner   r   ú_/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/converters/conll_ner_to_docs.pyÚ
<listcomp>f   ó    z%conll_ner_to_docs.<locals>.<listcomp>Ú
c                 S   s   g | ]}|  ¡ ‘qS r   )Úsplitr   r   r   r   r   g   s    r
   Té   éÿÿÿÿú-©Úwordsc                    s&   g | ]\}}}t ˆ ||d  |d‘qS )r   )ÚstartÚendÚlabel)r   )r   ÚLÚsÚe©Údocr   r   r   u   s   & N)r   ÚwarnÚn_sents_infoÚsegment_docsÚsegment_sents_and_docsr	   r   r   r   ÚlistÚzipÚlenÚ
ValueErrorr   ÚE903Úextendr   r   ÚvocabÚ	enumerateÚtag_Úis_sent_startr   Úents)Ú
input_dataÚn_sentsÚ	seg_sentsr   r   Úkwargsr   Údoc_delimiterÚnlpÚ	conll_docr!   Úsent_startsÚpos_tagsÚ
biluo_tagsÚ
conll_sentÚlinesÚcolsÚlengthÚiÚtokenÚentitiesr   r(   r   Úconll_ner_to_docs
   s†   €
ÿÿ

ÿ

ÿÿÿ


(
ärJ   c                 C   s  d }|r3t |ƒ}d|jv r3| d|› d¡ |jD ]\}}dt|dg ƒv r-| |ddg¡ q| d¡}|sD| d¡ tdƒƒ }| d¡}|  	¡  
d	¡}	d
d„ |	D ƒ}
t|j|
d}||ƒ g }d}t|ƒD ]$\}}|jr‚|ry|| dkry| |¡ | d¡ |d7 }| |	| ¡ qed	 |¡S )NÚparserz-Segmenting sentences with parser from model 'z'.Úlistening_componentszmodel.tok2veczhSegmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)r   Úsentencizerr   c                 S   s   g | ]
}|  ¡  ¡ d  ‘qS )r   )r   r   r   r   r   r   r   ‹   r   z*segment_sents_and_docs.<locals>.<listcomp>r    r   r   r   )r	   Ú
pipe_namesÚinfoÚpipelineÚgetattrÚreplace_listenersÚget_piper   Úcreate_piper   r   r   r4   r5   r7   ÚappendÚjoin)r)   r:   r=   r   r   rM   r>   ÚnameÚprocrD   r!   ÚnlpdocÚlines_with_segsÚ
sent_countrG   rH   r   r   r   r-   y   s<   
€
ÿ




r-   c                    sZ   d}|   |¡‰‡ ‡fdd„tdtˆƒˆ ƒD ƒ}d} |D ]}| || 7 } | | |¡7 } q| S )Nr   c                    s   g | ]
}ˆ||ˆ  … ‘qS r   r   )r   rG   ©r:   Úsentsr   r   r      r   z segment_docs.<locals>.<listcomp>r   r   )r   Úranger0   rV   )r9   r:   r=   Úsent_delimiterÚdocsr)   r   r\   r   r,   š   s   
 r,   c                 C   s,   |   d|› d¡ |dkr|  d¡ d S d S )NzGrouping every z sentences into a document.r   z^To generate better training data, you may want to group sentences into documents with `-n 10`.)rO   r*   )r   r:   r   r   r   r+   ¥   s   ÿÿr+   )r   FNF)NN)Úwasabir   Úerrorsr   Útokensr   r   Útrainingr   Úutilr   r	   r   r   rJ   r-   r,   r+   r   r   r   r   Ú<module>   s    
ÿ
o!