o
    i$                     @   s  U d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZ d
dlmZmZmZmZ eeeeeedZ ee!e	de
e f f e"d< dZ#dZ$G dd de!eZ%e&deddddedddddeddddded
dd d!ded"d#d$d%dedd&d'd(d)ded"d*d+d,ded"d-d.d/dee#d0d1d2e'e (  dedd3d4d5ddedd6d7d8dedd9d:d;dfd<e!d=ed>e%d?e)d@e*dAee! dBe*dCe*dDe!dEee dFee! dGe*fdHdIZ+dJd
d"dd"d"ddd"dddKd<ed=ee!ef d>e!d?e)d@e*dAee! dBe*dCe*dDe!dEee dFee! dGe*dLe*dMee dNdfdOdZ,dPedQe!dNdfdRdSZ-dPedTedQe!dNdfdUdVZ.dWe!dNee! fdXdYZ/dMed<ed=ee!ef d>e!dDe!dEee fdZd[Z0d<efd\d]Z1dS )^    N)Enum)Path)AnyCallableIterableMappingOptionalUnion)Printer   )DocDocBin)docs_to_json)conll_ner_to_docsconllu_to_docsiob_to_docsjson_to_docs   )ArgOptappwalk_directory)	conllubioconlluconllneriobjson.
CONVERTERSauto)r   c                   @   s   e Zd ZdZdZdS )	FileTypesr   spacyN)__name__
__module____qualname__r   r!    r%   r%   E/home/ubuntu/.local/lib/python3.10/site-packages/spacy/cli/convert.pyr    *   s    r    convertzInput file or directoryT)helpexists-z!Output directory. '-' for stdout.)r(   
allow_dashr)   r!   z--file-typez-tzType of data to produce)r(   z	--n-sentsz-nz*Number of sentences per doc (0 to disable)Fz--seg-sentsz-szSegment sentences (for -c ner)z--modelz--basez-bzQTrained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)z--morphologyz-mz#Enable appending morphology to tagsz--merge-subtokensz-TzMerge CoNLL-U subtokensz--converterz-czConverter: z	--ner-mapz-nmz6NER tag mapping (as JSON-encoded dict of entity types)z--langz-lz Language (if tokenizer required)z--concatenatez-Cz#Concatenate output to a single file
input_path
output_dir	file_typen_sents	seg_sentsmodel
morphologymerge_subtokens	converterner_maplangconcatenatec                 C   sx   t | } |t dkrdn|}|dk}t|d}t||| }t|| ||j||	 t| ||j|||||||	|
|||d dS )a  
    Convert files into json or DocBin format for training. The resulting .spacy
    file can be used with the train command and other experiment management
    functions.

    If no output_dir is specified and the output format is JSON, the data
    is written to stdout, so you can pipe them forward to a JSON file:
    $ spacy convert some_file.conllu --file-type json > some_file.json

    DOCS: https://spacy.io/api/cli#convert
    r*   no_print)r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   silentmsgN)r   r
   _get_converterverify_cli_argsvaluer'   )r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r:   r;   r%   r%   r&   convert_cli/   s,   

r?   r   )r.   r/   r0   r1   r2   r3   r5   r6   r7   r:   r;   r:   r;   returnc                C   s  t | } |st|d}|	d urt|	nd }	g }t| |D ]5}|jddd}| }W d    n1 s5w   Y  t| }|||||||
|||	d	}|||f q|ret	j
dd |D }| |fg}|D ]b\}}|dkryt|g}t|}nt|d	d
}t|}| }|dkrt|| qg|| kr|| }t ||d|  }nt ||jd  }|d| }t||| |d| d|  qgd S )Nr8   rzutf-8encoding)r/   r0   append_morphologyr3   r6   r1   r9   r5   c                 S   s   g | ]\}}|qS r%   r%   ).0_docsr%   r%   r&   
<listcomp>   s    zconvert.<locals>.<listcomp>r   T)rG   store_user_datar*   .zGenerated output file (z documents): )r   r
   srsly	read_jsonr   openreadr   append	itertoolschainfrom_iterabler   lenr   to_bytes_print_docs_to_stdoutrelative_towith_suffixparts_write_docs_to_filegood)r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r:   r;   	doc_files	input_locinfile
input_datafuncrG   all_docsdatalen_docsdbsubpathoutput_filer%   r%   r&   r'   c   sT   





rb   output_typec                 C   s*   |dkrt d|  d S tjj|  d S )Nr   r*   )rL   
write_jsonsysstdoutbufferwrite)rb   rg   r%   r%   r&   rV      s   rV   rf   c                 C   sj   |j  s|j jdd |dkrt||  d S |d}||  W d    d S 1 s.w   Y  d S )NT)parentsr   wb)parentr)   mkdirrL   rh   rN   rl   )rb   rf   rg   file_r%   r%   r&   rZ      s   
"rZ   r_   c                 C   s   |  dd d }ddd}td}td}|D ] }| }||r-|d  d7  < ||r:|d	  d7  < q|d dkrI|d	 dkrId	S |d	 dkrW|d dkrWdS d S )
N
   r   )r   r   z\S+\|(O|[IB]-\S+)z\S+\s+(O|[IB]-\S+)$r   r   r   )splitrecompilestripsearch)r_   linesformat_guessesiob_rener_reliner%   r%   r&   autodetect_ner_format   s    




r~   c                 C   s   |t vr|dkr| jd| ddd | s| jd|dd |dkr1t| s1| jd|dd |d urCt| sC| jd|dd | rZt||}t|d	krZ| jd
|dd |tvrj| jd| dd d S d S )Nr*   zCan't write .z4 data to stdout. Please specify an output directory.r   exitszInput file not foundzOutput directory not foundzNER map not foundr   zNo input files in directoryzCan't find converter for )FILE_TYPES_STDOUTfailr)   r   is_dirr   rT   r   )r;   r,   r-   r.   r4   r5   
input_locsr%   r%   r&   r=      s$   

r=   c           	      C   s  |  r9|tkr1t|d d}ttdd |D }t|dkr,d|}| jd|dd |d	 }nt||dd	 }|tkrD|jdd  }|d
ksL|dkr|j	dd}|
 }W d    n1 saw   Y  t|}|d
krw| d |}|S |dkr| d |}|S | d |S )Nsuffixc                 S   s   g | ]	}|j d d qS )r   Nr   )rE   locr%   r%   r&   rH      s    z"_get_converter.<locals>.<listcomp>r   ,z!All input files must be same typer   r   r   r   r   utf8rB   z'Auto-detected token-per-line NER formatz*Auto-detected sentence-per-line NER formatzgCan't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert)r   AUTOr   listsetrT   joinr   r   rN   rO   r~   infowarn)	r;   r4   r,   r   
file_typesfile_types_strrq   r_   converter_autodetectr%   r%   r&   r<      s8   





r<   )2rQ   ru   ri   enumr   pathlibr   typingr   r   r   r   r   r	   rL   wasabir
   tokensr   r   trainingr   training.convertersr   r   r   r   _utilr   r   r   r   r   str__annotations__r   r   r    commandtuplekeysintboolr?   r'   rV   rZ   r~   r=   r<   r%   r%   r%   r&   <module>   s   
  "		
7
	

?


