o
    i.                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZddlmZ ddlmZmZ ddlmZmZ ddlmZ d	d
lmZ d	dlmZ erTddlmZ dZejd	 	 	d-de
e de de!de!de
e dedgee f fddZ"ejd	 	 	 d.de
ee#ef  de!de!de!dedgee f f
ddZ$ejddddede fd d!Z%ejd"	 	 d/de
e de!de!dedgee f fd#d$Z&dee#ef de	e fd%d&Z'G d'd( d(Z(G d)d* d*Z)G d+d, d,Z*dS )0    N)Path)TYPE_CHECKINGCallableIterableIteratorListOptionalUnion   )util)ErrorsWarnings)DocDocBin)Vocab   )dont_augment)Example)Languagez.spacyzspacy.Corpus.v1pathgold_preproc
max_lengthlimit	augmenterreturnr   c                 C   s2   | d u r	t tjtjd|  t| ||||dS )NzLoading corpus from path: %s)r   r   r   r   )
ValueErrorr   E913r   loggerdebugCorpus)r   r   r   r   r    r    I/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/corpus.pycreate_docbin_reader   s   
r"   zspacy.JsonlCorpus.v1
min_lengthc                 C   s   t | |||dS )N)r#   r   r   )JsonlCorpus)r   r#   r   r   r    r    r!   create_jsonl_reader*   s   r%   zspacy.read_labels.v1F)requirer&   c                C   s   |s|   sd S t| S N)existssrsly	read_json)r   r&   r    r    r!   read_labels4   s   
r+   zspacy.PlainTextCorpus.v1c                 C   s    | du r	t tjt| ||dS )  Iterate Example objects from a file or directory of plain text
    UTF-8 files with one line per doc.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.
    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#plaintextcorpus
    Nr#   r   )r   r   r   PlainTextCorpus)r   r#   r   r    r    r!   create_plain_text_reader=   s   
r/   c                 C   s   t | } |  s| jd |r| gS | }| g}g }t }|D ]5} t| |v r)q |t|  | jr<| jd dr<q |  rH|	| 
  q | jd |rU||  q t|dkrgttjj||d |  |S )N.r   )r   format)r   ensure_pathis_dirpartsendswithsetstradd
startswithextenditerdirappendlenwarningswarnr   W090r2   sort)r   	file_type	orig_pathpathslocsseenr    r    r!   walk_corpusS   s,   

rH   c                   @   s   e Zd ZdZdddddddeeef deded	ed
e	e
 deddfddZdddee fddZdddededefddZdddee dee fddZdddee dee fddZdedeeeef  dee fddZdS )r   a6  Iterate Example objects from a file or directory of DocBin (.spacy)
    formatted data files.

    path (Path): The directory or filename to read from.
    gold_preproc (bool): Whether to set up the Example object with gold-standard
        sentences and tokens for the predictions. Gold preprocessing helps
        the annotations align to the tokenization, and may result in sequences
        of more consistent length. However, it may reduce run-time accuracy due
        to train/test skew. Defaults to False.
    max_length (int): Maximum document length. Longer documents will be
        split into sentences, if sentence boundaries are available. Defaults to
        0, which indicates no limit.
    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
        Defaults to 0, which indicates no limit.
    augment (Callable[Example, Iterable[Example]]): Optional data augmentation
        function, to extrapolate additional examples from your annotations.
    shuffle (bool): Whether to shuffle the examples.

    DOCS: https://spacy.io/api/corpus
    r   FN)r   r   r   r   shuffler   r   r   r   r   rI   r   c                C   s:   t || _|| _|| _|| _|d ur|nt| _|| _d S r'   )	r   r3   r   r   r   r   r   r   rI   )selfr   r   r   r   r   rI   r    r    r!   __init__   s   

zCorpus.__init__nlpr   c                 c   sv    |  |jt| jt}| jrt|}t| | jr"| 	||}n| 
||}|D ]}| ||D ]}|V  q2q*dS )zYield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The examples.

        DOCS: https://spacy.io/api/corpus#call
        N)read_docbinvocabrH   r   	FILE_TYPErI   listrandomr   make_examples_gold_preprocmake_examplesr   )rJ   rL   ref_docsexamplesreal_egaugmented_egr    r    r!   __call__   s   
zCorpus.__call__	referencec                 C   sF   |s|j rtt|jdd |D dd |D d|S t||j|S )Nc                 S      g | ]}|j qS r    text.0wordr    r    r!   
<listcomp>       z(Corpus._make_example.<locals>.<listcomp>c                 S      g | ]}t |jqS r    boolwhitespace_r]   r    r    r!   r`          wordsspaces)has_unknown_spacesr   r   rN   make_docr\   )rJ   rL   rY   r   r    r    r!   _make_example   s   
	zCorpus._make_examplereference_docsc                 c   s    |D ]F}t |dkrq| jdkst || jk r!| ||dV  q|drI|jD ]}t |dkr2q)| jdks>t || jk rH| || dV  q)qd S )Nr   F
SENT_START)r>   r   rl   has_annotationsentsas_doc)rJ   rL   rm   rY   ref_sentr    r    r!   rS      s   

zCorpus.make_examplesc                 c   sZ    |D ]'}| drdd |jD }n|g}|D ]}| ||d}t|jr)|V  qqd S )Nrn   c                 S   s   g | ]}|  qS r    )rq   )r^   sentr    r    r!   r`      s    z5Corpus.make_examples_gold_preproc.<locals>.<listcomp>T)ro   rp   rl   r>   x)rJ   rL   rm   rY   	ref_sentsrr   egr    r    r!   rR      s   

z!Corpus.make_examples_gold_preprocrN   rF   c                 c   s|    d}|D ]6}t |}|jd tr;t |}||}|D ]}t|r:|V  |d7 }| j	dkr:|| j	kr: nq!qdS )z(Yield training examples as example dictsr   r0   r   N)
r   r3   r5   r6   rO   r   	from_diskget_docsr>   r   )rJ   rN   rF   ilocdoc_bindocsdocr    r    r!   rM      s   

zCorpus.read_docbin)__name__
__module____qualname____doc__r	   r8   r   intrd   r   r   rK   r   r   rX   r   rl   r   rS   rR   r   rM   r    r    r    r!   r   l   sj    
	



r   c                   @   s^   e Zd ZdZdZdddddeeeef  de	de	de	d	d
f
ddZ
ddd	ee fddZd
S )r$   ac  Iterate Example objects from a file or directory of jsonl
    formatted raw text files.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.

    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.
    limit (int): Limit corpus to a subset of examples, e.g. for debugging.
        Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#jsonlcorpus
    jsonlr   )r   r#   r   r   r   r#   r   r   Nc                C   s"   t || _|| _|| _|| _d S r'   )r   r3   r   r#   r   r   )rJ   r   r   r#   r   r    r    r!   rK      s   
zJsonlCorpus.__init__rL   r   c              	   c   s    t | jdD ]G}t|}|D ]=}||d }| jdkr&t|| jk r&q| jdkr3t|| jkr3qdd |D }dd |D }t|t	|j
||dV  qqdS )	zYield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

        DOCS: https://spacy.io/api/corpus#jsonlcorpus-call
        z.jsonlr\   r   c                 S   rZ   r    r[   r^   wr    r    r!   r`     ra   z(JsonlCorpus.__call__.<locals>.<listcomp>c                 S   rb   r    rc   r   r    r    r!   r`     rf   rg   N)rH   r   r)   
read_jsonlrk   r#   r>   r   r   r   rN   )rJ   rL   rz   recordsrecordr}   rh   ri   r    r    r!   rX     s   
zJsonlCorpus.__call__r~   r   r   r   rC   r   r	   r8   r   r   rK   r   r   rX   r    r    r    r!   r$      s$    
r$   c                	   @   sX   e Zd ZdZdZddddeeeef  de	de	dd	fd
dZ
dddee fddZd	S )r.   r,   txtr   r-   r   r#   r   r   Nc                C   s   t || _|| _|| _d S r'   )r   r3   r   r#   r   )rJ   r   r#   r   r    r    r!   rK   +  s   
zPlainTextCorpus.__init__rL   r   c              	   c   s    t | jdD ]M}t|dd=}|D ]2}|d}t|rD||}| jdkr/t|| jk r/q| jdkr<t|| jkr<qt||	 V  qW d   n1 sOw   Y  qdS )zYield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

        DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
        z.txtzutf-8)encodingz
r   N)
rH   r   openrstripr>   rk   r#   r   r   copy)rJ   rL   rz   fr\   r}   r    r    r!   rX   6  s"   

zPlainTextCorpus.__call__r   r    r    r    r!   r.     s    
r.   )r   r   N)r   r   r   )r   r   )+rQ   r?   pathlibr   typingr   r   r   r   r   r   r	   r)    r   errorsr   r   tokensr   r   rN   r   augmentr   exampler   languager   rO   registryreadersrd   r   r"   r8   r%   r+   r/   rH   r   r$   r.   r    r    r    r!   <module>   s    $


	
y7