o
    .i                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	Z	d dl
mZ d dlmZ e	jjeZeG dd de	jZG d	d
 d
e	jZdS )    N)	dataclass)StringIO)Optionalrequire_storage_cast)
table_castc                   @   sd   e Zd ZU dZdZeej ed< dZ	e
ed< dZee
 ed< dZeed< d	Zeed
< dZe
ed< dS )
TextConfigzBuilderConfig for text files.Nfeatureszutf-8encodingencoding_errorsi   	chunksizeFkeep_linebreaksline	sample_by)__name__
__module____qualname____doc__r	   r   datasetsFeatures__annotations__r
   strr   r   intr   boolr    r   r   W/home/ubuntu/.local/lib/python3.10/site-packages/datasets/packaged_modules/text/text.pyr      s   
 r   c                   @   s>   e Zd ZeZdd Zdd ZdejdejfddZ	d	d
 Z
dS )Textc                 C   s   t j| jjdS )N)r	   )r   DatasetInfoconfigr	   )selfr   r   r   _info   s   z
Text._infoc                    s   | j jstd| j j d j_ | j j}g }| D ]!\}}t|tr*|g} fdd|D }|	t
j|d|id q|S )a  The `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].

        If str or List[str], then the dataset returns only the 'train' split.
        If dict, then keys should be from the `datasets.Split` enum.
        z=At least one data file must be specified, but got data_files=Tc                    s   g | ]}  |qS r   )
iter_files).0file
dl_managerr   r   
<listcomp>0       z*Text._split_generators.<locals>.<listcomp>files)name
gen_kwargs)r   
data_files
ValueErrordownload_configextract_on_the_flydownload_and_extractitems
isinstancer   appendr   SplitGenerator)r   r%   r+   splits
split_namer(   r   r$   r   _split_generators"   s   
zText._split_generatorspa_tablereturnc                 C   sd   | j jd ur&| j jj}tdd | j j D r||}|S t||}|S |tdt	 iS )Nc                 s   s    | ]}t | V  qd S )Nr   )r"   featurer   r   r   	<genexpr>7   s    z#Text._cast_table.<locals>.<genexpr>text)
r   r	   arrow_schemaallvaluescastr   paschemastring)r   r7   rA   r   r   r   _cast_table4   s   


zText._cast_tablec              	   c   s
   | j jd urt| j jndg}ttj|D ]\}}t|| j j| j j	d}| j j
dkrnd}	 || j j}|s;n2|| 7 }t| }| j jsRdd |D }tjjt|g|d}||f| |fV  |d	7 }q1n| j j
d
krd}d}	 || j j}	|	sn7||	7 }|| 7 }|d}tjjtdd |d d D g|d}||f| |fV  |d	7 }|d }qy|rtjjt|gg|d}||f| |fV  n| j j
dkr| }
tjjt|
gg|d}|| |fV  W d    n1 sw   Y  qd S )Nr;   )r
   errorsr   r   Tc                 S   s   g | ]}| d qS )
)rstrip)r"   r   r   r   r   r&   P   r'   z)Text._generate_tables.<locals>.<listcomp>)names   	paragraph z

c                 S   s   g | ]}|r|qS r   r   )r"   exampler   r   r   r&   b   s    document)r   r	   list	enumerate	itertoolschainfrom_iterableopenr
   r   r   readr   readliner   	readlinesr   r@   Tablefrom_arraysarrayrC   split)r   r(   pa_table_namesfile_idxr#   f	batch_idxbatchr7   	new_batchr;   r   r   r   _generate_tablesA   s`   
zText._generate_tablesN)r   r   r   r   BUILDER_CONFIG_CLASSr    r6   r@   rW   rC   ra   r   r   r   r   r      s    r   )rP   dataclassesr   ior   typingr   pyarrowr@   r   datasets.features.featuresr   datasets.tabler   utilslogging
get_loggerr   loggerBuilderConfigr   ArrowBasedBuilderr   r   r   r   r   <module>   s    