o
    .i                     @   s   d dl Z d dlmZ d dlmZmZ d dlZd dlm	Z
 d dlmZ d dlZd dlmZ ejjeZeG dd dejZG dd dejZdS )	    N)	dataclass)OptionalUnion)
table_castc                       s   e Zd ZU dZdZee ed< dZee	e
  ed< dZeej ed< dZeeeje	e e	e	e  f  ed<  fddZ  ZS )	ParquetConfigzBuilderConfig for Parquet.N
batch_sizecolumnsfeaturesfiltersc                    s   t    d S N)super__post_init__self	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/datasets/packaged_modules/parquet/parquet.pyr      s   zParquetConfig.__post_init__)__name__
__module____qualname____doc__r   r   int__annotations__r   liststrr	   datasetsFeaturesr
   r   ds
Expressiontupler   __classcell__r   r   r   r   r      s   
 (r   c                   @   s>   e Zd ZeZdd Zdd ZdejdejfddZ	d	d
 Z
dS )Parquetc                 C   s\   | j jd ur&| j jd ur&t| j jt| j jkr&td| j j d| j j tj| j jdS )NzIThe columns and features argument must contain the same columns, but got z and )r	   )configr   r	   set
ValueErrorr   DatasetInfor   r   r   r   _info    s   zParquet._infoc              
      s*  j jstdj j d j_ j j}g }| D ]P\}}t|tr*|g} fdd|D }j	j
du rbtj|D ]"}t|d}tjt|j	_
W d   n1 s\w   Y   |tj|d|id qj jdurtj jtj	j
krtfd	d
j	j
 D j	_
|S )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=Tc                    s   g | ]}  |qS r   )
iter_files).0file)
dl_managerr   r   
<listcomp>7   s    z-Parquet._split_generators.<locals>.<listcomp>Nrbfiles)name
gen_kwargsc                    s"   i | ]\}}| j jv r||qS r   )r#   r   )r)   colfeatr   r   r   
<dictcomp>A   s   " z-Parquet._split_generators.<locals>.<dictcomp>)r#   
data_filesr%   download_configextract_on_the_flydownload_and_extractitems
isinstancer   infor	   	itertoolschainfrom_iterableopenr   r   from_arrow_schemapqread_schemaappendSplitGeneratorr   r$   )r   r+   r4   splits
split_namer.   r*   fr   )r+   r   r   _split_generators,   s*   
$zParquet._split_generatorspa_tablereturnc                 C   s    | j jd urt|| j jj}|S r   )r:   r	   r   arrow_schema)r   rH   r   r   r   _cast_tableE   s   zParquet._cast_tablec                 c   s~   | j jd ur.| j jd ur.tdd | jjjD t| j jkr.td| j j d| jj dt| j jt	r<t
| j jn| j j}ttj|D ]t\}}t|dc}t |}|jr| j jpe|jd j}z)t|j|| j j|dddD ]\}}	tj|	g}
| d	| | |
fV  quW n ty } ztd
| dt| d|   d }~ww W d    n1 sw   Y  qHd S )Nc                 s   s    | ]}|j V  qd S r   )r/   )r)   fieldr   r   r   	<genexpr>N   s    z+Parquet._generate_tables.<locals>.<genexpr>z)Tried to load parquet data with columns 'z' with mismatching features ''r-   r   )r   r   filterbatch_readaheadfragment_readahead_zFailed to read file 'z' with error z: )r#   r	   r   sortedr:   rJ   r%   r9   r
   r   r@   filters_to_expression	enumerater;   r<   r=   r>   r   ParquetFileFormatmake_fragment
row_groupsr   num_rows
to_batchespaTablefrom_batchesrK   loggererrortype)r   r.   filter_exprfile_idxr*   rF   parquet_fragmentr   	batch_idxrecord_batchrH   er   r   r   _generate_tablesL   sJ   $	 zParquet._generate_tablesN)r   r   r   r   BUILDER_CONFIG_CLASSr'   rG   r[   r\   rK   rg   r   r   r   r   r"      s    r"   )r;   dataclassesr   typingr   r   pyarrowr[   pyarrow.datasetdatasetr   pyarrow.parquetparquetr@   r   datasets.tabler   utilslogging
get_loggerr   r^   BuilderConfigr   ArrowBasedBuilderr"   r   r   r   r   <module>   s    