o
    bi                  
   @   sF  U d dl Z d dlmZmZmZmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZmZ er=d dlZdZee ed< z?d d	lmZ d
e jvre rd dlZd dlZd dlZejej ! dZ"ej#$d
e"Z%ej#&e%Z'e'e je%j(< e%j)*e' W n ey Z+ ze+ZW Y dZ+[+ndZ+[+ww G dd deZ,dS )    N)TYPE_CHECKINGIterableListOptionalUnion)pyarrow_table_from_pydict)_check_pyarrow_version)BlockBlockAccessorBlockMetadata)Dataset)
DatasourceReadTaskTRANSFORMERS_IMPORT_ERROR)is_datasets_availabledatasets_modulesz__init__.pyc                   @   s~   e Zd ZdZ	dded defddZeded defd	d
Z	de
e fddZdee fddZdedee fddZdS )HuggingFaceDatasourceah  Hugging Face Dataset datasource, for reading from a
    `Hugging Face Datasets Dataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset/>`_.
    This Datasource implements a streamed read using a
    single read task, most beneficial for a
    `Hugging Face Datasets IterableDataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.IterableDataset/>`_
    or datasets which are too large to fit in-memory.
    For an in-memory Hugging Face Dataset (`datasets.Dataset`), use :meth:`~ray.data.from_huggingface`
    directly for faster performance.
       dataset)zdatasets.Datasetzdatasets.IterableDataset
batch_sizec                 C   s   t d urt || _|| _d S N)r   _dataset_batch_size)selfr   r    r   h/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/_internal/datasource/huggingface_datasource.py__init__<   s   
zHuggingFaceDatasource.__init__returnc                 C   s   ddl }|jj}|jj}t|j}t||js;ddl m} z||||d}|j	|j	kr.g W S W n t
y:   g  Y S w ddl}d| d| d| }	||	}
|
j|jd kr[|
 S g S )	zReturn list of Hugging Face hosted parquet file URLs if they
        exist for the data (i.e. if the dataset is a public dataset that
        has not been transformed) else return an empty list.r   N)load_dataset)splitz$https://huggingface.co/api/datasets/z	/parquet//ok)datasetsinfodataset_nameconfig_namestrr   
isinstanceIterableDatasetr   _fingerprint	Exceptionrequestsgetstatus_codecodesjson)clsr   r"   r$   r%   
split_namer   dsr+   
public_urlrespr   r   r   list_parquet_urls_from_datasetG   s2   

z4HuggingFaceDatasource.list_parquet_urls_from_datasetc                 C   s   | j jS r   )r   dataset_size)r   r   r   r   estimate_inmemory_data_sizev   s   z1HuggingFaceDatasource.estimate_inmemory_data_sizec                 c   s    dd l }dd l}dd l}| jdj| jdD ]5}t||j|j	t
|jfs0tdt| dt||jr:d|i}t|t
rCt|}t| }|V  qd S )Nr   arrow)r   zBatch format z isn't supported. Only the following batch formats are supported: dict (corresponds to `None` in `dataset.with_format()`), pyarrow.Table, np.array, pd.DataFrame.item)numpypandaspyarrowr   with_formatiterr   r'   Table	DataFramedictarray
ValueErrortypendarrayr   r
   	for_block
to_default)r   nppdr<   batchblockr   r   r   _read_datasety   s$   

z#HuggingFaceDatasource._read_datasetparallelismc                 C   s(   t   td d d d d}t| j|g}|S )N)num_rows
size_bytesinput_files
exec_stats)r   r   r   rL   )r   rM   meta
read_tasksr   r   r   get_read_tasks   s   z$HuggingFaceDatasource.get_read_tasksN)r   )__name__
__module____qualname____doc__r   intr   classmethodr   r5   r   r7   r   r	   rL   r   r   rT   r   r   r   r   r   1   s*    
.$r   )-systypingr   r   r   r   r   $ray.air.util.tensor_extensions.arrowr   ray.data._internal.utilr   ray.data.blockr	   r
   r   ray.data.datasetr   ray.data.datasourcer   r   r"   r   ImportError__annotations__transformers.utilsr   modules	importlibosdatasets.loadpathjoinloadinit_dynamic_modulesdynamic_modules_pathutilspec_from_file_locationspecmodule_from_specr   nameloaderexec_moduleer   r   r   r   r   <module>   s>   
 
