o
    8wi=                     @   s  d Z ddlZddlmZmZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ eeZG dd deZ					d!dedeeeeef  dee	 deeeef  deeeef  deeeef  fddZ 				d"dedeeeef  dee	 deeeef  deeeeef  f
ddZ!				d"dedeeeef  dee	 deeeef  deeeeef  dee fddZ"						d#dedee deeeee eeeeee f f f  dee	 deeeef  deeeef  deeeef  defddZ#						d#dedee deeeee eeeeee f f f  dee	 deeeef  deeeef  deeeef  fdd Z$dS )$zList and inspect datasets.    N)MappingSequence)OptionalUnion   )DownloadConfig)DownloadMode)StreamingDownloadManager)DatasetInfo)dataset_module_factoryget_dataset_builder_classload_dataset_builder)
get_logger)Versionc                   @   s   e Zd ZdS )SplitsNotFoundErrorN)__name__
__module____qualname__ r   r   M/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/inspect.pyr   &   s    r   path
data_filesdownload_configdownload_moderevisiontokenc                    s2   t d} fdd|D S )a6  Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
    {'default': DatasetInfo(description="Movie Review Dataset.
This is a dataset of containing 5,331 positive and 5,331 negative processed
sentences from Rotten Tomatoes movie reviews...), ...}
    ```
    )r   r   r   r   r   r   c                    s,   i | ]}|t d|d  qS ))r   config_namer   r   r   r   r   r   )get_dataset_config_info).0r   config_kwargsr   r   r   r   r   r   r   r   
<dictcomp>^   s    z%get_dataset_infos.<locals>.<dictcomp>)get_dataset_config_names)r   r   r   r   r   r   r    config_namesr   r   r   get_dataset_infos*   s   ,r$   c                 K   sR   t | f||||d|}t|tj| d}t|j p(|j	d|j
p&dgS )aG  Get the list of available config names for a particular dataset.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Example:

    ```py
    >>> from datasets import get_dataset_config_names
    >>> get_dataset_config_names("nyu-mll/glue")
    ['cola',
     'sst2',
     'mrpc',
     'qqp',
     'stsb',
     'mnli',
     'mnli_mismatched',
     'mnli_matched',
     'qnli',
     'rte',
     'wnli',
     'ax']
    ```
    r   r   r   r   dataset_namer   default)r   r   osr   basenamelistbuilder_configskeysbuilder_kwargsgetDEFAULT_CONFIG_NAME)r   r   r   r   r   download_kwargsdataset_modulebuilder_clsr   r   r   r"   m   s   4r"   returnc           
      K   sh   t | f||||d|}t|tj| d}t|j }|r-t|dkr*|d nd}	nd}	|j	p3|	S )aW  Get the default config name for a particular dataset.
    Can return None only if the dataset has multiple configurations and no default configuration.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    r%   r&   r   r   Nr(   )
r   r   r)   r   r*   r+   r,   r-   lenr0   )
r   r   r   r   r   r1   r2   r3   r,   default_config_namer   r   r   get_dataset_default_config_name   s    -
r7   r   c              
      s   t  f||||||d|}|j}	|	jdu rX|r| nt }|dur'||_|t|j|d z fdd|	t|j|dD |	_W |	S  t
yW }
 ztd|
d}
~
ww |	S )a  Get the meta information (DatasetInfo) about a dataset for a particular config

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
        config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
        download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If True, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.

    )namer   r   r   r   r   N)	base_pathr   c                    s   i | ]
}|j |j  d qS ))r8   r'   )r8   )r   split_generatorr   r   r   r!     s    z+get_dataset_config_info.<locals>.<dictcomp>z<The split names could not be parsed from the dataset config.)r   infosplitscopyr   r   _check_manual_downloadr	   r9   _split_generators	Exceptionr   )r   r   r   r   r   r   r   r    builderr<   errr   r;   r   r      s@   




r   c           	   	   K   s,   t | f||||||d|}t|j S )ae  Get the list of available splits for a particular config and dataset.

    Args:
        path (`str`): path to the dataset repository. Can be either:

            - a local path to the dataset directory containing the data files,
                e.g. `'./dataset/squad'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    )r   r   r   r   r   r   )r   r+   r=   r-   )	r   r   r   r   r   r   r   r    r<   r   r   r   get_dataset_split_names*  s   -
rD   )NNNNN)NNNN)NNNNNN)%__doc__r)   collections.abcr   r   typingr   r   download.download_configr   download.download_managerr   #download.streaming_download_managerr	   r<   r
   loadr   r   r   utils.loggingr   utils.versionr   r   logger
ValueErrorr   strdictr+   boolr$   r"   r7   r   rD   r   r   r   r   <module>   s   
E
D
@(	
?(