o
    .iB                     @   s  d Z ddlZddlmZmZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ eeZG dd deZ					d"dedeeeeef  dee	 deeeef  deeeef  deeeef  fddZ 					d"dedeeeef  dee	 deeeef  dee deeeeef  fddZ!					d"dedeeeef  dee	 deeeef  dee deeeeef  dee fddZ"						d#dedee deeeee eeeeee f f f  dee	 deeeef  deeeef  deeeef  defddZ#						d#dedee deeeee eeeeee f f f  dee	 deeeef  deeeef  deeeef  fd d!Z$dS )$zList and inspect datasets.    N)MappingSequence)OptionalUnion   )DownloadConfig)DownloadMode)StreamingDownloadManager)DatasetInfo)dataset_module_factoryget_dataset_builder_classload_dataset_builder)
get_logger)Versionc                   @   s   e Zd ZdS )SplitsNotFoundErrorN)__name__
__module____qualname__ r   r   D/home/ubuntu/.local/lib/python3.10/site-packages/datasets/inspect.pyr   &   s    r   path
data_filesdownload_configdownload_moderevisiontokenc                    s2   t d} fdd|D S )a  Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
    {'default': DatasetInfo(description="Movie Review Dataset.
This is a dataset of containing 5,331 positive and 5,331 negative processed
sentences from Rotten Tomatoes movie reviews...), ...}
    ```
    )r   r   r   r   r   r   c                    s,   i | ]}|t d|d  qS ))r   config_namer   r   r   r   r   r   )get_dataset_config_info).0r   config_kwargsr   r   r   r   r   r   r   r   
<dictcomp>^   s    z%get_dataset_infos.<locals>.<dictcomp>)get_dataset_config_names)r   r   r   r   r   r   r    config_namesr   r   r   get_dataset_infos*   s   ,r$   dynamic_modules_pathc           	      K   sT   t | f|||||d|}t|tj| d}t|j p)|j	d|j
p'dgS )a		  Get the list of available config names for a particular dataset.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`.
            By default the datasets are stored inside the `datasets_modules` module.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Example:

    ```py
    >>> from datasets import get_dataset_config_names
    >>> get_dataset_config_names("nyu-mll/glue")
    ['cola',
     'sst2',
     'mrpc',
     'qqp',
     'stsb',
     'mnli',
     'mnli_mismatched',
     'mnli_matched',
     'qnli',
     'rte',
     'wnli',
     'ax']
    ```
    r   r   r   r%   r   dataset_namer   default)r   r   osr   basenamelistbuilder_configskeysbuilder_kwargsgetDEFAULT_CONFIG_NAME)	r   r   r   r   r%   r   download_kwargsdataset_modulebuilder_clsr   r   r   r"   m   s   8	r"   returnc                 K   sj   t | f|||||d|}t|tj| d}t|j }	|	r.t|	dkr+|	d nd}
nd}
|j	p4|
S )a	  Get the default config name for a particular dataset.
    Can return None only if the dataset has multiple configurations and no default configuration.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`.
            By default the datasets are stored inside the `datasets_modules` module.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    r&   r'   r   r   Nr)   )
r   r   r*   r   r+   r,   r-   r.   lenr1   )r   r   r   r   r%   r   r2   r3   r4   r-   default_config_namer   r   r   get_dataset_default_config_name   s"   1	
r8   r   c              
      s   t  f||||||d|}|j}	|	jdu rX|r| nt }|dur'||_|t|j|d z fdd|	t|j|dD |	_W |	S  t
yW }
 ztd|
d}
~
ww |	S )a  Get the meta information (DatasetInfo) about a dataset for a particular config

    Args:
        path (``str``): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'``
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. ``'rajpurkar/squad'``, ``'nyu-mll/glue'`` or ``'openai/webtext'``
        config_name (:obj:`str`, optional): Defining the name of the dataset configuration.
        data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s).
        download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters.
        download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
        revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If True, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied.

    )namer   r   r   r   r   N)	base_pathr   c                    s   i | ]
}|j |j  d qS ))r9   r(   )r9   )r   split_generatorr   r   r   r!   )  s    z+get_dataset_config_info.<locals>.<dictcomp>z<The split names could not be parsed from the dataset config.)r   infosplitscopyr   r   _check_manual_downloadr	   r:   _split_generators	Exceptionr   )r   r   r   r   r   r   r   r    builderr=   errr   r<   r   r      s@   




r   c           	   	   K   s,   t | f||||||d|}t|j S )a  Get the list of available splits for a particular config and dataset.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    )r   r   r   r   r   r   )r   r,   r>   r.   )	r   r   r   r   r   r   r   r    r=   r   r   r   get_dataset_split_names4  s   -
rE   )NNNNN)NNNNNN)%__doc__r*   collections.abcr   r   typingr   r   download.download_configr   download.download_managerr   #download.streaming_download_managerr	   r=   r
   loadr   r   r   utils.loggingr   utils.versionr   r   logger
ValueErrorr   strdictr,   boolr$   r"   r8   r   rE   r   r   r   r   <module>   s   
E
I
E(	
?(