o
    ॵi}                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZmZmZ d dlmZ d dlZd dlmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2 d d	l3m4Z4m5Z5 d d
l6m7Z7 d dl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZL d dlMmNZN d dlOmPZPmQZQmRZRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\ d dl]m^Z^ d dl_m`Z` d dlambZb d dlcmdZd d dlemfZf d dlgmhZh d dlimjZj d dlkmlZlmmZm d dlnmoZp d dlnmqZqmrZrmsZs d dltmuZu d dlvmwZw d dlxmyZy d dlzm{Z{ d dl|m}Z} d dl~mZ d d lmZ e Zed! Zd"ed#ed$efd%d&Zddd'ddd(d)ed*ee d+ee d,ed-eeeef  d.eee  d$epfd/d0Z	dSd1d'dddd2d)ed3ee d4ed.ed*ee d5ee d-eeeef  d$e
eeresf  fd6d7Zd'dddd8d)ed9eee ef d.ed*ee d5ee d-eeeef  d$eeeresf  fd:d;Z		dTd<d=Z		dTd>ed?ed@eee  d#ee d$ee f
dAdBZ	dSd?ed#ee d$e	eee f fdCdDZd$e<fdEdFZdGedHedIed*edJeeeeef d#ee d$eeeef  fdKdLZd$e<fdMdNZG dOdP dPZejdQdR ZdS )U    N)partial)Path)	DictIterableListMappingOptionalSequenceUnionTupleLiteral)	urlencode)BuilderConfigDatasetDatasetBuilderDatasetDictDownloadConfigDownloadManagerDownloadModeFeaturesIterableDatasetIterableDatasetDictSplitVerificationModeVersionconfig
data_files)	FILES_TO_IGNOREDataFilesDictDataFilesListEmptyDatasetError_get_data_files_patterns"_is_inside_unrequested_special_dir?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dirget_metadata_patternssanitize_patterns)!_prepare_path_and_storage_options	xbasenamexjoin)DataFilesNotFoundErrorDatasetNotFoundError)DatasetInfosDict)ALL_ALLOWED_EXTENSIONSBuilderConfigsParametersCachedDatasetModuleFactoryDatasetModule$HubDatasetModuleFactoryWithoutScript(HubDatasetModuleFactoryWithParquetExport!HubDatasetModuleFactoryWithScript&LocalDatasetModuleFactoryWithoutScript#LocalDatasetModuleFactoryWithScriptPackagedDatasetModuleFactory,create_builder_configs_from_metadata_configsget_dataset_builder_classimport_main_classinfer_module_for_data_filesfiles_to_hash_get_importable_file_pathresolve_trust_remote_code_create_importable_file_load_importable_fileinit_dynamic_modules)camelcase_to_snakecase)_EXTENSION_TO_MODULE_MODULE_SUPPORTS_METADATA_MODULE_TO_EXTENSIONS_PACKAGED_DATASETS_MODULES)
file_utils)OfflineModeIsEnabled!_raise_if_offline_mode_is_enabledcached_pathis_local_pathis_relative_pathrelative_to_absolute_path)is_small_dataset)MetadataConfigs)get_imports)tracked_str)
filesystem)	_un_chain)stringify_path)DatasetCardDatasetCardData)DatasetInfo)HfApiRepoFile
RepoFolder)version)HubApi)get_endpoint)get_from_cache_ms)MS_DATASETS_CACHE)DEFAULT_DATASET_NAMESPACE)
get_logger)authorcardDatacitation	createdAtdisableddescription	downloadsdownloadsAllTimegatedlastModifiedlikespaperswithcode_idprivatesiblingsshatagsurl_or_filenamedownload_configreturnc                 C   s   t |}d }|dr|ddd dd\}}t|r0|p d}d||d}t|}| j| }t||d	}t|}|| |S )
Nzhf://@   /masterSDK)SourceRevisionFilePathrq   )	str
startswithsplitrJ   r   
_base_pathrH   rO   
set_origin)selfrp   rq   revisionparamsout r   `/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py_download_ms[   s   


r   F)r   timeoutfiles_metadatatokenexpandrepo_idr   r   r   r   r   c             	   C   s,  t  }|d\}}	|j|	|d\}
}|pd}|j|
|||d}|d }|d }||d< d|d	< |r9|dd
 nd|d< ||d< d|d< d|d< d|d< d
|d< d
|d< g |d< g |d< d|d< g |d< |D ]'}|d |d |d |d dkrzdnd|d |d d
dd }|d | qgtd!i |S )"a  
    Get info on one specific dataset on huggingface.co.

    Dataset can be private if you pass an acceptable token.

    Args:
        repo_id (`str`):
            A namespace (user or an organization) and a repo name separated
            by a `/`.
        revision (`str`, *optional*):
            The revision of the dataset repository from which to get the
            information.
        timeout (`float`, *optional*):
            Whether to set a timeout for the request to the Hub.
        files_metadata (`bool`, *optional*):
            Whether or not to retrieve metadata for files in the repository
            (size, LFS metadata, etc). Defaults to `False`.
        token (`bool` or `str`, *optional*):
            A valid authentication token (see https://huggingface.co/settings/token).
            If `None` or `True` and machine is logged in (through `huggingface-cli login`
            or [`~huggingface_hub.login`]), token will be retrieved from the cache.
            If `False`, token is not sent in the request header.

    Returns:
        [`hf_api.DatasetInfo`]: The dataset repository information.

    <Tip>

    Raises the following errors:

        - [`~utils.RepositoryNotFoundError`]
          If the repository to download from cannot be found. This may be because it doesn't exist,
          or because it is set to `private` and you do not have access.
        - [`~utils.RevisionNotFoundError`]
          If the revision to download from cannot be found.

    </Tip>
    rv   dataset_name	namespacerw   )dataset_hub_idr   r   r   DataFilesidFrl   r   Nr`   rn   ri   rh   rd   rf   rj   ro   ra   rc   rm   r   IdSizeTypetree	directoryfileSha256)sizesha256pointerSize)	rfilenameblobIdr   typelfsr   )rZ   r   get_dataset_id_and_typeget_dataset_infosappendHfDatasetInfo)r   r   r   r   r   r   r   _api
_namespace_dataset_namer   dataset_typedatadata_ddata_file_listfile_info_d	file_infor   r   r   _dataset_infop   sL   0
r   T)	recursiver   r   	repo_typer   path_in_repor   r   c             	   c   sz   t  }t|r|ddkr|d\}	}
n$t|r3|ddkr3td| dt  t|}	}
ntd| dd}d}g }	 |j|
|	|pId
|pLd d	||d}d|v r\|d dkskt	d| d|d   d S |d d }|
| |D ]7}i }|d dkrdnd|d< |d |d< |d |d< |d |d< |d dkrtdi |ntdi |V  qxt||k rd S |d7 }qB)Nrv   rt   r   zGot a relative path: z+ without namespace, Use default namespace: zInvalid repo_id: z !d   Trw   )r   r   r   	root_pathr   page_number	page_sizeCode   zGet dataset: z file list failed, message: Messager   r   r   r   r   r   r   r   pathr   r   r   oidr   )rZ   rJ   countr   loggerwarningr^   
ValueErrorlist_repo_treeerrorextendrW   rX   len)r   r   r   r   r   r   r   r   r   r   r   r   r   total_data_listr   r   r   	path_infor   r   r   _list_repo_tree   sL   
,r   )r   r   r   r   pathsc                C   s^   t  }|d\}}	|j|	|d\}
}|pd}|j|
|ddd}|d }|d }d	d
 |D S )Nrv   r   rw   FFalse)r   r   r   r   r   r   c              
   S   s8   g | ]}|d  dkrt |d  |d |d ddddqS )Name	README.mdr   rz   N)r   r   r   r   last_commitsecurity)rW   ).0item_dr   r   r   
<listcomp>"  s    z#_get_paths_info.<locals>.<listcomp>)rZ   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _get_paths_info
  s    
r   c                 C   s   t | tttfr| stdtt| d }nt| }|pi }|r%||d< t||p*i }i }tt|D ].\}}|\}}	}
|t	|d krOt
di |
|}q4t
di |
||d< |	|d< ||d< q4|d \}}}t|fi |}|S )	Nzempty urlpath sequencer   protocolrt   target_optionstarget_protocolfor   )
isinstancelisttuplesetr   rR   rQ   	enumeratereversedr   dictrP   )urlpathstorage_optionsr   urlpath0chaininkwargsichurlsnested_protocolkwr   _fsr   r   r   get_fs_token_paths-  s*   

r   pattern	base_pathallowed_extensionsc                    s  t | r
t|| } nt| rtj| d tj }nd}t| |d\} }t| |d}|	dd 	dd p8|j
| 	dd 	dd ttt| h t|jtrW|jn|jd }|dkrd|d ndi }|d	krxtjtd
krxd|d< |j| fddi|}fdd| D }	 dur fdd|	D }
t|
t|	k rtt|	t|
 }td|  d|  n|	}
|
sd|  d} dur|dt  7 }t||
S )a	  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob.

    glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /.
    For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix,  # noqa: E501
    resulting in **.json being equivalent to **/*.json.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicilty mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r|   )r   z::z://ru   r   hfz0.20.0Fexpand_infodetailTc                    sz   g | ]9\}}|d  dkr;t | vrttj|tjsttj|tjs|r7|n| qS )r   r   )r'   r"   osr   relpathr#   r~   )r   filepathinfo)files_to_ignorefs_base_path
fs_patternprotocol_prefixr   r   r     s$    z$_resolve_pattern.<locals>.<listcomp>Nc                    s8   g | ]}t  fd dt|ddd D r|qS )c                 3   s    | ]	}d |  v V  qdS ).Nr   )r   suffixr   r   r   	<genexpr>  s    z._resolve_pattern.<locals>.<listcomp>.<genexpr>r   rt   N)anyr'   r   )r   r   r   r   r   r     s    z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )rJ   r(   rI   r   r   
splitdrivesepr&   r   r   root_markerr   r   r'   r   r   r}   r   HF_HUB_VERSIONrY   parseglobitemsr   r   r   r   FileNotFoundError)r   r   r   rq   r   r   r   glob_kwargstmp_file_pathsmatched_pathsr   invalid_matched_files	error_msgr   )r   r   r   r   r   r   _resolve_patternL  sP   1

r  c                 C   s:   t t| |d}zt|W S  ty   td|  ddw )u
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {"train": ["**"]}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**',
                    'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**',
                    'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**',
                'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**',
                'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    )r   rq   zThe directory at z doesn't contain any data filesN)r   r  r!   r  r    )r   rq   resolverr   r   r   _get_data_patterns  s   [

r  c              
      s  t  }| j}|d\}}| jpd}d| j d| d| jpd d}| j }|jd u r1d|_z|j	d|||dd	}t
||d
}	tt|	j}
W n tyW   t }
Y nw |jdd }t|
}t|
}| jd urtt| j}n-|rdtt| v r|d ur|| d }n
tt| d }t|}nt|| jd}tj||t| jd}t|| j| jd\}}| t!| }|t"v }| jd u r|rz	t#|| jd}W n ty   d }Y nw |d urt$j|| j|d  rt fdd|% D }t&| \}}|rt'|||||| jd\}}nt(|j)dd|i|g}d }|j*||d| jt+t| jj|d}| j }|jd u r@d|_|d u rRt,|dkrRtt|}|}t-||||t.|||ddS )Nrv   rw   zhf://datasets/rs   r   zDownloading readmer   F	file_namer   r   r   extension_filterrp   rq   namer   r|   )r   r   rq   )r   r   rq   )rq   r   c                    s   i | ]	\}}||  qS r   r   )r   r   data_files_listmetadata_data_files_listr   r   
<dictcomp>m  s    z-get_module_without_script.<locals>.<dictcomp>)r   supports_metadatadefault_builder_kwargsrq   r   r   )r   r   r   r   zDownloading metadatart   )metadata_configsbuilder_configsdefault_config_name)dataset_infosbuilder_configs_parametersr   )/rZ   r  r   r   data_dirrstriprq   copydownload_descget_dataset_file_urlrH   rS   loadr   r   r  rT   r   getrM   from_dataset_card_datar+   r   r%   nextitervaluesr  r   from_patternsr,   r9   filter_extensionsrC   rB   r$   r   r  rD   r6   r8   BUILDER_CONFIG_CLASSget_file_base_pathr@   r   r/   r-   )r   _ms_api_repo_idr   r   r   r   rq   rp   dataset_readme_pathdataset_card_datasubset_namer  r  patternssubset_data_filesr   module_namer  r  metadata_patternsmodule_pathr   r  r  builder_kwargshashr   r  r   get_module_without_script  s   











r;  r  r   r   importsc                 C   s  g }g }|  }|jdu rd|_|D ]_\}}	}
}|dkr$||	|
f q|	| kr9td|  d|	 d|	 d|	 d	|d	krNt }|
d
 }|j||||d}n|dkrU|
}ntdt||d}|durjtj	||}||	|f qi }|D ] \}}zt
|}W qv ty   ||vs||kr|||< Y qvw |rt|dkrdnd}t|dkrdnd}d| v rd|d< d| v rd|d< td|  d| dd	| d| dd	|  d|S ) a  
    Download additional module for a module <name>.py at URL (or local path) <base_path>/<name>.py
    The imports must have been parsed first using ``get_imports``.

    If some modules need to be installed with pip, an error is raised showing how to install them.
    This function return the list of downloaded modules as tuples (import_name, module_file_path).

    The downloaded modules can then be moved into an importable directory
    with ``_copy_script_and_other_resources_in_importable_dir``.
    NzDownloading extra moduleslibraryzError in the z script, importing relative z module but z: is the name of the script. Please change relative import zl to another name and add a '# From: URL_OR_PATH' comment pointing to the original relative import file path.internal.py)r  r   r   r   externalzWrong import_typer|   rt   dependencies
dependencythemitsklearnzscikit-learnBio	biopythonzTo be able to use z$, you need to install the following : z, z.
Please install z using 'pip install  z' for instance.)r"  r#  r   r   rZ   r$  rH   r   r   join	importlibimport_moduleImportErrorr   keysr*  )r  r   r   r   r<  rq   local_importslibrary_importsimport_typeimport_nameimport_pathsub_directoryr   r  rp   local_import_pathneeds_to_be_installedlibrary_import_namelibrary_import_pathlib_dependencies_str	_them_strr   r   r   _download_additional_modules  sz   
r\  c              
   C   s  t  }| jdd }| jdd }| d}|j|||| jdd}t|| jd}d }|jd||| jdd}t|| jd}	t|}
t| j||| j|
| jd	}g }|r[|	t
j|f |	re|	t
j|	f | jrk| jnt }t|gd
d |D  }t|d|| jd}tj|st| j| j}|rt||||d|| j| jd n	td| j dt|d|| jd\}}t  |j||d| jd}t|||S )Nrv   ru   r   r?  Fr  r  r   )r  r   r   r   r<  rq   c                 S   s   g | ]}|d  qS )rt   r   )r   locr   r   r   r   2      z*get_module_with_script.<locals>.<listcomp>datasets)dynamic_modules_pathmodule_namespacesubdirectory_namer  )
local_pathrO  additional_filesr`  ra  rb  r  download_modezLoading z requires you to execute the dataset script in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.r  )r   r   )rZ   r  r   r$  r   rH   rq   rN   r\  r   r   DATASETDICT_INFOS_FILENAMEREPOCARD_FILENAMEr`  r?   r:   r;   r   r   existsr<   trust_remote_coder=   re  r   r>   rK  invalidate_cachesr.  r/   )r   r   r   r   script_file_name
script_urllocal_script_pathdataset_infos_pathdataset_readme_urlr1  r<  rO  rd  r`  r:  importable_file_pathri  r8  r9  r   r   r   get_module_with_script  s   

rq  c                /   @   sv  e Zd Ze																					d dedee dee deeeee eeeeee f f f  deeee	f  d	ee d
ee
 dee deeeef  deeeef  dee dedeeeef  deeeef  dedee dee dedee deeeeeef f(ddZe													d!dedee dee deeeee eeeeee f f f  d	ee d
ee
 dee deeeef  deeeef  deeeef  dee dee defddZe										d"dedeeeef  dee deeeef  dee dee deeeeeef  d	ee dee defddZdS )#DatasetsWrapperHFN
deprecatedFr   r  r   r   r   	cache_dirfeaturesrq   re  verification_modekeep_in_memory
save_infosr   r   	streamingnum_procr   ri  dataset_info_onlyrr   c           #      K   s  |dkrt dt |}|
dkr#|
rtjntj}	t d|	j dt |dkr.t dt nd }|d ur>|s>td| dt| t	j
 rJtd|rT|d urTtd	t|pYtj}t|sc|	pbtjntj}	tjd| ||||||||||||d u d
|}|ri }t| tr| drtj| rddlm} || }dd |D }|S |d u st|dstd|  d |S |j}| D ]%\}}t|}t|dr|jd urdd t|j  D ||< qg ||< q|S |r|j!|dS |j"|||	d||d |d ur|nt#|j$j%}|j&||	|d}|d ur6t '  t (dt |)|}W d    n	1 s1w   Y  |r=|*  z't+ }t,| r_| -ddkrb| .d\} }!|j/|!| d W |S W |S W |S  t0y }" zt1d|"  W Y d }"~"|S d }"~"ww ) Nrs  'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'token=<use_auth_token>' instead.z'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.
You can remove this warning by passing 'verification_mode=z
' instead.zF'task' was deprecated in version 2.13.0 and will be removed in 3.0.0.
zEmpty 'data_files': 'z3'. It should be either non-empty or None (default).zjYou are trying to load a dataset that was saved using `save_to_disk`. Please use `load_from_disk` instead.zLoading a streaming dataset in parallel with `num_proc` is not implemented. To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader using `num_workers` > 1 instead.)r   r  r   r   rt  ru  rq   re  r   r   r   ri  _require_default_config_namer?  r   )get_dataset_config_namesc                 S   s   i | ]}|g qS r   r   )r   _subsetr   r   r   r    s    z2DatasetsWrapperHF.load_dataset.<locals>.<dictcomp>r  zNo builder_configs found for z	 dataset.r   c                 S   s   g | ]}t |qS r   )r}   )r   itemr   r   r   r     r^  z2DatasetsWrapperHF.load_dataset.<locals>.<listcomp>)r   F)rq   re  rv  try_from_hf_gcsrz  r   )r   rv  	in_memoryignorerv   rt   r   z&Could not record download statistics: r   )2warningswarnFutureWarningr   	NO_CHECKS
ALL_CHECKSvaluer   r   r   DATASET_STATE_JSON_FILENAMErh  NotImplementedErrorr   REUSE_DATASET_IF_EXISTSBASIC_CHECKSrr  load_dataset_builderr   r}   endswithr   r   r_  r~  hasattrr   r   r  r  r   r   rN  as_streaming_datasetdownload_and_preparerL   r   dataset_size
as_datasetcatch_warningssimplefilterprepare_for_task_save_infosrZ   rJ   r   r   dataset_download_statistics	Exceptionr   )#r   r  r   r   r   rt  ru  rq   re  rv  ignore_verificationsrw  rx  r   r   use_auth_tokentaskry  rz  r   ri  r{  config_kwargsbuilder_instanceret_dictr~  subset_list_tmp_builder_configstmp_config_nametmp_builder_configdsr   r   r   er   r   r   load_dataset^  s   
 


zDatasetsWrapperHF.load_datasetTc                    s  |
dkrt dt |
}	t|ptj}|	d ur#|r| nt }|	|_|d ur6|r-| nt }|j	| t
j ||||||||t||d}|j}|d|}|d|}|d|p_|jj}|dd }|jrp|j|nd } tv r|d u r|jjd jd u rd	  d
} fddtD }|r|d|d  d7 }t|t||d}|d||||||j|||	|d
||}|| |S )Nrs  r|  )
r   rq   re  r   r   rt  ri  r}  _require_custom_configsr  r   r   config_namer   r   z@Please specify the data files or data directory to load for the z dataset builder.c                    s   g | ]
}t |  kr|qS r   )rA   )r   	extensionr   r   r   r   D  s
    z:DatasetsWrapperHF.load_dataset_builder.<locals>.<listcomp>z9
For example `data_files={"train": "path/to/data/train/*.z"}`)r   )
rt  r   r  r   r   r:  r   ru  r   r   r   )r  r  r  r   r  r"  r   r   r   updaterr  dataset_module_factoryboolr9  popr  r  r  r&  rD   r  r   rA   r   r7   r:  !_use_legacy_cache_dir_if_possible)r   r  r   r   rt  ru  rq   re  r   r   r  r   ri  r}  r  dataset_moduler9  r  r   r   r  example_extensionsbuilder_clsr  r   r  r   r    s   



z&DatasetsWrapperHF.load_dataset_builderr`  c                 K   s  | dd }|d u rtd&i |}|jd|i |r#|jd u r#t|_t|p(tj}d|_d|_	|tj
k|_ttdd | tjddd }|dsR|d }tj| |}| tv rht| ||||d S | |rtj| r}t| |||d	 S td
t|  tj|rt||||d	 S tj| rt| |||d S t| r| ddkrzt   zt! j"| ||j#dd}W nf t$y* } zYt%|t&t'j(j)t'j(j*frt*d|  dt+|j, ddt-|v rd|  d}t.|r|d| d |dt-|v r$d|  d}|r|d| d n|}t.|d |d }~ww |dd |j/D v rqd}t0j1rd|rdzt2| ||j3d W W S  t$yc } zt45| W Y d }~nd }~ww t6| |||||d W S t7| |||||d W S  t$y } zUzt8| ||d W W  Y d }~S  t$y   t%|t&rt*d |  d!| d t%|t9t.t:fr|d t%|trtd
t| d"|  d#t+|j, d$| d |d w d }~ww td
t| d%)'Nr  Tc                 S   s   | S Nr   )xr   r   r   <lambda>  s    z:DatasetsWrapperHF.dataset_module_factory.<locals>.<lambda>rv   ru   r?  )r   r   rq   re  )re  r`  ri  z"Couldn't find a dataset script at )r   r   re  rt   g      Y@)r   r   r   r   zCouldn't reach 'z' on the Hub ()404z	Dataset 'z' doesn't exist on the Hubz at revision 'r   401zT. If the repo is private or gated, make sure to log in with `huggingface-cli login`.c                 S   s   g | ]}|j qS r   )r   )r   siblingr   r   r   r     s    z<DatasetsWrapperHF.dataset_module_factory.<locals>.<listcomp>F)rq   r   )r   rq   re  r`  ri  )r   r   r   rq   re  )r`  rt  z1Couldn't reach the Hugging Face Hub for dataset 'z': z8 or any data file in the same directory. Couldn't find 'z"' on the Hugging Face Hub either: rH  z( or any data file in the same directory.r   );r  r   r   r  rt  r]   r   r  extract_compressed_fileforce_extractFORCE_REDOWNLOADforce_downloadr   filterreplacer   r   r   r  r   rJ  rD   r5   
get_moduleisfiler4   r  rK   isdirr3   rJ   r   rG   rV   dataset_infor   r  r   rF   requests
exceptionsConnectTimeoutConnectionErrorr   __name__r}   r*   rm   r   USE_PARQUET_EXPORTr1   rn   r   r   r2   r0   r.   r)   r    )r   r   rq   re  r`  r   r   rt  ri  r}  r  download_kwargsr3  filenamecombined_pathr  r  msg#can_load_config_from_parquet_exporte1r   r   r   r  a  sl  


	

		z(DatasetsWrapperHF.dataset_module_factory)NNNNNNNNNrs  NFNNrs  rs  FNNNF)NNNNNNNNNrs  NNT)
NNNNNNNNTF)r  
__module____qualname__staticmethodr}   r   r
   r	   r   r   r   r   r   r   r  r   intr   r   r   r   r   r   r  r   r  r   r   r/   r  r   r   r   r   rr  \  s(   	

 #	
_	rr  c               	   o   sB   t j}tj}ttdrtjntj}tj	}tj
}tj}tj}tj}	tj}
t t _tt_ttdr4tt_ntt_tt_	tt_
tt_tt_tt_tt_z2tj| i |}|V  W |t _|t_ttdre|t_n|t_|t_	|t_
|t_|t_|	t_|
t_d S |t _|t_ttdr|t_n|t_|t_	|t_
|t_|t_|	t_|
t_w )N	_download)r   HF_ENDPOINTrE   get_from_cacher  r   r  _download_singlerV   r  r   get_paths_infor   resolve_patternr0   r  r2   r[   r\   r   r   r   r   r  r;  rq  rr  r  )argskwargshf_endpoint_originget_from_cache_origin_download_origindataset_info_originlist_repo_tree_originget_paths_info_originresolve_pattern_origin get_module_without_script_originget_module_with_script_origindataset_resr   r   r   load_dataset_with_ctx3  s`   



r  r  )NN)rK  
contextlibr   r  	functoolsr   pathlibr   typingr   r   r   r   r   r	   r
   r   r   urllib.parser   r  r_  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   datasets.data_filesr   r   r   r    r!   r"   r#   r$   r%   ,datasets.download.streaming_download_managerr&   r'   r(   datasets.exceptionsr)   r*   datasets.infor+   datasets.loadr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   datasets.namingr@   datasets.packaged_modulesrA   rB   rC   rD   datasets.utilsrE   datasets.utils.file_utilsrF   rG   rH   rI   rJ   rK   datasets.utils.info_utilsrL   datasets.utils.metadatarM   datasets.utils.py_utilsrN   datasets.utils.trackrO   fsspecrP   fsspec.corerQ   fsspec.utilsrR   huggingface_hubrS   rT   huggingface_hub.hf_apirU   r   rV   rW   rX   	packagingrY   
modelscoperZ   modelscope.hub.utils.utilsr[   )modelscope.msdatasets.utils.hf_file_utilsr\   modelscope.utils.config_dsr]   modelscope.utils.constantr^   modelscope.utils.loggerr_   r   ExpandDatasetProperty_Tr}   r   floatr  r   r   r   r   r  r  r;  r\  rq  rr  contextmanagerr  r   r   r   r   <module>   s4  ,D,X 
	
d	

>	
%
"

j
e 
Od   Z