o
    .i{                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddlm!Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ee,e-e-f e,e- e,d f Z.e-ej/Z0e 1e2Z3G dd de-Z4G dd de5Z6dZ7ej/ddgej8g dej9g diZ:dZ;ej<e=dk rd d!gZ>g d"Z?nej<e=d#k rd$d!gZ>g d%Z?nd&d'gZ>g d(Z?ej/ej8ej9gZ@d)d* e@D ZAd+d* e@D ZBej/d,giZCe7gZDeBeAeCgZEd-ZFg d.ZGd/e-d0eHfd1d2ZId3eeJeKe-f d0eJe-eeKe- d4f f fd5d6ZLd7e-d/e-d0eHfd8d9ZMd7e-d/e-d0eHfd:d;ZNd<e
e-geKe- f d0eJe-eKe- f fd=d>ZO		dTd/e-d?e-d@eeKe-  dAee d0eKe- f
dBdCZPdUd?e-dAee d0eJe-eKe- f fdDdEZQ	dUdFe-dAee d0e.fdGdHZR		dTdIeKe- dAee dJeeS d0eKe. fdKdLZTG dMd4 d4eKe- ZUG dNdO dOeJe-eUf ZVG dPdQ dQeKe- ZWG dRdS dSeJe-eWf ZXdS )V    N)partial)	has_magic)PathPurePath)CallableOptionalUnion)	url_to_fs)HTTPFileSystem)HfFileSystem)version)
thread_map   )config)DownloadConfig)	_split_re)Split)logging)tqdm)!_prepare_path_and_storage_optionsis_local_pathis_relative_path	xbasenamexjoin)glob_pattern_to_regexstring_to_dict c                   @      e Zd ZdS )UrlN__name__
__module____qualname__r   r   r   G/home/ubuntu/.local/lib/python3.10/site-packages/datasets/data_files.pyr   "       r   c                   @   r   )EmptyDatasetErrorNr   r   r   r   r#   r%   &   r$   r%   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z{keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**c                 C       i | ]}|d d t | D qS )c                 S   $   g | ]}t D ]	}|j|td qqS )keywordsep)"KEYWORDS_IN_FILENAME_BASE_PATTERNSformatNON_WORDS_CHARS.0r3   patternr   r   r#   
<listcomp>M       <dictcomp>.<listcomp>SPLIT_KEYWORDSr9   splitr   r   r#   
<dictcomp>L       rB   c                 C   r0   )c                 S   r1   r2   )"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSr6   r7   r8   r   r   r#   r;   U   r<   r=   r>   r@   r   r   r#   rB   T   rC   z**z*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonr:   returnc                    s   t  fddtD S )Nc                 3   s    | ]}| v V  qd S Nr   )r9   wilcard_characterr:   r   r#   	<genexpr>t   s    z%contains_wildcards.<locals>.<genexpr>)anyWILDCARD_CHARACTERSrH   r   rH   r#   contains_wildcardss   s   rL   patternsDataFilesListc                 C   s   t | trdd |  D S t | trt| giS t | trntdd | D rj| D ]"}t |trCt|dkrCd|v rCt |dttfsJt	d| q(d	d
 | D }tt
|t|krct	d| dd | D S t| iS tt| S )a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c                 S   s*   i | ]\}}t |t|tr|n|gqS r   str
isinstancelist)r9   keyvaluer   r   r#   rB      s   * z%sanitize_patterns.<locals>.<dictcomp>c                 s   s    | ]}t |tV  qd S rF   )rQ   dictr9   r:   r   r   r#   rI          z$sanitize_patterns.<locals>.<genexpr>   rA   pathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got c                 S   s   g | ]}|d  qS rA   r   rV   r   r   r#   r;          z%sanitize_patterns.<locals>.<listcomp>z*Some splits are duplicated in data_files: c                 S   s6   i | ]}t |d  t|d tr|d n|d gqS )rA   rY   rO   rV   r   r   r#   rB      s    ()rQ   rU   itemsrP   SANITIZED_DEFAULT_SPLITrR   rJ   lenget
ValueErrorsetsanitize_patterns)rM   r:   splitsr   r   r#   rb   w   s2   
	


rb   matched_rel_pathc                 C   s<   dd t | jjD }dd t |jjD }t|t|kS )u  
    When a path matches a pattern, we additionnally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    c                 S      g | ]	}| d r|qS __
startswithr9   partr   r   r#   r;          z6_is_inside_unrequested_special_dir.<locals>.<listcomp>c                 S   re   rf   rh   rj   r   r   r#   r;      rl   )r   parentpartsr^   )rd   r:   data_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patternr   r   r#   "_is_inside_unrequested_special_dir   s   rq   c                 C   s8   dd t | jD }dd t |jD }t|t|kS )u:  
    When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    c                 S   (   g | ]}| d rt|d hks|qS .ri   ra   rj   r   r   r#   r;          
zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>c                 S   rr   rs   ru   rj   r   r   r#   r;      rv   )r   rn   r^   )rd   r:   hidden_directories_in_pathhidden_directories_in_patternr   r   r#   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir   s   5ry   pattern_resolverc           	         sz  t D ]qdd}z| |}W n	 ty   Y qw t|dkrst |D ]}tt|tt}|dus8J |d  q%t	dd D rTt
dt d	 d
fddtD tdd tD   }fdd|D   S qtD ]< g }  D ]&\}}|D ]}z| |}W n	 ty   Y qw t|dkr||  nqq~|r fdd|D   S qvtd| d|  )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   NrA   c                 s   s    | ]
}t t| V  qd S rF   )rematchr   r@   r   r   r#   rI     s    z+_get_data_files_patterns.<locals>.<genexpr>zSplit name should match 'z'' but got 'z'.c                    s   g | ]
}| v rt |qS r   rP   r@   )rc   r   r#   r;         z,_get_data_files_patterns.<locals>.<listcomp>c                 S   s   h | ]}t |qS r   r~   r@   r   r   r#   	<setcomp>  r[   z+_get_data_files_patterns.<locals>.<setcomp>c                    s   i | ]
}| j |d gqS )rZ   )r6   r@   )split_patternr   r#   rB     r   z,_get_data_files_patterns.<locals>.<dictcomp>c                    s   i | ]}| | qS r   r   r@   )patterns_dictr   r#   rB   $  s    zCouldn't resolve pattern z with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorr^   ra   r   r   r   addrJ   r`   r   DEFAULT_SPLITSsortedALL_DEFAULT_PATTERNSr\   append)	rz   r:   
data_filespp_partssorted_splitsnon_empty_splitsrA   rM   r   )r   r   rc   r#   _get_data_files_patterns   sL   
r   	base_pathallowed_extensionsdownload_configc                    s~  t | r
t|| } nt| rtj| d tj }nd}t| |d\} }t| fi |\}t	t
t| h t|jtr@|jn|jd }|dkrM|d ndi }|dkratjtdkrad|d	< fd
d|j| fddi| D } dur fdd|D }	t|	t|k rtt	|t	|	 }
td|  d|
  n|}	|	sd|  d} dur|dt  7 }t||	S )a  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicilty mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r   filez://hfz0.20.0Fexpand_infoc                    sp   g | ]4\}}|d  dks| dr6tjtj|r6t| vrt|st|s|r2|n| qS )typer   islink)	r_   osrY   isfilerealpathr   rq   ry   ri   )r9   filepathinfo)files_to_ignore
fs_patternprotocol_prefixr   r#   r;   g  s    *z#resolve_pattern.<locals>.<listcomp>detailTNc                    s8   g | ]}t  fd dt|ddd D r|qS )c                 3   s    | ]	}d |  v V  qdS )rt   Nr   )r9   suffixr   r   r#   rI   s  s    z-resolve_pattern.<locals>.<listcomp>.<genexpr>rt   r   N)rJ   r   rA   )r9   r   r   r   r#   r;   p  s    &z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   r   rY   
splitdriver4   r   r	   ra   FILES_TO_IGNOREr   rQ   protocolrP   r   HF_HUB_VERSIONr   parseglobr\   r^   rR   loggerr   r   )r:   r   r   r   storage_optionsfsr   glob_kwargsmatched_pathsoutinvalid_matched_files	error_msgr   )r   r   r   r   r#   resolve_pattern(  sB   0
r   c                 C   s:   t t| |d}zt|W S  ty   td|  ddw )uA
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    )r   r   zThe directory at z doesn't contain any data filesN)r   r   r   r   r%   )r   r   resolverr   r   r#   get_data_patterns  s   T
r   	data_filec           	      C   s   t | |d\} }t| fi |^}}t|tr"|| }|j|jfS t|trR| t	j
rRtt	j
|jd}d| tt	j
d d  ddd } || }|j|jfS || }dD ]}||v rht|| f  S qYdS )	Nr   )endpointtokenzhf://r   z	/resolve/@)ETagetagmtimer   )r   r	   rQ   r   resolve_pathrepo_idrevisionr
   ri   r   HF_ENDPOINTr   r^   r   r   rP   )	r   r   r   r   _resolved_pathhffsr   rS   r   r   r#   _get_single_origin_metadata  s    

$

r   r   max_workersc                 C   s:   |d ur|nt j}ttt|d| |tdt| dkpd dS )Nr   zResolving data files   )r   
tqdm_classdescdisable)r   &HF_DATASETS_MULTITHREADING_MAX_WORKERSr   r   r   hf_tqdmr^   )r   r   r   r   r   r#   _get_origin_metadata  s   
r   c                       s0  e Zd ZdZdee dee ddf fddZdd	d
Ze				ddee de
jjdee deee  dee dd fddZe				ddee dee deee  dee dd f
ddZe				ddee dee deee  dee dd f
ddZddddeee  deee  dd fddZ  ZS )rN   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    r   origin_metadatarE   Nc                       t  | || _d S rF   )super__init__r   )selfr   r   	__class__r   r#   r     s   
zDataFilesList.__init__otherc                 C      t g | || j|j S rF   )rN   r   r   r   r   r   r#   __add__     zDataFilesList.__add__rM   dataset_infor   r   r   c                 C   s6   d|j  d|j d|pd d}| j||||dS )Nzhf://datasets/r   /r   r   r   r   )idsharstripfrom_patterns)clsrM   r   r   r   r   r   r   r#   from_hf_repo  s   $	zDataFilesList.from_hf_repoc                 C   s,   |d ur|nt    }| j||||dS Nr   )r   resolveas_posixr   )r   rM   r   r   r   r   r   r#   from_local_or_remote-  s   z"DataFilesList.from_local_or_remotec              	   C   st   |d ur|nt    }g }|D ]}z|t||||d W q ty.   t|s, Y qw t||d}| ||S Nr   r   )r   r   r   extendr   r   r   r   )r   rM   r   r   r   r   r:   r   r   r   r#   r   :  s&   
zDataFilesList.from_patterns
extensions
file_namesr   r   c                   s   g  |rd dd |D } td| d |r2d dd |D } td| d  rBt fd	d
| D | jdS tt| | jdS )N|c                 s       | ]}t |V  qd S rF   r|   escape)r9   extr   r   r#   rI   Y  rW   z'DataFilesList.filter.<locals>.<genexpr>z.*(z	)(\..+)?$c                 s   r   rF   r   )r9   fnr   r   r#   rI   \  rW   z.*[\/]?(z)$c                    s&   g | ] t  fd dD r qS )c                 3   s    | ]}|  V  qd S rF   )r}   rV   r   r   r#   rI   `  rW   z2DataFilesList.filter.<locals>.<listcomp>.<genexpr>)rJ   )r9   rM   r   r#   r;   `  s   & z(DataFilesList.filter.<locals>.<listcomp>)r   )joinr   r|   compilerN   r   rR   )r   r   r   ext_pattern
fn_patternr   r   r#   filterT  s   zDataFilesList.filter)r   rN   rE   rN   NNN)r    r!   r"   __doc__rR   rP   SingleOriginMetadatar   r   classmethodhuggingface_hubhf_apiDatasetInfor   r   r   r   r   r   __classcell__r   r   r   r#   rN     sv    "





c                   @   s0  e Zd ZdZe			ddeeeee e	f f de
e de
ee  de
e dd f
dd	Ze			ddeeeee e	f f d
ejjde
e de
ee  de
e dd fddZe			ddeeeee e	f f de
e de
ee  de
e dd f
ddZdddde
ee  de
ee  dd fddZdS )DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    NrM   r   r   r   rE   c                 C   @   |  }|  D ]\}}t|tr|ntj||||d||< q|S r   )r\   rQ   rN   r   r   rM   r   r   r   r   rS   patterns_for_keyr   r   r#   r   w     
z"DataFilesDict.from_local_or_remoter   c           	      C   sB   |  }|  D ]\}}t|tr|n	tj|||||d||< q|S )N)r   r   r   r   )r\   rQ   rN   r   )	r   rM   r   r   r   r   r   rS   r  r   r   r#   r     s   	zDataFilesDict.from_hf_repoc                 C   r  r   )r\   rQ   rN   r   r  r   r   r#   r     r  zDataFilesDict.from_patternsr   r   r   c                C   s2   t |  }|  D ]\}}|j||d||< q	|S )Nr   )r   r\   r   )r   r   r   r   rS   data_files_listr   r   r#   r     s   
zDataFilesDict.filterr   )r    r!   r"   r   r   rU   rP   r   rR   rN   r   r   r   r   r   r   r   r   r   r   r   r   r#   r  g  sr    




r  c                       s   e Zd ZdZdee deeee   f fddZdd Ze		ddee deee  d	d fd
dZ
	ddedee d	dfddZdee d	d fddZ  ZS )DataFilesPatternsListz
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None ot keep all the files for the pattern.
    rM   r   c                    r   rF   )r   r   r   )r   rM   r   r   r   r#   r     s   
zDataFilesPatternsList.__init__c                 C   r   rF   )rN   r   r   r   r   r#   r     r   zDataFilesPatternsList.__add__NrE   c                 C   s   | ||gt | S rF   )r^   )r   rM   r   r   r   r#   r     s   z#DataFilesPatternsList.from_patternsr   r   rN   c              	   C   s   |d ur|nt    }g }t| | jD ]\}}z|t||||d W q ty4   t|s2 Y qw t	||d}t
||S r   )r   r   r   zipr   r   r   r   r   r   rN   )r   r   r   r   r:   r   r   r   r   r#   r     s&   
zDataFilesPatternsList.resolver   c                    s   t |  fdd| jD S )Nc                    s   g | ]}|  qS r   r   )r9   r   r   r   r#   r;     r[   z;DataFilesPatternsList.filter_extensions.<locals>.<listcomp>)r  r   )r   r   r   r
  r#   filter_extensions  s   z'DataFilesPatternsList.filter_extensionsrF   )r    r!   r"   r   rR   rP   r   r   r   r   r   r   r   r  r  r   r   r   r#   r    s4    

r  c                   @   sv   e Zd ZdZe	ddeeee f deee  dd fddZ		dded	ee
 dd
fddZdee dd fddZdS )DataFilesPatternsDictz[
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    NrM   r   rE   c                 C   s<   |  }|  D ]\}}t|tr|ntj||d||< q|S )Nr   )r\   rQ   r  r   )r   rM   r   r   rS   r  r   r   r#   r     s   z#DataFilesPatternsDict.from_patternsr   r   r  c                 C   s,   t  }|  D ]\}}|||||< q|S rF   )r  r\   r   )r   r   r   r   rS   data_files_patterns_listr   r   r#   r     s   zDataFilesPatternsDict.resolver   c                 C   s.   t |  }|  D ]\}}||||< q	|S rF   )r   r\   r  )r   r   r   rS   r  r   r   r#   r    s   
z'DataFilesPatternsDict.filter_extensionsrF   )r    r!   r"   r   r   rU   rP   rR   r   r   r   r   r  r   r   r   r#   r    s(    


r  )NNrF   )Yr   r|   	functoolsr   r   r   pathlibr   r   typingr   r   r   r   fsspec.corer	   fsspec.implementations.httpr
   r   	packagingr   tqdm.contrib.concurrentr   r   r   downloadr   namingr   rc   r   utilsr   r   r   utils.file_utilsr   r   r   r   r   utils.py_utilsr   r   tuplerP   r   TRAINr]   
get_loggerr    r   r   r   r%   SPLIT_PATTERN_SHARDED
VALIDATIONTESTr?   r7   FSSPEC_VERSIONr   r5   rD   r   "DEFAULT_PATTERNS_SPLIT_IN_FILENAME"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAMEDEFAULT_PATTERNS_ALLr   r   rK   r   boolrL   rU   rR   rb   rq   ry   r   r   r   r   intr   rN   r  r  r  r   r   r   r#   <module>   s    







0&!,>/

(\]

a]5