o
    8wi%{                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ  d
dl!m"Z"m#Z#m$Z$m%Z%m&Z& d
dl'm(Z( ee)e*e*f e)e* e)d f Z+e*ej,Z-e.e/Z0G dd de*Z1G dd de2Z3dZ4ej,ddgej5g dej6g diZ7dZ8ej9e:dk rdd gZ;g d!Z<nej9e:d"k rd#d gZ;g d$Z<nd%d&gZ;g d'Z<ej,ej5ej6gZ=d(d) e=D Z>d*d) e=D Z?ej,d+giZ@e4gZAe?e>e@gZBd,ZCg d-ZDd.e*d/eEfd0d1ZFd2eeGeHe*f d/eGe*eeHe* d3f f fd4d5ZId6e*d.e*d/eEfd7d8ZJd6e*d.e*d/eEfd9d:ZKd;e
e*geHe* f d/eGe*eHe* f fd<d=ZL		dSd.e*d>e*d?eeHe*  d@ee d/eHe* f
dAdBZMdTd>e*d@ee d/eGe*eHe* f fdCdDZN	dTdEe*d@ee d/e+fdFdGZO		dSdHeHe* d@ee dIeeP d/eHe+ fdJdKZQG dLd3 d3eHe* ZRG dMdN dNeGe*eRf ZSG dOdP dPeHe* ZTG dQdR dReGe*eTf ZUdS )U    N)partial)	has_magic)PathPurePath)CallableOptionalUnion)	url_to_fs)HfFileSystem)version)
thread_map   )config)DownloadConfig)	_split_re)Split)logging)tqdm)!_prepare_path_and_storage_optionsis_local_pathis_relative_path	xbasenamexjoin)string_to_dict c                   @      e Zd ZdS )UrlN__name__
__module____qualname__r   r   r   P/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/data_files.pyr   !       r   c                   @   r   )EmptyDatasetErrorNr   r   r   r   r!   r#   %   r"   r#   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z{keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**c                 C       i | ]}|d d t | D qS )c                 S   $   g | ]}t D ]	}|j|td qqS )keywordsep)"KEYWORDS_IN_FILENAME_BASE_PATTERNSformatNON_WORDS_CHARS.0r1   patternr   r   r!   
<listcomp>L       <dictcomp>.<listcomp>SPLIT_KEYWORDSr7   splitr   r   r!   
<dictcomp>K       r@   c                 C   r.   )c                 S   r/   r0   )"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSr4   r5   r6   r   r   r!   r9   T   r:   r;   r<   r>   r   r   r!   r@   S   rA   z**z*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonr8   returnc                    s   t  fddtD S )Nc                 3   s    | ]}| v V  qd S Nr   )r7   wildcard_characterr8   r   r!   	<genexpr>s   s    z%contains_wildcards.<locals>.<genexpr>)anyWILDCARD_CHARACTERSrF   r   rF   r!   contains_wildcardsr   s   rJ   patternsDataFilesListc                 C   s   t | trdd |  D S t | trt| giS t | trntdd | D rj| D ]"}t |trCt|dkrCd|v rCt |dttfsJt	d| q(d	d
 | D }tt
|t|krct	d| dd | D S t| iS tt| S )a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c                 S   s*   i | ]\}}t |t|tr|n|gqS r   str
isinstancelist)r7   keyvaluer   r   r!   r@      s   * z%sanitize_patterns.<locals>.<dictcomp>c                 s   s    | ]}t |tV  qd S rD   )rO   dictr7   r8   r   r   r!   rG          z$sanitize_patterns.<locals>.<genexpr>   r?   pathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got c                 S   s   g | ]}|d  qS r?   r   rT   r   r   r!   r9          z%sanitize_patterns.<locals>.<listcomp>z*Some splits are duplicated in data_files: c                 S   s6   i | ]}t |d  t|d tr|d n|d gqS )r?   rW   rM   rT   r   r   r!   r@      s    ()rO   rS   itemsrN   SANITIZED_DEFAULT_SPLITrP   rH   lenget
ValueErrorsetsanitize_patterns)rK   r8   splitsr   r   r!   r`   v   s2   
	


r`   matched_rel_pathc                 C   s<   dd t | jjD }dd t |jjD }t|t|kS )u  
    When a path matches a pattern, we additionally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    c                 S      g | ]	}| d r|qS __
startswithr7   partr   r   r!   r9          z6_is_inside_unrequested_special_dir.<locals>.<listcomp>c                 S   rc   rd   rf   rh   r   r   r!   r9      rj   )r   parentpartsr\   )rb   r8   data_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patternr   r   r!   "_is_inside_unrequested_special_dir   s   ro   c                 C   s8   dd t | jD }dd t |jD }t|t|kS )u9  
    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    c                 S   (   g | ]}| d rt|d hks|qS .rg   r_   rh   r   r   r!   r9          
zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>c                 S   rp   rq   rs   rh   r   r   r!   r9      rt   )r   rl   r\   )rb   r8   hidden_directories_in_pathhidden_directories_in_patternr   r   r!   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir   s   5rw   pattern_resolverc           	         sv  t D ]odd}z| |}W n	 ty   Y qw t|dkrqt |D ]}tt|t}|dus6J |d  q%tdd D rRt	dt
 d	 d
fddtD tdd tD   }fdd|D   S qtD ]< g }  D ]&\}}|D ]}z| |}W n	 ty   Y qw t|dkr||  nqq||r fdd|D   S qttd| d|  )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   Nr?   c                 s   s    | ]
}t t| V  qd S rD   )rematchr   r>   r   r   r!   rG     s    z+_get_data_files_patterns.<locals>.<genexpr>zSplit name should match 'z'' but got 'z'.c                    s   g | ]
}| v rt |qS r   rN   r>   )ra   r   r!   r9         z,_get_data_files_patterns.<locals>.<listcomp>c                 S   s   h | ]}t |qS r   r|   r>   r   r   r!   	<setcomp>  rY   z+_get_data_files_patterns.<locals>.<setcomp>c                    s   i | ]
}| j |d gqS )rX   )r4   r>   )split_patternr   r!   r@     r}   z,_get_data_files_patterns.<locals>.<dictcomp>c                    s   i | ]}| | qS r   r   r>   )patterns_dictr   r!   r@   #  s    zCouldn't resolve pattern z with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorr\   r_   r   r   addrH   r^   r   DEFAULT_SPLITSsortedALL_DEFAULT_PATTERNSrZ   append)	rx   r8   
data_filespp_partssorted_splitsnon_empty_splitsr?   rK   r   )r   r   ra   r!   _get_data_files_patterns   sL   
r   	base_pathallowed_extensionsdownload_configc                    s~  t | r
t|| } nt| rtj| d tj }nd}t| |d\} }t| fi |\}t	t
t| h t|jtr@|jn|jd }|dkrM|d ndi }|dkratjtdkrad|d	< fd
d|j| fddi| D } dur fdd|D }	t|	t|k rtt	|t	|	 }
td|  d|
  n|}	|	sd|  d} dur|dt  7 }t||	S )a  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r   filez://hfz0.20.0Fexpand_infoc                    sp   g | ]4\}}|d  dks| dr6tjtj|r6t| vrt|st|s|r2|n| qS )typer   islink)	r]   osrW   isfilerealpathr   ro   rw   rg   )r7   filepathinfo)files_to_ignore
fs_patternprotocol_prefixr   r!   r9   f  s    *z#resolve_pattern.<locals>.<listcomp>detailTNc                    s8   g | ]}t  fd dt|ddd D r|qS )c                 3   s    | ]	}d |  v V  qdS )rr   Nr   )r7   suffixr   r   r!   rG   r  s    z-resolve_pattern.<locals>.<listcomp>.<genexpr>rr   r   N)rH   r   r?   )r7   r   r   r   r!   r9   o  s    &z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   r   rW   
splitdriver2   r   r	   r_   FILES_TO_IGNOREr   rO   protocolrN   r   HF_HUB_VERSIONr   parseglobrZ   r\   rP   loggerr   r   )r8   r   r   r   storage_optionsfsr   glob_kwargsmatched_pathsoutinvalid_matched_files	error_msgr   )r   r   r   r   r!   resolve_pattern'  sB   0
r   c                 C   s:   t t| |d}zt|W S  ty   td|  ddw )uA
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    )r   r   zThe directory at z doesn't contain any data filesN)r   r   r   r   r#   )r   r   resolverr   r   r!   get_data_patterns  s   T
r   	data_filec           	      C   s   t | |d\} }t| fi |^}}t|tr"|| }|j|jfS | tj	rMttj	|j
d}d| ttj	d d  ddd } || }|j|jfS || }dD ]}||v rct|| f  S qTdS )	Nr   )endpointtokenzhf://r   z	/resolve/@)ETagetagmtimer   )r   r	   rO   r
   resolve_pathrepo_idrevisionrg   r   HF_ENDPOINTr   r\   r   r   rN   )	r   r   r   r   _resolved_pathhffsr   rQ   r   r   r!   _get_single_origin_metadata  s    

$

r   r   max_workersc                 C   s:   |d ur|nt j}ttt|d| |tdt| dkpd dS )Nr   zResolving data files   )r   
tqdm_classdescdisable)r   &HF_DATASETS_MULTITHREADING_MAX_WORKERSr   r   r   hf_tqdmr\   )r   r   r   r   r   r!   _get_origin_metadata  s   
r   c                       s0  e Zd ZdZdee dee ddf fddZdd	d
Ze				ddee de
jjdee deee  dee dd fddZe				ddee dee deee  dee dd f
ddZe				ddee dee deee  dee dd f
ddZddddeee  deee  dd fddZ  ZS )rL   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    r   origin_metadatarC   Nc                       t  | || _d S rD   )super__init__r   )selfr   r   	__class__r   r!   r     s   
zDataFilesList.__init__otherc                 C      t g | || j|j S rD   )rL   r   r   r   r   r   r!   __add__     zDataFilesList.__add__rK   dataset_infor   r   r   c                 C   s6   d|j  d|j d|pd d}| j||||dS )Nzhf://datasets/r   /r   r   r   r   )idsharstripfrom_patterns)clsrK   r   r   r   r   r   r   r!   from_hf_repo  s   $	zDataFilesList.from_hf_repoc                 C   s,   |d ur|nt    }| j||||dS Nr   )r   resolveas_posixr   )r   rK   r   r   r   r   r   r!   from_local_or_remote,  s   z"DataFilesList.from_local_or_remotec              	   C   st   |d ur|nt    }g }|D ]}z|t||||d W q ty.   t|s, Y qw t||d}| ||S Nr   r   )r   r   r   extendr   r   r   r   )r   rK   r   r   r   r   r8   r   r   r   r!   r   9  s&   
zDataFilesList.from_patterns
extensions
file_namesr   r   c                   s   g  |rd dd |D } td| d |r2d dd |D } td| d  rBt fd	d
| D | jdS tt| | jdS )N|c                 s       | ]}t |V  qd S rD   rz   escape)r7   extr   r   r!   rG   X  rU   z'DataFilesList.filter.<locals>.<genexpr>z.*(z	)(\..+)?$c                 s   r   rD   r   )r7   fnr   r   r!   rG   [  rU   z.*[\/]?(z)$c                    s&   g | ] t  fd dD r qS )c                 3   s    | ]}|  V  qd S rD   )r{   rT   r   r   r!   rG   _  rU   z2DataFilesList.filter.<locals>.<listcomp>.<genexpr>)rH   )r7   rK   r   r!   r9   _  s   & z(DataFilesList.filter.<locals>.<listcomp>)r   )joinr   rz   compilerL   r   rP   )r   r   r   ext_pattern
fn_patternr   r   r!   filterS  s   zDataFilesList.filter)r   rL   rC   rL   NNN)r   r   r    __doc__rP   rN   SingleOriginMetadatar   r   classmethodhuggingface_hubhf_apiDatasetInfor   r   r   r   r   r   __classcell__r   r   r   r!   rL     sv    "





c                   @   s0  e Zd ZdZe			ddeeeee e	f f de
e de
ee  de
e dd f
dd	Ze			ddeeeee e	f f d
ejjde
e de
ee  de
e dd fddZe			ddeeeee e	f f de
e de
ee  de
e dd f
ddZdddde
ee  de
ee  dd fddZdS )DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    NrK   r   r   r   rC   c                 C   @   |  }|  D ]\}}t|tr|ntj||||d||< q|S r   )rZ   rO   rL   r   r   rK   r   r   r   r   rQ   patterns_for_keyr   r   r!   r   v     
z"DataFilesDict.from_local_or_remoter   c           	      C   sB   |  }|  D ]\}}t|tr|n	tj|||||d||< q|S )N)r   r   r   r   )rZ   rO   rL   r   )	r   rK   r   r   r   r   r   rQ   r  r   r   r!   r     s   	zDataFilesDict.from_hf_repoc                 C   r  r   )rZ   rO   rL   r   r  r   r   r!   r     r  zDataFilesDict.from_patternsr   r   r   c                C   s2   t |  }|  D ]\}}|j||d||< q	|S )Nr   )r   rZ   r   )r   r   r   r   rQ   data_files_listr   r   r!   r     s   
zDataFilesDict.filterr   )r   r   r    r   r   rS   rN   r   rP   rL   r   r   r   r   r   r   r   r   r   r   r   r   r!   r   f  sr    




r   c                       s   e Zd ZdZdee deeee   f fddZdd Ze		ddee deee  d	d fd
dZ
	ddedee d	dfddZdee d	d fddZ  ZS )DataFilesPatternsListz
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None ot keep all the files for the pattern.
    rK   r   c                    r   rD   )r   r   r   )r   rK   r   r   r   r!   r     s   
zDataFilesPatternsList.__init__c                 C   r   rD   )rL   r   r   r   r   r!   r     r   zDataFilesPatternsList.__add__NrC   c                 C   s   | ||gt | S rD   )r\   )r   rK   r   r   r   r!   r     s   z#DataFilesPatternsList.from_patternsr   r   rL   c              	   C   s   |d ur|nt    }g }t| | jD ]\}}z|t||||d W q ty4   t|s2 Y qw t	||d}t
||S r   )r   r   r   zipr   r   r   r   r   r   rL   )r   r   r   r   r8   r   r   r   r   r!   r     s&   
zDataFilesPatternsList.resolver   c                    s   t |  fdd| jD S )Nc                    s   g | ]}|  qS r   r   )r7   r   r   r   r!   r9     rY   z;DataFilesPatternsList.filter_extensions.<locals>.<listcomp>)r  r   )r   r   r   r  r!   filter_extensions  s   z'DataFilesPatternsList.filter_extensionsrD   )r   r   r    r   rP   rN   r   r   r   r   r   r   r   r	  r   r   r   r   r!   r    s4    

r  c                   @   sv   e Zd ZdZe	ddeeee f deee  dd fddZ		dded	ee
 dd
fddZdee dd fddZdS )DataFilesPatternsDictz[
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    NrK   r   rC   c                 C   s<   |  }|  D ]\}}t|tr|ntj||d||< q|S )Nr   )rZ   rO   r  r   )r   rK   r   r   rQ   r  r   r   r!   r     s   z#DataFilesPatternsDict.from_patternsr   r   r   c                 C   s,   t  }|  D ]\}}|||||< q|S rD   )r   rZ   r   )r   r   r   r   rQ   data_files_patterns_listr   r   r!   r     s   zDataFilesPatternsDict.resolver   c                 C   s.   t |  }|  D ]\}}||||< q	|S rD   )r   rZ   r	  )r   r   r   rQ   r  r   r   r!   r	    s   
z'DataFilesPatternsDict.filter_extensionsrD   )r   r   r    r   r   rS   rN   rP   r   r   r   r   r	  r   r   r   r!   r
    s(    


r
  )NNrD   )Vr   rz   	functoolsr   r   r   pathlibr   r   typingr   r   r   r   fsspec.corer	   r
   	packagingr   tqdm.contrib.concurrentr   r   r   downloadr   namingr   ra   r   utilsr   r   r   utils.file_utilsr   r   r   r   r   utils.py_utilsr   tuplerN   r   TRAINr[   
get_loggerr   r   r   r   r#   SPLIT_PATTERN_SHARDED
VALIDATIONTESTr=   r5   FSSPEC_VERSIONr   r3   rB   r   "DEFAULT_PATTERNS_SPLIT_IN_FILENAME"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAMEDEFAULT_PATTERNS_ALLr   r   rI   r   boolrJ   rS   rP   r`   ro   rw   r   r   r   r   intr   rL   r   r  r
  r   r   r   r!   <module>   s    







0&!,>/

(\]

a]5