o
    ॵi                      @   s0  d dl Z d dlmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZ e Zdd	 Z		d%d
edee dee fddZde	dedededededefddZdefddZdeeef dedefddZdeeef defddZdd Zefd ededed!ed"ee f
d#d$ZdS )&    N)defaultdict)OptionalUnion)HubApi)DatasetContextConfig)DEFAULT_DATASET_REVISIONMetaDataFields)
get_loggerc                 C   s   dd |   D S )Nc                 S   s*   i | ]\}}| d s| dr||qS )metafile)get).0kv r   ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/msdatasets/utils/dataset_utils.py
<dictcomp>   s    z,format_dataset_structure.<locals>.<dictcomp>)items)dataset_structurer   r   r   format_dataset_structure   s   r   r   subset_namesplitc                 C   s   |r|| vs|st |  dkrtd| d|   |}|s0tt|  }td|  t| | }|rH||vrHtd| d|  |rP||| i}||fS )a|  
    Args:
        dataset_structure (dict): Dataset Structure, like
         {
            "default":{
                "train":{
                    "meta":"my_train.csv",
                    "file":"pictures.zip"
                }
            },
            "subsetA":{
                "test":{
                    "meta":"mytest.csv",
                    "file":"pictures.zip"
                }
            }
        }
        subset_name (str, optional): Defining the subset_name of the dataset.
        split (str, optional): Which split of the data to load.
    Returns:
           target_subset_name (str): Name of the chosen subset.
           target_dataset_structure (dict): Structure of the chosen split(s), like
           {
               "test":{
                        "meta":"mytest.csv",
                        "file":"pictures.zip"
                    }
            }
       zsubset_name z not found. Available: z,No subset_name specified, defaulting to the zsplit )lenkeys
ValueErrornextiterloggerinfor   )r   r   r   target_subset_nametarget_dataset_structurer   r   r   get_target_dataset_structure   s.   !r"   hub_api	max_limitis_recursivedataset_name	namespaceversionreturnc           
      C   sB   g }| j ||||d|d}|D ]}|d}	|	sq||	 q|S )a  
    List all objects for specific dataset.

    Args:
        hub_api (class HubApi): HubApi instance.
        max_limit (int): Max number of objects.
        is_recursive (bool): Whether to list objects recursively.
        dataset_name (str): Dataset name.
        namespace (str): Namespace.
        version (str): Dataset version.
    Returns:
        res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...]
    T)r&   r'   r$   r%   is_filter_dirrevisionKey)list_oss_dataset_objectsr   append)
r#   r$   r%   r&   r'   r(   resobjectsitem
object_keyr   r   r   list_dataset_objectsR   s   
r3   c                 C   s8   d}|   D ]\}}t|tr|dsd} |S q|S )a	  
    To check whether input contains at least one directory.

    Args:
        file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'}
    Returns:
        True if input contains at least one directory, False otherwise.
    Fz.zipT)r   
isinstancestrendswith)file_mapr/   r   r   r   r   r   contains_dirt   s   	 r8   c                 C   s$   t | tr| g} tj|d| S )N_)r4   r5   ospathjoin)r   r(   r   r   r   get_subdir_hash_from_split   s   
r=   c                 C   s.   t | tr| gS t | tr| S dt|  d)z! Unify the split to list-format. z/Expected format of split: str or list, but got .)r4   r5   listtype)r   r   r   r   get_split_list   s
   

rA   c                 C   s`   i }|   D ]\}}g ||< q|D ]}|   D ]\}}||dd r,|| | qq|S )a  
    Get the map between dataset split and oss objects.

    Args:
        file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both of train and val
            are dirs.
        objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png']
    Returns:
        A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
            'validation':['val/003/3_38.png']}
    /)r   
startswithrstripr.   )r7   r0   r/   r   r   obj_keyr   r   r   get_split_objects_map   s   
rF   subset_split_intocontext_configr+   c                 C   s~  t t}t t}t t}t t}t }	|jj}
|  D ],\}}|dd||< |	|dd|||||< |dr@|d ||< |d||< qg }| D ]U\}}|r|tj	r|| }t
||
}|jdd}tj|d|d	d
}|j|jjd  }t|dkrtd| d |jd }n|d }||  }|||< qN|st|	dd|||d}t|rt||}||||fS )a"  
    Return:
        meta_map: Structure of meta files (.csv), the meta file name will be replaced by url, like
        {
           "test": "https://xxx/mytest.csv"
        }
        file_map: Structure of data files (.zip), like
        {
            "test": "pictures.zip"
        }
    custom r
   r   args	delimiter,F\)iteratorrL   
escapecharz:FILEr   zNo column contains ":FILE" in r>   T)r#   r$   r%   r&   r'   r(   )r   dictr   data_meta_configmeta_cache_dirr   r   get_dataset_file_url_originr   ARGS_BIG_DATAfetch_meta_files_from_urlconfig_kwargspdread_csvcolumnsr5   containsto_listr   r   errorr3   r8   rF   )rG   r&   r'   rH   r+   meta_mapr7   args_mapcustom_type_mapmodelscope_apirT   r   r   r0   	args_dictmeta_csv_file_urlmeta_csv_file_pathcsv_delimitercsv_df
target_colr   r   r   get_dataset_files   sl   


ri   )NN)r:   collectionsr   typingr   r   pandasrY   modelscope.hub.apir   4modelscope.msdatasets.context.dataset_context_configr   modelscope.utils.constantr   r   modelscope.utils.loggerr	   r   r   rR   r5   r"   intboolr?   r3   r8   r=   rA   rF   ri   r   r   r   r   <module>   sP   	
8
"
