o
    ॵi                     @   sz  d dl Z d dlZd dlmZmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	l m!Z!m"Z" d d
l#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ e@ ZAdefddZBG dd dZCdS )    N)	AnyCallableDictIterableListMappingOptionalSequenceUnion)DatasetDatasetDictIterableDatasetIterableDatasetDict)_PACKAGED_DATASETS_MODULES)is_relative_path)DatasetRepository)DatasetContextConfig)LocalDataLoaderManagerLocalDataLoaderTypeRemoteDataLoaderManagerRemoteDataLoaderType)ExternalDatasetNativeIterableDataset)build_custom_dataset)DatasetDeleteManager)load_dataset_with_ctx)DatasetUploadManager)build_preprocessor)Config
ConfigDict)MS_DATASETS_CACHE)	DEFAULT_DATASET_NAMESPACEDEFAULT_DATASET_REVISIONConfigFieldsDatasetFormationsDownloadModeHubsModeKeysTasks
UploadMode)is_tf_availableis_torch_available)
get_loggerreturnc                 C   sJ   | d u rg } | S t | tr| g} | S tt| t| k r#td|  | S )Nz"List columns contains duplicates: )
isinstancestrlenset
ValueError)para r4   T/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/msdatasets/ms_dataset.pyformat_list'   s   
r6   c                '   @   s>  e Zd ZU dZdZdZeed< 	d\dee	e
eef dee fddZdd	 Zd
d Zdd Zedd Zedd Ze	d\dee	eef dedeed f fddZe	d\dee	eeee
ef dedeed f fddZeedeej dddde!j"e#dde$ ddfdeee%f dee dee dee dee dee dee d ee d!eeee&e e'eeee&e f f f  d"ee! d#ee d$ee( d%ee) d&ee$ d'ee d(ee( deed ef f"d)d*Z*eeeddd+e+j,fd,ed-ededee dee d.ee) d/ee) d0ee( d1ee+ ddfd2d3Z-eeddfd4ed5ed6ee d7ee d8ee ddfd9d:Z.eedddfd4ed;ed6ee d7ee d8ee d<e(ddfd=d>Z/eeefd,ededee dee def
d?d@Z0					+d]dAeee1e f dBee2e1e2 f dCedDe3dEe(f
dFdGZ4							+d^dHe)dIe(dBee2e1e2 f dAeee1e f dJe2dKe(dLe5ee6f dMeee1e f dNe(fdOdPZ7de	fdQdRZ8dSe5eef de	fdTdUZ9		+d_dBee2e1e2 f dAeee1e f dEe(fdVdWZ:		+		d`dHe)dIe(dBee2e1e2 f dKe(dNe(dMeee1e f dAeee1e f fdXdYZ;		dad&e$fdZd[Z<dS )b	MsDataseta  
    ModelScope Dataset (aka, MsDataset) is backed by a huggingface Dataset to
    provide efficient data access and local storage managements. On top of
    that, MsDataset supports the data integration and interactions with multiple
    remote hubs, particularly, ModelScope's own Dataset-hub. MsDataset also
    abstracts away data-access details with other remote storage, including both
    general external web-hosted data and cloud storage such as OSS.
    N_dataset_context_configds_instancetargetc                 C   sJ   || _ |d ur|| j jvrtdt| j j  d| || _d| _d S )Nz)"target" must be a column of the dataset(z
, but got F)_hf_dsfeatures	TypeErrorlistkeysr:   	is_custom)selfr9   r:   r4   r4   r5   __init__>   s   
zMsDataset.__init__c                 c   s0    | j D ]}| jd ur|| j V  q|V  qd S N)r;   r:   )rA   itemr4   r4   r5   __iter__J   s   

zMsDataset.__iter__c                 C   s
   | j | S rC   r;   )rA   keyr4   r4   r5   __getitem__Q      
zMsDataset.__getitem__c                 C   
   t | jS rC   )r0   r;   rA   r4   r4   r5   __len__T   rI   zMsDataset.__len__c                 C   s   | j S rC   rF   rK   r4   r4   r5   r9   W   s   zMsDataset.ds_instancec                 C   s   t | jtr
| jjS d S rC   )r.   r;   r   config_kwargsrK   r4   r4   r5   rM   [   s   zMsDataset.config_kwargshf_dsr-   c                    s   t dt t|tr |S t|tr4t| dkr( tt	|
 S  fdd| D S t|tr= |S tdt| )z
        @deprecated
        This method is deprecated and may be removed in future releases, please use `to_ms_dataset()` instead.
        z@from_hf_dataset is deprecated, please use to_ms_dataset instead.   c                       i | ]
\}}| |qS r4   r4   .0kvclsr:   r4   r5   
<dictcomp>r       z-MsDataset.from_hf_dataset.<locals>.<dictcomp>z2"hf_ds" must be a Dataset or DatasetDict, but got )warningswarnDeprecationWarningr.   r   r   r0   r?   nextitervaluesitemsr   r=   type)rV   rN   r:   r4   rU   r5   from_hf_datasetb   s   



zMsDataset.from_hf_datasetc                    s   t |tr
 |S t |tr.t| dkr" tt| S  fdd| D S t |t	r7 |S t |t
r@ |S t |trI |S t |trmt| dkra tt| S  fdd| D S tdt| )z&Convert input to `MsDataset` instance.rO   c                    rP   r4   r4   rQ   rU   r4   r5   rW      rX   z+MsDataset.to_ms_dataset.<locals>.<dictcomp>c                    rP   r4   r4   rQ   rU   r4   r5   rW      rX   z8"ds_instance" must be a Dataset or DatasetDict, but got )r.   r   r   r0   r?   r\   r]   r^   r_   r   r   r   r   r=   r`   )rV   r9   r:   r4   rU   r5   to_ms_datasetz   s&   






zMsDataset.to_ms_datasetFrO   dataset_name	namespaceversionhubsubset_namesplitdata_dir
data_filesdownload_mode	cache_diruse_streamingstream_batch_size
custom_cfgtokendataset_info_onlyc           !      K   sP  |rddl m} | }|| t|	ptj}	t|ptj}t| ts1t| t	s1t
dt|  t| t	rJ|du r<d}t|| i}tj||dS tj| } tj| }t| rz| ddkrz|sz| d}|d  }|d  } |rx| szd	td| |||||||||	|
||d
|}| tv stj| stj| rt|tj}tj||d}t|tr||_ |r|j!dd|i| d|_"|S |tj#krddl$m} || f||||	j%d|S |tjkr`ddl m} | }|j&| |d\}}t|tt'j(j%kr3t)d|d |  |||||
dd|	j%||||dd|
}|W  d   S 1 s,w   Y  dS t*|}|t+j,}tj||d}t|tr^|j-|_ |r^|j!dd|i| d|_"|S |tj.krddl/m0} ddl1m2} |t3kr{|j4|_5|t6kr|j7|_8|
t9krddl:m;} tj<|ddd}
|
|_=||} | >  | j?S d)a	  Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.

            Args:
                dataset_name (str): Path or name of the dataset.
                    The form of `namespace/dataset_name` is also supported.
                namespace(str, optional): Namespace of the dataset. It should not be None if you load a remote dataset
                    from Hubs.modelscope,
                namespace (str, optional):
                    Namespace of the dataset. It should not be None if you load a remote dataset
                    from Hubs.modelscope,
                target (str, optional): Name of the column to output.
                version (str, optional): Version of the dataset script to load:
                subset_name (str, optional): Defining the subset_name of the dataset.
                data_dir (str, optional): Defining the data_dir of the dataset configuration. I
                data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
                split (str, optional): Which split of the data to load.
                hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
                download_mode (DownloadMode or str, optional): How to treat existing datasets. default
                                                               DownloadMode.REUSE_DATASET_IF_EXISTS
                cache_dir (str, Optional): User-define local cache directory.
                use_streaming (bool, Optional): If set to True, no need to download all data files.
                                                Instead, it streams the data progressively, and returns
                                                NativeIterableDataset or a dict of NativeIterableDataset.
                stream_batch_size (int, Optional): The batch size of the streaming data.
                custom_cfg (str, Optional): Model configuration, this can be used for custom datasets.
                                           see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
                token (str, Optional): SDK token of ModelScope.
                dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
                **config_kwargs (additional keyword arguments): Keyword arguments to be passed

            Returns:
                MsDataset (MsDataset): MsDataset object for a certain dataset.
            r   )HubApiz.dataset_name must be `str` or `list`, but got Nr:   )r:   /rO   zUThe dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.)rc   rd   re   rg   rh   r:   rf   ri   rj   rk   cache_root_dirrm   rn   ro   T)load_dataset)namerh   	streamingrk   )rc   rd   )pathrv   ri   rj   rh   rl   r<   download_configrk   revisionrp   rw   rq   trust_remote_code)VirgoDownloader)VirgoDatasetConfig)
CACHE_HOMEvirgorf   datasetszPlease adjust input args to specify a loading mode, we support following scenes: loading from local disk, huggingface hub and modelscope hub.r4   )@modelscope.hub.apirr   loginr%   REUSE_DATASET_IF_EXISTSr&   
modelscoper.   r/   r>   r=   r`   r   	from_dictr7   rb   osrx   
expanduserexistsr   countrh   stripr   r   isdirisfiler   ru   r   HF_DATA_LOADERr8   to_custom_datasetr@   huggingfacer   valueget_dataset_id_and_typer$   generalr   r   r   MS_DATA_LOADERdataset_context_configr   -modelscope.msdatasets.data_loader.data_loaderr|   modelscope.utils.constantr}   r!   default_virgo_namespacerd   r"   default_dataset_versionre   r    modelscope.utils.config_dsr~   joinrt   processdataset)!rc   rd   r:   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rM   rr   apidataset_instis_local_pathdataset_name_splitr   ru   _apidataset_id_on_hubdataset_typedataset_resremote_dataloader_managerr|   r}   r~   virgo_downloaderr4   r4   r5   load   s*  9





	

&


zMsDataset.loadTobject_namelocal_file_pathnum_processes	chunksizefilter_hidden_filesupload_modec	           
      C   s   	 t dt | stdt|||d}	t|ptj}tj	|r+|	j
| ||d dS tj|r>|	j| |||||d dS t| d)z
        @deprecated
        This method is deprecated and may be removed in future releases, please use git command line instead.
        Hupload is deprecated, please use git command line to upload the dataset.zobject_name cannot be empty!rc   rd   re   )r   r   r   )object_dir_namelocal_dir_pathr   r   r   r   z& is not a valid file path or directoryN)rY   rZ   r[   r2   r   r)   	OVERWRITEr   rx   r   uploadr   
upload_dir)
r   r   rc   rd   re   r   r   r   r   _upload_managerr4   r4   r5   r   ^  s:   

zMsDataset.uploaddataset_work_dir
dataset_idrz   
auth_tokengit_pathc                 C   sR   t dt t| ||||d}| }|rtd| dS td| dS )a  Clone meta-file of dataset from the ModelScope Hub.

        Args:
            dataset_work_dir (str): Current git working directory.
            dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name .
            revision (str, optional):
                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
            auth_token (str, optional):
                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
                as the token is already saved when you login the first time, if None, we will use saved token.
            git_path (str, optional):
                The git command line path, if None, we use 'git'
        Returns:
            None
        r   repo_work_dirr   rz   r   r   zAlready cloned repo to: {}zRepo dir already exists: {}N)	rY   rZ   r[   r   cloneloggerinfoformatwarning)r   r   rz   r   r   _repoclone_work_dirr4   r4   r5   
clone_meta  s"   zMsDataset.clone_metacommit_messageforcec                 C   s&   t | d|||d}|j|||d dS )aU  Upload meta-file of dataset to the ModelScope Hub. Please clone the meta-data from the ModelScope Hub first.

        Args:
            dataset_work_dir (str): Current working directory.
            commit_message (str): Commit message.
            revision(`Optional[str]`):
                revision of the model you want to clone from. Can be any of a branch, tag or commit hash
            auth_token(`Optional[str]`):
                token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
                as the token is already saved when you log in the first time, if None, we will use saved token.
            git_path:(`Optional[str]`):
                The git command line path, if None, we use 'git'
            force (Optional[bool]): whether to use forced-push.

        Returns:
            None

         r   )r   branchr   N)r   push)r   r   rz   r   r   r   r   r4   r4   r5   upload_meta  s   zMsDataset.upload_metac                 C   s0   t |||d}|j| d}td|  d |S )as   Delete object of dataset. Please log in first and make sure you have permission to manage the dataset.

        Args:
            object_name (str): The object name of dataset to be deleted. Could be a name of file or directory. If it's
                directory, then ends with `/`.
                For example: your-data-name.zip, train/001/img_001.png, train/, ...
            dataset_name (str): Path or name of the dataset.
            namespace(str, optional): Namespace of the dataset.
            version (str, optional): Version of the dataset.

        Returns:
            res_msg (str): Response message.

        r   )r   zObject z successfully removed!)r   deleter   r   )r   rc   rd   re   _delete_managerresp_msgr4   r4   r5   r     s   zMsDataset.deletecolumnspreprocessors	task_namedata_config	to_tensorc                 K   sz   t  stdt| jtr |d|i || jj t||S |dur,| j|||dS | j	  | jj
d||d | jS )aF  Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
           torch.utils.data.DataLoader.

        Args:
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each (numeric) field of the dict
                will be used as a field of torch.utils.data.Dataset.
            columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only if
                `to_tensor` is True). If the preprocessor is None, the arg columns must have at least one column.
                If the `preprocessors` is not None, the output fields of processors will also be added.
            task_name (str, default None):  task name, refer to :obj:`Tasks` for more details
            data_config (ConfigDict, default None): config dict for model object.
                Attributes of ConfigDict:
                    `preprocessor` (Callable, List[Callable], optional): preprocessors to deal with dataset
                    `type` (str): the type of task
                    `split_config` (dict, optional): get the split config for ExternalDataset
                    `test_mode` (bool, optional): is test mode or not
            to_tensor (bool, default None): whether convert the data types of dataset column(s) to torch.tensor or not.
            format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

        Returns:
            :class:`torch.utils.data.Dataset`

        z>The function to_torch_dataset requires pytorch to be installedpreprocessorN)r   r   torch)r`   r   format_kwargs)r+   ImportErrorr.   r;   r   updaterM   r   !_to_torch_dataset_with_processorsreset_format
set_format)rA   r   r   r   r   r   r   r4   r4   r5   to_torch_dataset  s"   !

zMsDataset.to_torch_dataset
batch_sizeshuffle
collate_fndrop_remaindercollate_fn_args
label_colsprefetchc
           
   
   C   sj   t  std|dur| j|||||	||dS |du r"td dS | j  | jj||||||||	dS )a  Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
           model.fit() or model.predict().

        Args:
            batch_size (int): Number of samples in a single batch.
            shuffle(bool): Shuffle the dataset order.
            preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
                every sample of the dataset. The output type of processors is dict, and each field of the dict will be
                used as a field of the tf.data. Dataset. If the `preprocessors` is None, the `collate_fn`
                shouldn't be None.
            columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
                the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of
                processors will also be added.
            collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If
                the `preprocessors` is None, the `collate_fn` shouldn't be None.
            drop_remainder(bool, default None): Drop the last incomplete batch when loading.
            collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`.
            label_cols (str or List[str], defalut None): Dataset column(s) to load as labels.
            prefetch (bool, default True): Prefetch data.

        Returns:
            :class:`tf.data.Dataset`

        z?The function to_tf_dataset requires Tensorflow to be installed.N)r   r   r   r   z?The `preprocessors` and the `collate_fn` should`t be both None.)r   r   r   r   )r*   r   _to_tf_dataset_with_processorsr   errorr;   r   to_tf_dataset)
rA   r   r   r   r   r   r   r   r   r   r4   r4   r5   r   4  s<   $	
zMsDataset.to_tf_datasetc                 C   s   | j   | j S rC   )r;   r   rK   r4   r4   r5   to_hf_datasetv  s   
zMsDataset.to_hf_datasetcolumn_mappingc                 C   s   | j   | j |S )a  
        Rename columns and return the underlying hf dataset directly
        TODO: support native MsDataset column rename.
        Args:
            column_mapping: the mapping of the original and new column names
        Returns:
            underlying hf dataset
        )r;   r   rename_columns)rA   r   r4   r4   r5   remap_columnsz  s   
	zMsDataset.remap_columnsc                    s  t |tr|n|g}t   fdd| jj D  g }g }|ritt| jfdd D }|D ]}|dd |	 D  q3dd }	| D ]}
|	||
 sct
d|
 d	 ||
 qL||
 qLd
d lG fdddjjj}|| j||| |S )Nc                       g | ]}| v r|qS r4   r4   rR   rG   )r   r4   r5   
<listcomp>      z?MsDataset._to_torch_dataset_with_processors.<locals>.<listcomp>c                    s   i | ]
}|t  | qS r4   nparrayrR   rS   )sampler4   r5   rW     rX   z?MsDataset._to_torch_dataset_with_processors.<locals>.<dictcomp>c                 S      i | ]
\}}|t |qS r4   r   rQ   r4   r4   r5   rW     s    c                 S   s    t | jt jpt | jt jS rC   )r   
issubdtypedtypeintegerfloating)r   r4   r4   r5   is_numpy_number  s   zDMsDataset._to_torch_dataset_with_processors.<locals>.is_numpy_numberzData of column z  is non-numeric, will be removedr   c                       s>   e Zd Zdef fddZdd ZfddZdd	 Z  ZS )
zAMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDatasetr   c                    s4   t t  || _|| _|| _|| _|| _|| _d S rC   )	superr7   rB   r   preprocessor_listr   retained_numeric_columnsretained_unumeric_columnsr   )rA   r   r   r   r   r   r   	__class__r4   r5   rB     s   
zJMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__init__c                 S   rJ   rC   )r0   r   rK   r4   r4   r5   rL     rI   zIMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__len__c                    s   | j r |S |S rC   )r   	as_tensor)rA   xr   r4   r5   type_converter  s   
zPMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.type_converterc                    sx   j |   fddjD }jD ]&}|  D ]\}}jr'|jv r/|||< q|jv r8|||< qq|S )Nc                    s,   i | ]}j r|jv r| | qS r4   )r   r   r  r   	item_dictrA   r4   r5   rW     s    
zaMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__.<locals>.<dictcomp>)r   r   r   r_   r   r   r  r   )rA   indexresr   rS   rT   r4   r  r5   rH     s   



zMMsDataset._to_torch_dataset_with_processors.<locals>.MsMapDataset.__getitem__)	__name__
__module____qualname__r   rB   rL   r  rH   __classcell__r4   r   r   r5   MsMapDataset  s
    r
  )r.   r>   r6   r;   r<   r?   r\   r]   r   r_   r   r   appendr   utilsdatar   )rA   r   r   r   r   r   r   
sample_res	processorr   rS   r
  r4   )r   r   r   r5   r     sF   




&z+MsDataset._to_torch_dataset_with_processorsc                    sH  t |tr|n|gtt|}tt|   fddjj D dd ljj	
tjtjtjd}|rF|jtjd}dfdd	dd	jd jgd
fdd}	ddlm}
 |j|	|
d}rfdd}||}nt|dkr|dd }|dkr|j||d}|r||
}|S )Nc                    r   r4   r4   r   )cols_to_retainr4   r5   r     r   z<MsDataset._to_tf_dataset_with_processors.<locals>.<listcomp>r   )r   )buffer_sizeFc                    s`   t    fddD }D ]}|dd |j   D  q|r(|S tt| S )Nc                    s"   i | ]}|t j  | qS r4   )r   r   r;   r   )irA   r4   r5   rW     s   " zJMsDataset._to_tf_dataset_with_processors.<locals>.func.<locals>.<dictcomp>c                 S   r   r4   r   rQ   r4   r4   r5   rW     s    
)intr   r;   r_   tupler>   r^   )r  return_dictr  r   )r   retained_columnsrA   r  r5   func  s   

z6MsDataset._to_tf_dataset_with_processors.<locals>.funcT)input_signaturec                    s:   j | gfdd D d  fddtD S )Nc                    s   g | ]	} j |jqS r4   )dtypesas_dtyper   )rR   val)tfr4   r5   r     s    zTMsDataset._to_tf_dataset_with_processors.<locals>.fetch_function.<locals>.<listcomp>)inpToutc                    s   i | ]	\}}| | qS r4   r4   )rR   r  rG   outputr4   r5   rW   	  s    zTMsDataset._to_tf_dataset_with_processors.<locals>.fetch_function.<locals>.<dictcomp>)numpy_functionr^   	enumerater  )r  r  r  r   r5   fetch_function  s   
z@MsDataset._to_tf_dataset_with_processors.<locals>.fetch_function)AUTOTUNE)num_parallel_callsc                    sV    fdd|   D }t| dkrtt|  } t|dkr'tt| }| |fS )Nc                    s   i | ]\}}| v r||qS r4   r4   )rR   rG   tensorr   r4   r5   rW     s    z_MsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labels.<locals>.<dictcomp>rO   )r_   r0   r\   r]   r^   )input_batchlabelsr(  r4   r5   split_features_and_labels  s   
zKMsDataset._to_tf_dataset_with_processors.<locals>.split_features_and_labelsrO   c                 S   s   t t|  S rC   )r\   r]   r^   )r   r4   r4   r5   <lambda>  s    z:MsDataset._to_tf_dataset_with_processors.<locals>.<lambda>)r   )F)r.   r>   r6   r1   r;   r<   r?   
tensorflowr  r   from_tensor_slicesr   aranger0   int64r   function
TensorSpectensorflow.data.experimentalr%  mapbatchr   )rA   r   r   r   r   r   r   r   
tf_datasetr$  r%  r+  r4   )r  r  r   r   r  r  rA   r  r5   r     sJ   




z(MsDataset._to_tf_dataset_with_processorsc                 K   sp  t  std|sdS d| _|du rd|v r|d}|tjkr"dnd}|d| }|du rBt|tj	r=t
|j	jdnt
dd}|t|d	 |j}d
|v rV|d
}t|}d|v rd|d}|du rwt|drw|j}	|	rwt|	|}t| jtr|t|d || jj t||jd| _dS |dur|dd}
| j||
d| _dS | j  | jjdd dS )a  Convert the input datasets to specific custom datasets by given model configuration and preprocessor.

        Args:
            custom_cfg (Config): The model configuration for custom datasets.
            preprocessor (Preprocessor, Optional): Preprocessor for data samples.
            mode (str, Optional): See modelscope.utils.constant.ModeKeys

        Returns:
            `MsDataset`
        z?The function to_custom_dataset requires pytorch to be installedNTmodetrainr  zdataset.)r`   )r7  taskfieldr   )r   )cfgr   r   )r   r   r   )r+   r   r@   getr'   TRAINsafe_gethasattrr#   modelr   r`   r   dictr9  popr(   find_field_by_taskr   r   r.   r;   r   rM   r   r   r   r   )rA   ro   r   r7  kwargs
ds_cfg_keydata_cfgr   
field_namepreprocessor_cfgr   r4   r4   r5   r   '  sZ   





zMsDataset.to_custom_datasetrC   )NNNNT)NNNNNNT)NT)NTNN)NN)=r  r  r  __doc__r;   r8   r   __annotations__r
   r   r   r   r   r   r/   rB   rE   rH   rL   propertyr9   rM   classmethodr   rA  ra   r   rb   staticmethodr!   r"   r&   r   r%   r   r    r   r>   r	   r   boolr  r   r)   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r4   r4   r4   r5   r7   1   s  
 	





	 H		B& 
6
	

B
V
Rr7   )Dr   rY   typingr   r   r   r   r   r   r   r	   r
   numpyr   r   r   r   r   r   datasets.packaged_modulesr   datasets.utils.file_utilsr   modelscope.hub.repositoryr   4modelscope.msdatasets.context.dataset_context_configr   5modelscope.msdatasets.data_loader.data_loader_managerr   r   r   r   !modelscope.msdatasets.dataset_clsr   r   9modelscope.msdatasets.dataset_cls.custom_datasets.builderr   (modelscope.msdatasets.utils.delete_utilsr   ,modelscope.msdatasets.utils.hf_datasets_utilr   (modelscope.msdatasets.utils.upload_utilsr   modelscope.preprocessorsr   modelscope.utils.configr   r   r   r    r   r!   r"   r#   r$   r%   r&   r'   r(   r)   modelscope.utils.import_utilsr*   r+   modelscope.utils.loggerr,   r   r6   r7   r4   r4   r4   r5   <module>   s0   ,,
