o
    ॵiSS                     @   s>  d dl Z d dlmZmZ d dlZd dlZd dlZd dlm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z+ e+ Z,dZ-dZ.G dd dej/Z0G dd de0Z1G dd dej/Z2dS )    N)DictUnion)ArrowBasedBuilderDatasetDatasetDictGeneratorBasedBuilderIterableDatasetIterableDatasetDict)is_remote_filesystem)DatasetInfo)camelcase_to_snakecase)csv)FileLock)
map_nested)HubApi)DatasetContextConfig)ExternalDatasetNativeIterableDataset)DataStreamingDownloadManager)get_subdir_hash_from_split)DEFAULT_DATASET_NAMESPACEDatasetPathNameDownloadMode)
get_logger	delimiter,c                       sz   e Zd Zdef fddZefddZddefdefdd	Zd
d Z	dd Z
dd Zdd Zdd ZdefddZ  ZS )CsvDatasetBuilderdataset_context_configc                    s.  |j | _ |j| _|j| _|j| _|j| _|j| _|jj| _|jj| _|j	| _
ti | _tj| j| j| j | jtj| _t| _t| j
v rI| j
t | _| jpSt|jj }t|| jd}ddlm}m   fdd| j D }||}t j d| j| j||d| j
 | j | j!_"t#| j | _$ti | _%d S )Nsplitversionr   )DataFilesDictDataFilesListc                    s    i | ]\}}| |gd dqS )N)origin_metadata .0kvr"   r$   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/msdatasets/download/dataset_builder.py
<dictcomp>C   s    z.CsvDatasetBuilder.__init__.<locals>.<dictcomp>)	cache_dirconfig_namehash
data_filesr$   )&dataset_namecache_root_dir	namespacer    subset_namer   data_meta_configmeta_data_fileszip_data_filesconfig_kwargsinput_config_kwargsdictsplit_path_dictospathjoinr   	META_NAMEcache_build_dirDEFAULT_CSV_DELIMITERcsv_delimiterDELIMITER_NAMElisttarget_dataset_structurekeysr   datasets.data_filesr!   r"   itemsfrom_local_or_remotesuper__init__infobuilder_namer   namelocal_meta_csv_paths)selfr   r   sub_dir_hashr!   r/   	__class__r)   r*   rJ   (   sP   







zCsvDatasetBuilder.__init__c                 C   s    t j| j| jdd|d}|S )NFT)with_version	with_hashr2   )r;   r<   r=   _cache_dir_root_relative_data_dir)rO   r2   builder_data_dirr$   r$   r*   _build_cache_dirU   s   z"CsvDatasetBuilder._build_cache_dirTreturnc                 C   s   |du r| j jn| d| j j }| j}| j}|r!tj|| j}|r.tj|t| jj	}|r>|r>t
|tr>tj||}|S )zRelative path of this dataset in cache_dir:
        Will be:
            self.name/self.config.version/self.hash/
        or if a namespace has been specified:
            self.namespace___self.name/self.config.version/self.hash/
        N___)rK   rL   configr.   r;   r<   r=   	config_idstrr    
isinstance)rO   rS   rT   r2   rW   builder_configr.   r$   r$   r*   rV   ]   s   "

z$CsvDatasetBuilder._relative_data_dirc              	   C   sz   | j jstd|| j j}|| j}g }| D ]\}}t|tr'|g}|t	j
|||||dd q|S )Nz7At least one data file must be specified, but got none.)filesbase_dirrM   
gen_kwargs)r[   r/   
ValueErrordownload_and_extractr6   rG   r^   r]   appenddatasetsSplitGenerator
iter_filesget)rO   
dl_managerr/   r6   splits
split_namer`   r$   r$   r*   _split_generatorss   s&   
z#CsvDatasetBuilder._split_generatorsc                 #   s(   | j jd urt| j jjnd }|rdd t|j|jD nd }t|D ]l\}}t	j
|d|| jd}g }|jjD ]}	|	drE||	 q9z.t|D ]'\}
}|D ]}	 rb||	  fdd||	< qQtjj||d}||
f|fV  qKW q% ty } ztd	| d
t| d|   d }~ww d S )Nc                 S   s   i | ]	\}}||  qS r$   )to_pandas_dtype)r&   rM   dtyper$   r$   r*   r+      s    z6CsvDatasetBuilder._generate_tables.<locals>.<dictcomp>T)iteratorrp   r   :FILEc                       t j | S Nr;   r<   r=   xra   r$   r*   <lambda>       z4CsvDatasetBuilder._generate_tables.<locals>.<lambda>)schemazFailed to read file 'z' with error z: )r[   featurespar{   typezipnamestypes	enumeratepdread_csvrA   _engineendswithrf   applyTablefrom_pandasrd   loggererror)rO   r`   ra   r{   rp   file_idxfilecsv_file_readertransform_fields
field_name	batch_idxdfpa_tableer$   rx   r*   _generate_tables   sN   



z"CsvDatasetBuilder._generate_tablesc                 K   s*  |j j}|j j}|stj}|j j}|stj}| j}|stj}g }|tj ||j j	 || || || tj
|}	tj|tj|	d }
t|
8 tj|}|ro|tjjkrotd| j d| d td| j d| d | j||d W d    d S 1 sw   Y  d S )N.lockReusing dataset  ()Generating dataset )rk   download_mode)download_configr,   r   r   LOCK_FILE_NAME_ANYr    r3   rf   DATA_FILES_NAMEr0   LOCK_FILE_NAME_DELIMITERr=   r;   r<   stripr   existsr   REUSE_DATASET_IF_EXISTSvaluer   warningrM   rK   _download_and_prepare)rO   r   rk   download_kwargstarget_cache_dirrm   version_namer3   lock_file_nameslock_file_name	lock_pathdata_existsr$   r$   r*   download_and_prepare   sD   




"z&CsvDatasetBuilder.download_and_preparec                    sd   dd l }|jj |tjjkr|j dd tj dd  fdd| j	
 D | _|| j| _d S )Nr   T)ignore_errors)exist_okc                    s   i | ]\}}|t | qS r$   )r   fetch_meta_files_from_urlr%   r   r$   r*   r+          z;CsvDatasetBuilder._download_and_prepare.<locals>.<dictcomp>)shutilr   r,   r   FORCE_REDOWNLOADr   rmtreer;   makedirsr5   rG   rN   re   r6   r:   )rO   rk   r   r   r$   r   r*   r      s   

z'CsvDatasetBuilder._download_and_preparec              
      s   t j|d| jd}g }|j D ]}|dr|| q| j|d |D ]M}t	 t
rVt dkrV|jd t krQtd| d|jd  dt  d	 q& ||< q&t	 trk rk||  fd
d||< q&td|  q&tj|}t|dS )NFrq   r   rr    r   z,Number of lines in meta-csv file for split 'z' (z&) does not match number of data-files(z)!c                    rs   rt   ru   rv   base_extracted_dirr$   r*   ry      rz   z;CsvDatasetBuilder._convert_csv_to_dataset.<locals>.<lambda>zNothing to do for field )arrow_table)r   r   rA   columnstolistr   rf   r:   rj   r^   rC   lenshaper   r   r]   r   r   r}   r   r   r   )rO   rm   csv_file_pathr   r   r   pa_datar$   r   r*   _convert_csv_to_dataset   s<   





z)CsvDatasetBuilder._convert_csv_to_datasetc                    s   t  fdd j D S )Nc                    s   i | ]\}}|  ||qS r$   )r   r%   rO   r$   r*   r+      r   z0CsvDatasetBuilder.as_dataset.<locals>.<dictcomp>)r   rN   rG   r   r$   r   r*   
as_dataset   s   zCsvDatasetBuilder.as_dataset)__name__
__module____qualname__r   rJ   r   rX   r]   rV   rn   r   r   r   r   r   r   __classcell__r$   r$   rQ   r*   r   &   s    -	
'r   c                   @   s2   e Zd ZdefddZdd Zdd Zdd	 Zd
S )TaskSpecificDatasetBuilderr   c                 C   s   |j | _|j| _|j| _|j| _|j| _| jpt|jj	 }t
|| jd| _|jj| _|jj| _d | _d | _td|j i| _tj|j| _|  | _|jj| _d S )Nr   rL   )r0   rM   r3   r2   r   r    rC   r4   rD   rE   r   r.   r5   r/   r6   r:   r[   r   	from_dictrK   r;   r<   
expanduserr1   rU   rX   
_cache_dirmeta_args_map_config_kwargs)rO   r   r   r$   r$   r*   rJ     s.   


z#TaskSpecificDatasetBuilder.__init__c                 K   s   t j| j| jt jdd }t|; t j| j}|r:|t	j
kr:td| j d| j d 	 W d    d S td| j d| j d W d    n1 sRw   Y  | j|d d S )N_r   r   r   r   r   )rk   )r;   r<   r=   rU   r   replacesepr   r   r   r   r   r   rM   rK   r   )rO   r   rk   r   r   r   r$   r$   r*   r     s   
z/TaskSpecificDatasetBuilder.download_and_preparec                 C   s   | | j| _d S rt   )re   r6   r:   )rO   rk   r$   r$   r*   r   '  s   
z0TaskSpecificDatasetBuilder._download_and_preparec                 C   s   t | j| jS rt   )r   r:   r   r   r$   r$   r*   r   +  s   z%TaskSpecificDatasetBuilder.as_datasetN)r   r   r   r   rJ   r   r   r   r$   r$   r$   r*   r      s
    r   c                       s   e Zd Zdef fddZededejfddZde	de
eeef ef fdd	Zde	fd
dZdefddZdd ZdeddfddZedededefddZ  ZS )IterableDatasetBuilderr   c                    s  |j | _ |j| _|j| _|j| _|j| _|j| _|jj| _|jj| _|j	| _
|j| _tj| j| j| j | jtj| _t| _t| j
v rH| j
t | _| jpRt|jj }t|| jd}t jd| j| j | j|d d| j
 | j | j_t| j | _d | _|jj | _ d S )Nr   )r,   r0   r-   r.   r/   r$   )!r0   r1   r2   r    r3   r   r4   r5   r6   r7   r8   stream_batch_sizer;   r<   r=   r   r>   r?   r@   rA   rB   rC   rD   rE   r   rI   rJ   rK   rL   r   rM   meta_csv_dfmeta_cache_dir)rO   r   r   rP   rQ   r$   r*   rJ   1  sJ   




zIterableDatasetBuilder.__init__rY   c                 C   s   t | d}|S )N)r   )r   )r   builder_instancer$   r$   r*   get_builder_instanceY  s   z+IterableDatasetBuilder.get_builder_instancerk   c                 C   s   t | ttfstd| j dt| j }|s$tdt| jj	 d| 
| dd | |D }|jj}|d u r>|}n||v rG|| }ntd| dt| t| j|d	d
}t |trdt|}|S )NzBuilder z is not streamable.z(Loading a streaming dataset cached in a z is not supported yet.c                 S   s   i | ]}|j |qS r$   )rM   )r&   sgr$   r$   r*   r+   n  s    z?IterableDatasetBuilder.as_streaming_dataset.<locals>.<dictcomp>zBad split: z. Available splits: T)	map_tuple)r^   r   r   rd   rM   r
   _fsNotImplementedErrorr~   r   _check_manual_downloadrn   r   r   rC   r   _as_streaming_dataset_singler9   r	   )rO   rk   is_localsplits_generatorsr   splits_generatorstreaming_datasetsr$   r$   r*   as_streaming_dataset`  s6   


z+IterableDatasetBuilder.as_streaming_datasetc              	   C   s*  g }d}d}| j rtt| j  }| jrtt| j }|r<|s<| j  D ]\}}|tj||g |dd q'|S |rh|rh| j D ] \}}t	|t
rQ|g}| j |}|tj||||dd qE|S |s|r| j D ]\}}t	|t
r}|g}|tj|d||dd qq|S d| j d)Nr   )metar`   rk   rb   +Neither column meta nor data file found in z#.json, specify at least one column.)r5   nextitervaluesr6   rG   rf   rg   rh   r^   r]   rj   r0   )rO   rk   rl   meta_data_filezip_data_filerm   meta_file_urlr`   r$   r$   r*   rn     sb   '

z(IterableDatasetBuilder._split_generatorsc                 C   s    |  |}t|| j|j| jdS )N)rK   r   r   ) _get_examples_iterable_for_splitr   rK   rM   r   )rO   r   ex_iterabler$   r$   r*   r     s   
z3IterableDatasetBuilder._as_streaming_dataset_singlec                 k   s   | d}| d}| d}t }d}d}|r0ttt|}|dr0d}tj|d }|rG|sG| 	| t
j| j}	d|	fV  d S |ro|ro| 	| |ra|| j| j| j|}
|
|j_t
j| j}	d|	fV  d S |s|rt
jd	|i}	d|	fV  d S d
| j d)Nr   r`   rk   Fr   z.zipTr   z
Input:FILEr   z.json .)rj   r   r]   r   r   r   r;   r<   splitext_get_meta_csv_dfr}   r   r   r   &get_dataset_access_config_for_unzippedr0   r2   r    r   
oss_configfrom_pydict)rO   rc   r   r`   rk   hub_apiis_zipzip_file_namezip_filer   oss_config_for_unzippedr$   r$   r*   r     s<   





z'IterableDatasetBuilder._generate_tablesr   Nc                 C   s<   | j d u s	| j jrt|| j}tj|d| jd| _ d S d S )NFr   )r   emptyr   r   r   r   r   rA   )rO   r   meta_csv_file_pathr$   r$   r*   r     s   z'IterableDatasetBuilder._get_meta_csv_dfheaderstextsr   c                 C   sT   i }|  |} tdt| D ]}g }|D ]}|| ||  q||| | < q|S )Nr   )r   ranger   rf   )r   r   r   residxcol_listliner$   r$   r*   trans_data_to_mapping  s   
z,IterableDatasetBuilder.trans_data_to_mapping)r   r   r   r   rJ   staticmethodr   Csvr   r   r   r   r]   r   r   rn   r   r   r   r   rC   r   r   r$   r$   rQ   r*   r   /  s*    (
(3
)	 r   )3r;   typingr   r   rg   pandasr   pyarrowr}   r   r   r   r   r   r	   datasets.filesystemsr
   datasets.infor   datasets.namingr   datasets.packaged_modulesr   datasets.utils.filelockr   datasets.utils.py_utilsr   modelscope.hub.apir   4modelscope.msdatasets.context.dataset_context_configr   !modelscope.msdatasets.dataset_clsr   r   /modelscope.msdatasets.download.download_managerr   )modelscope.msdatasets.utils.dataset_utilsr   modelscope.utils.constantr   r   r   modelscope.utils.loggerr   r   rB   r@   r  r   r   r   r$   r$   r$   r*   <module>   s4     Z0