o
    8wi$                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ eeZG dd dejZdedeee ef fddZ G dd de!ee!ee	f f Z"i dg dg dg dg dg dg dg dg dg dg dg d g d!g d"g d#g d$g d%g i d&g d'g d(g d)g d*g d+g d,g d-g d.g d/g d0g d1g d2g d3g d4g d5g d6g g g g d7Z#dS )8    N)Counter)groupby)
itemgetter)AnyClassVarOptional)DatasetCardData   )METADATA_CONFIGS_FIELD)Features)DatasetInfoDatasetInfosDict)	_split_re)
get_loggerc                       s&   e Zd Zdd Zd fdd	Z  ZS )_NoDuplicateSafeLoaderc                    sR   fdd|j D }dd |D }t|  fdd D }|r'td| d S )Nc                    s   g | ]	\}} j | qS  )constructed_objects).0key_node_)selfr   T/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/utils/metadata.py
<listcomp>   s    zS_NoDuplicateSafeLoader._check_no_duplicates_on_constructed_node.<locals>.<listcomp>c                 S   s"   g | ]}t |trt|n|qS r   )
isinstancelisttupler   keyr   r   r   r      s   " c                    s   g | ]
} | d kr|qS )   r   r   )counterr   r   r      s    zGot duplicate yaml keys: )valuer   	TypeError)r   nodekeysduplicate_keysr   )r   r   r   (_check_no_duplicates_on_constructed_node   s   z?_NoDuplicateSafeLoader._check_no_duplicates_on_constructed_nodeFc                    s   t  j||d}| | |S )N)deep)superconstruct_mappingr%   )r   r"   r&   mapping	__class__r   r   r(      s   
z(_NoDuplicateSafeLoader.construct_mapping)F)__name__
__module____qualname__r%   r(   __classcell__r   r   r*   r   r      s    r   readme_contentreturnc                 C   s|   t |  }|r7|d dkr7d|dd  v r7|dd  dd }d|d| }|d||d d  fS d d|fS )Nr   z---r   
)r   
splitlinesindexjoin)r0   full_contentsep_idx	yamlblockr   r   r   _split_yaml_from_readme$   s    r9   c                	   @   s   e Zd ZU dZeZee ed< e	de
fddZededee
eef  ded	d fd
dZeded	d fddZded	dfddZd	ee fddZdS )MetadataConfigsz5Should be in format {config_name: {**config_params}}.
FIELD_NAMEmetadata_configc                 C   s   |  d}|d urStd| d}t|ttfst|t|trU|D ]2}t|ttfrNt|trRt|dkrNd|v rNt	
t|d rNt| dttfsRt|q$d S d S d S )N
data_filesz
                Expected data_files in YAML to be either a string or a list of strings
                or a list of dicts with two keys: 'split' and 'path', but got a  
                Examples of data_files in YAML:

                   data_files: data.csv

                   data_files: data/*.png

                   data_files:
                    - part0/*
                    - part1/*

                   data_files:
                    - split: train
                      path: train/*
                    - split: test
                      path: test/*

                   data_files:
                    - split: train
                      path:
                      - train/part1/*
                      - train/part2/*
                    - split: test
                      path: test/*

                PS: some symbols like dashes '-' are not allowed in split names
                r	   splitpath)gettextwrapdedentr   r   str
ValueErrordictlenrematchr   )r<   yaml_data_filesyaml_error_messageyaml_data_files_itemr   r   r   $_raise_if_data_files_field_not_valid3   s4   

"z4MetadataConfigs._raise_if_data_files_field_not_validparquet_commit_hashexported_parquet_filesdataset_infosr1   c                    s@    fddt |tdD  rfdd  D | S )Nc              	      sH   i | ] \}}|fd dt |tdD t |t jpddqS )c                    s(   g | ]\}}| fd d|D dqS )c                    s   g | ]
}|d   d qS )urlzrefs%2Fconvert%2Fparquet)replace)r   parquet_filerM   r   r   r   r   s    zhMetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>.<listcomp>.<listcomp>)r>   r?   r   )r   
split_nameparquet_files_for_splitrS   r   r   r   o   s    
]MetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>.<listcomp>r>   z0.0.0r=   version)r   r   rC   r@   r   rX   )r   config_nameparquet_files_for_config)rO   rM   r   r   
<dictcomp>m   s    

zRMetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>configc                    s6   i | ]\ }  fd d|j D   d dqS )c                    s.   g | ]}  d  D ]
}|d |kr
|q
qS )r=   r>   r   )r   rT   	data_file)rY   metadata_configsr   r   r      s    
rV   rX   rW   )splits)r   dataset_info)r^   rY   r   r[      s    

)r   r   items)clsrM   rN   rO   r   )rO   r^   rM   r   ._from_exported_parquet_files_and_dataset_infosf   s   

z>MetadataConfigs._from_exported_parquet_files_and_dataset_infosdataset_card_datac                    s   | | jr=|| j }t|tstd| j d| d|D ]}d|vr,td| d| | q|  fdd|D S |  S )	Nz	Expected z to be a list, but got ''rY   zUEach config must include `config_name` field with a string name of a config, but got z. c                    s2   i | ]}|    r d dd   D qS )rY   c                 S   s(   i | ]\}}||d kr|nt |qS )features)r   _from_yaml_list)r   paramr    r   r   r   r[      s    zEMetadataConfigs.from_dataset_card_data.<locals>.<dictcomp>.<dictcomp>)copypoprb   )r   r<   r\   r   r   r[      s    
z:MetadataConfigs.from_dataset_card_data.<locals>.<dictcomp>)r@   r;   r   r   rD   rL   )rc   re   r^   r<   r   rl   r   from_dataset_card_data   s$   



z&MetadataConfigs.from_dataset_card_dataNc                 C   s|   | r<|   D ]}| | q| |}tti ||  }| D ]
\}}|dd  q#dd | D || j< d S d S )NrY   c                 S   s   g | ]
\}}d |i|qS ra   r   )r   rY   config_metadatar   r   r   r      s    
z8MetadataConfigs.to_dataset_card_data.<locals>.<listcomp>)valuesrL   rm   rE   sortedrb   rk   r;   )r   re   r<   current_metadata_configstotal_metadata_configsrY   rn   r   r   r   to_dataset_card_data   s   
z$MetadataConfigs.to_dataset_card_datac                 C   s\   d }|   D ]%\}}t| dks|dks|dr+|d u r |}qtd| d| dq|S )Nr   defaultz&Dataset has several default configs: 'z' and 'z'.)rb   rF   r@   rD   )r   default_config_namerY   r<   r   r   r   get_default_config_name   s   z'MetadataConfigs.get_default_config_name)r,   r-   r.   __doc__r
   r;   r   rC   __annotations__staticmethodrE   rL   classmethodr   r   r   rd   r   rm   rs   r   rv   r   r   r   r   r:   .   s&   
 2&r:   zimage-classificationtranslationzimage-segmentationz	fill-maskzautomatic-speech-recognitionztoken-classificationzsentence-similarityzaudio-classificationzquestion-answeringsummarizationzzero-shot-classificationztable-to-textzfeature-extractionotherzmultiple-choiceztext-classificationztext-to-imageztext2text-generationzzero-shot-image-classificationztabular-classificationztabular-regressionzimage-to-imageztabular-to-textzunconditional-image-generationztext-retrievalztext-to-speechzobject-detectionzaudio-to-audioztext-generationconversationalztable-question-answeringzvisual-question-answeringzimage-to-textzreinforcement-learning)zvoice-activity-detectionztime-series-forecastingzdocument-question-answering)$rG   rA   collectionsr   	itertoolsr   operatorr   typingr   r   r   yamlhuggingface_hubr   r\   r
   rg   r   infor   r   namingr   utils.loggingr   r,   logger
SafeLoaderr   rC   r   r9   rE   r:   known_task_idsr   r   r   r   <module>   s     
 	
 !"#