o
    .iL                     @   sT  d Z ddlZddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ee Z!eG dd dZ"eG dd dZ#G dd de$Z%G dd de$Z&eG dd dZ'eG dd dZ(G dd de)e*e(f Z+dS )aw  DatasetInfo record information we know about a dataset.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
    N)	dataclass)Path)ClassVarOptionalUnion)	url_to_fs)DatasetCardDatasetCardData   )config)Features)	SplitDict)Version)
get_logger)asdictunique_valuesc                   @   &   e Zd ZU dZeed< dZeed< dS )SupervisedKeysData inputoutputN)__name__
__module____qualname__r   str__annotations__r    r   r   A/home/ubuntu/.local/lib/python3.10/site-packages/datasets/info.pyr   7      
 r   c                   @   r   )DownloadChecksumsEntryDatar   keyvalueN)r   r   r   r    r   r   r!   r   r   r   r   r   =   r   r   c                   @      e Zd ZdZdS )MissingCachedSizesConfigErrorz;The expected cached sizes of the download file are missing.Nr   r   r   __doc__r   r   r   r   r#   C       r#   c                   @   r"   )NonMatchingCachedSizesErrorz/The prepared split doesn't have expected sizes.Nr$   r   r   r   r   r'   G   r&   r'   c                   @   sL   e Zd ZU dZee ed< dZee ed< dd Z	e
dedd fdd	ZdS )
PostProcessedInfoNfeaturesresources_checksumsc                 C   s0   | j d urt| j tst| j | _ d S d S d S N)r)   
isinstancer   	from_dictselfr   r   r   __post_init__P   s   zPostProcessedInfo.__post_init__post_processed_info_dictreturnc                    4   dd t | D  | di  fdd| D S )Nc                 S      h | ]}|j qS r   name.0fr   r   r   	<setcomp>W       z.PostProcessedInfo.from_dict.<locals>.<setcomp>c                       i | ]\}}| v r||qS r   r   r8   kvfield_namesr   r   
<dictcomp>X       z/PostProcessedInfo.from_dict.<locals>.<dictcomp>r   dataclassesfieldsitems)clsr1   r   r@   r   r-   U       zPostProcessedInfo.from_dict)r   r   r   r)   r   r   r   r*   dictr0   classmethodr-   r   r   r   r   r(   K   s   
 r(   c                   @   s  e Zd ZU dZejedZeed< ejedZ	eed< ejedZ
eed< ejedZeed< dZee ed< dZee ed	< dZee ed
< dZee ed< dZee ed< dZee ed< dZeeeef  ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZ ee ed< g dZ!e"e#e  ed< dd Z$d6dee fddZ%d7ddZ&dd  Z'e(d!e#d  fd"d#Z)e(d8d$edee d%d fd&d'Z*e(d(ed%d fd)d*Z+d9d:d-d.Z,d;d/d0Z-d%efd1d2Z.e(d3ed%d fd4d5Z/dS )<DatasetInfoa	  Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    )default_factorydescriptioncitationhomepagelicenseNr)   post_processedsupervised_keysbuilder_namedataset_nameconfig_nameversionsplitsdownload_checksumsdownload_sizepost_processing_sizedataset_sizesize_in_bytes)rV   rZ   r\   r)   rX   _INCLUDED_INFO_IN_YAMLc                 C   s   | j d urt| j tst| j | _ | jd ur$t| jts$t| j| _| jd urCt| jtsCt| jtr<t| j| _nt| j| _| j	d urUt| j	t
sUt
| j	| _	| jd ur{t| jts}t| jttfrpt| j | _d S tdi | j| _d S d S d S )Nr   )r)   r,   r   r-   rR   r(   rW   r   r   rX   r   from_split_dictrS   r   tuplelistr.   r   r   r   r0      s   zDatasetInfo.__post_init__Fstorage_optionsc                 C   s   t |fi |pi ^}}|t|tjd}| j||d W d   n1 s)w   Y  | jrT|t|tjd}| 	| W d   dS 1 sMw   Y  dS dS )a  Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        wb)pretty_printN)
r   open	posixpathjoinr   DATASET_INFO_FILENAME
_dump_inforQ   LICENSE_FILENAME_dump_license)r/   dataset_info_dirrd   rb   fs_r9   r   r   r   write_to_directory   s   "zDatasetInfo.write_to_directoryc                 C   s*   | tjt| |rdnddd dS )zQDump info in `file` file-like object open in bytes mode (to support remote files)   Nindentutf-8)writejsondumpsr   encode)r/   filerd   r   r   r   ri      s   *zDatasetInfo._dump_infoc                 C   s   | | jd dS )zTDump license in `file` file-like object open in bytes mode (to support remote files)rs   N)rt   rQ   rw   )r/   rx   r   r   r   rk      s   zDatasetInfo._dump_licensedataset_infosc                    s   dd  D  t  dkrt fdd D r d S dtdd  D  }dtdd  D  }dtd	d  D  }dtd
d  D  }d }d }| ||||||dS )Nc                 S   s   g | ]
}|d ur|  qS r+   )copyr8   	dset_infor   r   r   
<listcomp>       z*DatasetInfo.from_merge.<locals>.<listcomp>r   c                 3   s    | ]	} d  |kV  qdS )r   Nr   r{   ry   r   r   	<genexpr>   s    z)DatasetInfo.from_merge.<locals>.<genexpr>z

c                 s       | ]}|j V  qd S r+   )rN   r8   infor   r   r   r          c                 s   r   r+   )rO   r   r   r   r   r      r   c                 s   r   r+   )rP   r   r   r   r   r      r   c                 s   r   r+   )rQ   r   r   r   r   r      r   )rN   rO   rP   rQ   r)   rS   )lenallrg   r   strip)rH   ry   rN   rO   rP   rQ   r)   rS   r   r   r   
from_merge   s"   "zDatasetInfo.from_mergerl   r2   c                 C   s   t |fi |pi ^}}td|  |std|jt|tjddd}t	
|}W d   n1 s7w   Y  | |S )a   Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        zLoading Dataset info from zECalling DatasetInfo.from_directory() with undefined dataset_info_dir.rrs   encodingN)r   loggerr   
ValueErrorre   rf   rg   r   rh   ru   loadr-   )rH   rl   rb   rm   rn   r9   dataset_info_dictr   r   r   from_directory   s   
zDatasetInfo.from_directoryr   c                    r3   )Nc                 S   r4   r   r5   r7   r   r   r   r:     r;   z(DatasetInfo.from_dict.<locals>.<setcomp>c                    r<   r   r   r=   r@   r   r   rB     rC   z)DatasetInfo.from_dict.<locals>.<dictcomp>r   rD   )rH   r   r   r@   r   r-     rI   zDatasetInfo.from_dictTother_dataset_infoc                    s.   | j }|jdi  fdd|j  D  d S )Nc                    s(   i | ]\}}|d us s|t |qS r+   rz   deepcopyr=   ignore_noner   r   rB   !  s
    
z&DatasetInfo.update.<locals>.<dictcomp>r   )__dict__updaterG   )r/   r   r   	self_dictr   r   r   r     s   

zDatasetInfo.updatec                 C   s    | j di dd | j D S )Nc                 S      i | ]
\}}|t |qS r   r   r=   r   r   r   rB   )  r~   z$DatasetInfo.copy.<locals>.<dictcomp>r   )	__class__r   rG   r.   r   r   r   rz   (  s    zDatasetInfo.copyc                 C   sf   i }t | }|D ](}|| jv r0t| |}t|dr | ||< qt|dr,| ||< q|||< q|S )N_to_yaml_list_to_yaml_string)r   r^   getattrhasattrr   r   )r/   	yaml_dictr   r    r!   r   r   r   _to_yaml_dict+  s   



zDatasetInfo._to_yaml_dict	yaml_datac                    s~   t |}|dd urt|d |d< |dd ur%t|d |d< dd t| D  | di  fdd| D S )Nr)   rX   c                 S   r4   r   r5   r7   r   r   r   r:   @  r;   z.DatasetInfo._from_yaml_dict.<locals>.<setcomp>c                    r<   r   r   r=   r@   r   r   rB   A  rC   z/DatasetInfo._from_yaml_dict.<locals>.<dictcomp>r   )	rz   r   getr   _from_yaml_listr   rE   rF   rG   )rH   r   r   r@   r   _from_yaml_dict9  s   
 zDatasetInfo._from_yaml_dict)FN)Fr+   )T)r   rL   )r2   rL   )0r   r   r   r%   rE   fieldr   rN   r   rO   rP   rQ   r)   r   r   rR   r(   rS   r   rT   rU   rV   rW   r   r   rX   rJ   rY   rZ   intr[   r\   r]   r^   r   ra   r0   ro   ri   rk   rK   r   r   r-   r   rz   r   r   r   r   r   r   rL   [   sF   
 -
!

rL   c                   @   sN   e Zd ZddddZedddZededd fd	d
ZdeddfddZdS )DatasetInfosDictFr2   Nc                 C   s  i }t j|tj}t j|tj}|s| |}||  t j|rPt	|ddd}dd |
 D }tj|||r=dnd d W d    n1 sKw   Y  t j|r_t|}	|	j}
nd }	t }
|r||
 |	d u rytdt|
 d	 n|	}	|	t| d S d S )
Nwrs   r   c                 S   s   i | ]	\}}|t |qS r   )r   r8   rV   r|   r   r   r   rB   O  s    z7DatasetInfosDict.write_to_directory.<locals>.<dictcomp>rp   rq   z---
z
---
)ospathrg   r   DATASETDICT_INFOS_FILENAMEREPOCARD_FILENAMEr   r   existsre   rG   ru   dumpr   r   datar	   to_dataset_card_datar   saver   )r/   dataset_infos_dir	overwriterd   total_dataset_infosdataset_infos_pathdataset_readme_pathr9   dataset_infos_dictdataset_carddataset_card_datar   r   r   ro   E  s0   



z#DatasetInfosDict.write_to_directoryc                 C   s   t d|  tjtj|tjr(t	t
|tj j}d|v r(| |S tjtj|tjr`ttj|tjdd}| dd t	| D W  d    S 1 sYw   Y  d S |  S )NzLoading Dataset Infos from dataset_infors   r   c                 S   r   r   )rL   r-   )r8   rV   r   r   r   r   rB   m  s    
z3DatasetInfosDict.from_directory.<locals>.<dictcomp>)r   r   r   r   r   rg   r   r   r   r   r   r   from_dataset_card_datar   re   ru   rG   )rH   r   r   r9   r   r   r   r   a  s   
$zDatasetInfosDict.from_directoryr   c                 C   sl   t |dttfr3t |d tr| dd |d D S t|d }|d dd|_| |j|iS |  S )Nr   c                 S   s    i | ]}| d dt|qS )rV   default)r   rL   r   )r8   dataset_info_yaml_dictr   r   r   rB   z  s    z;DatasetInfosDict.from_dataset_card_data.<locals>.<dictcomp>rV   r   )r,   r   ra   rJ   rL   r   rV   )rH   r   r   r   r   r   r   u  s   	z'DatasetInfosDict.from_dataset_card_datac                 C   s6  | rd|v rt |d tr|d dd|d i}nd|v r/t |d tr/dd |d D }ni }i |dd |  D }| D ]\}}||d< qBt|dkrutt| |d< |d 	dd }|dkrsd|i|d |d< d S d S g |d< t
| D ]\}}|	dd  d|i|}|d | qd S d S )Nr   rV   r   c                 S   s   i | ]}|d  |qS )rV   r   )r8   config_metadatar   r   r   rB     s    z9DatasetInfosDict.to_dataset_card_data.<locals>.<dictcomp>c                 S   s   i | ]	\}}||  qS r   )r   r   r   r   r   rB     s    r
   )r,   rJ   r   ra   rG   r   nextitervaluespopsortedappend)r/   r   dataset_metadata_infosr   rV   dset_info_yaml_dictr   r   r   r   r     s@   
!z%DatasetInfosDict.to_dataset_card_data)FF)r2   N)r2   r   )	r   r   r   ro   rK   r   r	   r   r   r   r   r   r   r   D  s    r   ),r%   rz   rE   ru   r   rf   r   pathlibr   typingr   r   r   fsspecfsspec.corer   huggingface_hubr   r	   r   r   r)   r   rX   r   utilsr   utils.loggingr   utils.py_utilsr   r   r   r   r   r   	Exceptionr#   r'   r(   rL   rJ   r   r   r   r   r   r   <module>   s>    i