o
    8wi                     @   s   d dl Z d dlZd dlmZ d dlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ eeZG d	d
 d
e jZddee defddZdee defddZddededefddZdd ZdS )    N)Optional)insecure_hashlib   )config) ExpectedMoreDownloadedFilesErrorExpectedMoreSplitsErrorNonMatchingChecksumErrorNonMatchingSplitsSizesErrorUnexpectedDownloadedFileErrorUnexpectedSplitsError   )
get_loggerc                   @   s   e Zd ZdZdZdZdZdS )VerificationModea  `Enum` that specifies which verification checks to run.

    The default mode is `BASIC_CHECKS`, which will perform only rudimentary checks to avoid slowdowns
    when generating/downloading a dataset for the first time.

    The verification modes:

    |                           | Verification checks                                                           |
    |---------------------------|------------------------------------------------------------------------------ |
    | `ALL_CHECKS`              | Split checks, uniqueness of the keys yielded in case of the GeneratorBuilder  |
    |                           | and the validity (number of files, checksums, etc.) of downloaded files       |
    | `BASIC_CHECKS` (default)  | Same as `ALL_CHECKS` but without checking downloaded files                    |
    | `NO_CHECKS`               | None                                                                          |

    
all_checksbasic_checks	no_checksN)__name__
__module____qualname____doc__
ALL_CHECKSBASIC_CHECKS	NO_CHECKS r   r   V/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/utils/info_utils.pyr      s
    r   expected_checksumsrecorded_checksumsc                    s    d u rt d d S tt t dkr#ttt t ttt  dkr;tttt   fdd D }|d urMd| nd}t|dkr`td| d| d	t d
|  d S )NzUnable to verify checksums.r   c                    s    g | ]} | | kr|qS r   r   ).0urlr   r   r   r   
<listcomp>4   s     z$verify_checksums.<locals>.<listcomp>z for  zChecksums didn't matchz:
zY
Set `verification_mode='no_checks'` to skip checksums verification and ignore this errorz&All the checksums matched successfully)loggerinfolensetr   strr
   r   )r   r   verification_namebad_urlsfor_verification_namer   r   r   verify_checksums,   s    
r*   expected_splitsrecorded_splitsc                    s    d u rt d d S tt t dkr#ttt t ttt  dkr;tttt   fdd D }t|dkrQtt|t d d S )NzUnable to verify splits sizes.r   c                    s2   g | ]} | j | j kr | | d qS ))expectedrecorded)num_examples)r   namer+   r,   r   r   r    G   s
    z!verify_splits.<locals>.<listcomp>z$All the splits matched successfully.)r"   r#   r$   r%   r   r&   r   r	   )r+   r,   
bad_splitsr   r1   r   verify_splits?   s   
r3   Tpathrecord_checksumreturnc                    sx   |r1t  }t| d t fdddD ]}|| q| }W d   n1 s+w   Y  nd}tj| |dS )z7Compute the file size and the sha256 checksum of a filerbc                      s
     dS )Ni   )readr   fr   r   <lambda>V   s   
 z(get_size_checksum_dict.<locals>.<lambda>    N)	num_byteschecksum)	r   sha256openiterupdate	hexdigestosr4   getsize)r4   r5   mchunkr>   r   r9   r   get_size_checksum_dictQ   s   
rH   c                 C   s   | r
t jr
| t jk S dS )zCheck if `dataset_size` is smaller than `config.IN_MEMORY_MAX_SIZE`.

    Args:
        dataset_size (int): Dataset size in bytes.

    Returns:
        bool: Whether `dataset_size` is smaller than `config.IN_MEMORY_MAX_SIZE`.
    F)r   IN_MEMORY_MAX_SIZE)dataset_sizer   r   r   is_small_dataset^   s   
	
rK   )N)T)enumrD   typingr   huggingface_hub.utilsr   r!   r   
exceptionsr   r   r   r	   r
   r   loggingr   r   r"   Enumr   dictr*   r3   r&   boolrH   rK   r   r   r   r   <module>   s     