o
    8wi1                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ee'Z(G dd dej)Z*G dd dZ+dS )zDownload manager interface.    N)datetime)partial)OptionalUnion)	url_to_fs)
thread_map   )config)tqdm)ArchiveIterableFilesIterablecached_pathis_relative_path,stack_multiprocessing_download_progress_barsurl_or_path_join)get_size_checksum_dict)
get_loggerr
   )NestedDataStructure
map_nested)tracked_str   )DownloadConfigc                   @   s   e Zd ZdZdZdZdZdS )DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOAD r#   r#   _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/download/download_manager.pyr   2   s
    r   c                
   @   s  e Zd ZdZ					d+dee dee dee dee fdd	Zed
d Z	edd Z
dedefddZdd Zdee dedee fddZdededefddZdeeejf fddZdeeee f fdd Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* ZdS ),DownloadManagerFNTdataset_namedata_dirdownload_config	base_pathc                 C   sF   || _ || _|ptjd| _i | _|| _|pt | _	i | _
i | _dS )a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        .N)_dataset_name	_data_dirospathabspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r(   downloaded_pathsextracted_paths)selfr&   r'   r(   r)   r2   r#   r#   r$   __init__J   s   
zDownloadManager.__init__c                 C   s   | j S N)r,   r5   r#   r#   r$   
manual_dirm   s   zDownloadManager.manual_dirc                 C   s   t dd | j D S )z+Returns the total size of downloaded files.c                 s   s    | ]}|d  V  qdS )	num_bytesNr#   ).0checksums_dictr#   r#   r$   	<genexpr>t   s    z2DownloadManager.downloaded_size.<locals>.<genexpr>)sumr1   valuesr8   r#   r#   r$   downloaded_sizeq   s   zDownloadManager.downloaded_sizeurl_or_urlsdownloaded_path_or_pathsc                 C   sJ   d}t tt| | |ddD ]\}}t|| jd| jt|< qdS )z)Record size/checksum of downloaded files.   zComputing checksums)delaydesc)record_checksumN)hf_tqdmlistzipflattenr   r2   r1   str)r5   rA   rB   rD   urlr.   r#   r#   r$   _record_sizes_checksumsv   s   z'DownloadManager._record_sizes_checksumsc              
   C   s  | j  }d|_|jdu rd|_t| j|d}t }t  t	||d|j
dddd}W d   n1 s5w   Y  t | }td	| d
  d t|}t|}| jtt| |  t }| || t | }td| d
  d |jS )ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FNzDownloading datar(   TzDownloading data files)	map_tuplenum_procrE   batched
batch_sizezDownloading took <   z minzChecksum Computation took )r(   copyextract_compressed_filedownload_descr   _download_batchedr   nowr   r   rQ   loggerinfototal_secondsr   r3   updatedictrI   rJ   rM   data)r5   rA   r(   download_func
start_timerB   durationr#   r#   r$   download   s6   


zDownloadManager.downloadurl_or_filenamesreturnc              	      s   t |dkrq   d _tj d}t|d }t|r$tj|}t	|fi  j
\}}d}z||dd}W n	 tyE   Y nw |dk rMtjnd}t|| jpVdd	tjd
dkrkt jrkt jd nd |tdS  fdd|D S )N   TrN   r   sizei  @r   Downloadingfiles8HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS1rO   )rE   unitpositionmax_workers
tqdm_classc                    s   g | ]	}j | d qS )rN   )_download_single)r;   url_or_filenamer(   r5   r#   r$   
<listcomp>   s    z5DownloadManager._download_batched.<locals>.<listcomp>)lenrU   disable_tqdmr   rp   rK   r   r   r0   r   storage_optionsr[   get	Exceptionr	   &HF_DATASETS_MULTITHREADING_MAX_WORKERSr   rW   r-   environmultiprocessingcurrent_process	_identityr
   )r5   rd   r(   r`   r.   fsrg   rn   r#   rr   r$   rX      s@   z!DownloadManager._download_batchedrq   c                 C   s>   t |}t|rt| j|}t||d}t|}|| |S )NrN   )rK   r   r   r0   r   r   
set_origin)r5   rq   r(   outr#   r#   r$   rp      s   
z DownloadManager._download_singlepath_or_bufc                 C   s   t |dr
t|S t|S )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        read)hasattrr   from_buffrom_urlpath)r5   r   r#   r#   r$   iter_archive   s   


zDownloadManager.iter_archivepathsc                 C   s
   t |S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   from_urlpaths)r5   r   r#   r#   r$   
iter_files  s   
zDownloadManager.iter_filesc                 C   sd   | j  }d|_t| j|d}t|||jdd}t|}t|}| j	t
t| |  |jS )a$  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        TrN   zExtracting data files)rQ   rE   )r(   rU   rV   r   rp   r   rQ   r   r4   r]   r^   rI   rJ   r_   )r5   path_or_pathsr(   extract_funcr4   r#   r#   r$   extract  s   
zDownloadManager.extractc                 C   s   |  | |S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r   rc   )r5   rA   r#   r#   r$   download_and_extract6  s   z$DownloadManager.download_and_extractc                 C   s
   | j  S r7   )r1   rU   r8   r#   r#   r$   get_recorded_sizes_checksumsH  s   
z,DownloadManager.get_recorded_sizes_checksumsc                 C   s^   t | j t | j  }t| j D ]\}}||v r,tj|r,t	| | j|= qd S r7   )
setr4   r?   r3   rH   itemsr-   r.   isfileremove)r5   paths_to_deletekeyr.   r#   r#   r$   delete_extracted_filesK  s   
z&DownloadManager.delete_extracted_filesc                 C   s   | j jr
|   d S d S r7   )r(   delete_extractedr   r8   r#   r#   r$   manage_extracted_filesR  s   z&DownloadManager.manage_extracted_files)NNNNT)r   r   r   is_streamingr   rK   r   r6   propertyr9   r@   r   rM   rc   rH   rX   rp   r   ioBufferedReaderr   r   r   r   r   r   r   r#   r#   r#   r$   r%   G   sJ    
#

2
+
 r%   ),r   enumr   r{   r-   r   	functoolsr   typingr   r   fsspecfsspec.corer   tqdm.contrib.concurrentr    r	   utilsr
   rG   utils.file_utilsr   r   r   r   r   r   utils.info_utilsr   utils.loggingr   utils.py_utilsr   r   utils.trackr   r(   r   r   rZ   Enumr   r%   r#   r#   r#   r$   <module>   s,    