o
    8wiq                     @   s   d dl Z d dlZd dlmZ d dlmZmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* dd	l+m,Z, e(e-Z.G d
d dZ/dS )    N)Iterable)OptionalUnion   )!SINGLE_FILE_COMPRESSION_PROTOCOLSArchiveIterableFilesIterable_get_extraction_protocol_get_path_extension!_prepare_path_and_storage_optionsis_relative_pathurl_or_path_join	xbasenamexdirname	xet_parsexexistsxgetsizexglob
xgzip_openxisdirxisfilexjoinxlistdirxnumpy_loadxopenxpandas_read_csvxpandas_read_excelxPathxpyarrow_parquet_read_tablexrelpathxsio_loadmatxsplit	xsplitextxwalkxxml_dom_minidom_parse)
get_logger)
map_nested   )DownloadConfigc                
   @   s   e Zd ZdZdZ				d"dee dee dee dee fdd	Ze	d
d Z
dd ZdedefddZdd ZdedefddZdd Zdeeejf dee fddZdeeee f dee fddZdd Zd d! ZdS )#StreamingDownloadManagera  
    Download manager that uses the "::" separator to navigate through (possibly remote) compressed archives.
    Contrary to the regular `DownloadManager`, the `download` and `extract` methods don't actually download nor extract
    data, but they rather return the path or url that could be opened using the `xopen` function which extends the
    built-in `open` function to stream data from remote files.
    TNdataset_namedata_dirdownload_config	base_pathc                 C   s:   || _ || _|ptjd| _|pt | _d | _d| _	d S )N.F)
_dataset_name	_data_dirospathabspath
_base_pathr(   r,   downloaded_sizerecord_checksums)selfr*   r+   r,   r-    r8   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py__init__9   s   
z!StreamingDownloadManager.__init__c                 C   s   | j S N)r0   r7   r8   r8   r9   
manual_dirG   s   z#StreamingDownloadManager.manual_dirc                 C   s   t | j|dd}|S )aU  Normalize URL(s) of files to stream data from.
        This is the lazy version of `DownloadManager.download` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        T	map_tuple)r&   _download_singler7   url_or_urlsr8   r8   r9   downloadK   s   z!StreamingDownloadManager.downloadurlpathreturnc                 C   s    t |}t|rt| j|}|S r;   )strr   r   r4   )r7   rD   r8   r8   r9   r@   _   s   z)StreamingDownloadManager._download_singlec                 C   s   t | j|dd}|S )a  Add extraction protocol for given url(s) for streaming.

        This is the lazy version of `DownloadManager.extract` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        Tr>   )r&   _extract)r7   rB   urlpathsr8   r8   r9   extractf   s   z StreamingDownloadManager.extractc                 C   s   t |}t|| jd}|dd }t|}|dv s|dr'td| d|d u r-|S |tv rUtj	
|dd }d|v rI|d |d n|}| d	| d| S | d
| S )Nr,   z::r   )tgztar)z.tar.gzz.tar.bz2z.tar.xzz+Extraction protocol for TAR archives like 'z' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead.

Example usage:

	url = dl_manager.download(url)
	tar_archive_iterator = dl_manager.iter_archive(url)

	for filename, file in tar_archive_iterator:
		...r.   z://z://::)rF   r	   r,   splitr
   endswithNotImplementedErrorr   r1   r2   basenamerindex)r7   rD   protocolr2   	extension
inner_filer8   r8   r9   rG   |   s   
	z!StreamingDownloadManager._extractc                 C   s   |  | |S )a0  Prepare given `url_or_urls` for streaming (add extraction protocol).

        This is the lazy version of `DownloadManager.download_and_extract` for streaming.

        Is equivalent to:

        ```
        urls = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) to stream from data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.
        )rI   rC   rA   r8   r8   r9   download_and_extract   s   z-StreamingDownloadManager.download_and_extracturlpath_or_bufc                 C   s$   t |dr
t|S tj|| jdS )aN  Iterate over files within an archive.

        Args:
            urlpath_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        readrJ   )hasattrr   from_buffrom_urlpathr,   )r7   rV   r8   r8   r9   iter_archive   s   

z%StreamingDownloadManager.iter_archiverH   c                 C   s   t j|| jdS )a  Iterate over files.

        Args:
            urlpaths (`str` or `list` of `str`):
                Root paths.

        Yields:
            str: File URL path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        rJ   )r   from_urlpathsr,   )r7   rH   r8   r8   r9   
iter_files   s   z#StreamingDownloadManager.iter_filesc                 C      d S r;   r8   r<   r8   r8   r9   manage_extracted_files      z/StreamingDownloadManager.manage_extracted_filesc                 C   r^   r;   r8   r<   r8   r8   r9   get_recorded_sizes_checksums   r`   z5StreamingDownloadManager.get_recorded_sizes_checksums)NNNN)__name__
__module____qualname____doc__is_streamingr   rF   r(   r:   propertyr=   rC   r@   rI   rG   rU   r   ioBufferedReaderr   tupler[   listr]   r_   ra   r8   r8   r8   r9   r)   /   s6    

 "r)   )0rh   r1   collections.abcr   typingr   r   utils.file_utilsr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   utils.loggingr%   utils.py_utilsr&   r,   r(   rb   loggerr)   r8   r8   r8   r9   <module>   s    !