o
    8wig)                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	Z
d dlZd dlZd dlmZ d dlmZmZ ejjeZG dd dejZdd	 Zg d
Zee_g dZee_g dZee_defddZdefddZ defddZ!defddZ"defddZ#defddZ$defddZ%i dededede&d e&d!e&d"e&d#e&d$ej'd%ej'd&e d'e d(e!d)e!d*e"d+e#d,e$d-e%iZ(e(e_(dS ).    N)islice)AnyCallable)cast_to_python_objects)-SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL	xbasenamec                   @   s   e Zd ZU dZee ed< ee ed< ee ed< eeee	ge	f f ed< dZ
edd Zd	ejfd
dZdd Zdd ZdS )
WebDatasetd   IMAGE_EXTENSIONSAUDIO_EXTENSIONSVIDEO_EXTENSIONSDECODERS   c              	   c   s`   i }t d}t }|D ]\}}t|\}}	|d u rq|r8|d |kr8|d|d< |d|d< |V  i }||d< ||d< | ||	 < |	dd t	v r|
|||	   |d| }
t |
}| ||	 < W d    n1 s{w   Y  || t|
dd }n|	dd }|| jv r| j| ||	 ||	< q|r|V  d S d S )Nmemory__key____url__.z	memory://)fsspec
filesystemdatasetsStreamingDownloadManagerbase_plus_extpopreadlowersplitr   write_bytesextractopendeleter   r   )clstar_pathtar_iteratorcurrent_examplefsstreaming_download_managerfilenamefexample_key
field_nameextracted_file_pathdata_extension r-   l/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/packaged_modules/webdataset/webdataset.py_get_pipeline_from_tar   s>   



z!WebDataset._get_pipeline_from_tarreturnc                 C   s   t  S )N)r   DatasetInfo)selfr-   r-   r.   _info;   s   zWebDataset._infoc                    s  | j jstd| j j  | j j}g }| D ]"\}}t|tr&|g} fdd|D }|tj	|||dd q| j
js| |d |d }tt|| jtfddD rbtd	d
d D }tj|ddj}	tj|	}
d D ]}|ddd }|| jv rt |
|< q{d D ]}|ddd }|| jv rt |
|< qd D ]}|ddd }|| jv rt |
|< q|
| j
_|S )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=c                    s   g | ]}  |qS r-   )iter_archive).0r"   )
dl_managerr-   r.   
<listcomp>H   s    z0WebDataset._split_generators.<locals>.<listcomp>)	tar_pathstar_iterators)name
gen_kwargsr   c                 3   s$    | ]}|   d    kV  qdS )r   N)keysr5   example)first_examplesr-   r.   	<genexpr>R   s   " z/WebDataset._split_generators.<locals>.<genexpr>zThe TAR archives of the dataset should be in WebDataset format, but the files in the archive don't share the same prefix or the same types.c                 S   s"   g | ]}t jt|gd dqS )T)only_1d_for_numpy)paTablefrom_pylistr   r=   r-   r-   r.   r7   W   s    default)promote_optionsr      r   )config
data_files
ValueErrordownloaditems
isinstancestrappendr   SplitGeneratorinfofeaturesr/   listr   #NUM_EXAMPLES_FOR_FEATURES_INFERENCEanyrB   concat_tablesschemaFeaturesfrom_arrow_schemarsplitr
   Imager   Audior   Video)r2   r6   rI   splits
split_namer8   r9   pipeline	pa_tablesinferred_arrow_schemarR   r*   	extensionr-   )r6   r?   r.   _split_generators>   sT   




zWebDataset._split_generatorsc                 c   s    dd | j j D }dd | j j D }t| j j }tt||D ]F\}\}}t| ||D ]7\}	}
|D ]
}||
vrDd |
|< q:|| D ]}|
| d ur`|
d d | |
| d|
|< qI| d|	 |
fV  q4q&d S )Nc                 S       g | ]\}}t |tjr|qS r-   )rM   r   r[   r5   r*   featurer-   r-   r.   r7   r   
    z1WebDataset._generate_examples.<locals>.<listcomp>c                 S   re   r-   )rM   r   r\   rf   r-   r-   r.   r7   u   rh   r   r   )pathbytes_)rQ   rR   rL   rS   r<   	enumeratezipr/   )r2   r8   r9   image_field_namesaudio_field_namesall_field_namestar_idxr"   r#   example_idxr>   r*   r-   r-   r.   _generate_examplesq   s.   


zWebDataset._generate_examplesN)__name__
__module____qualname__DEFAULT_WRITER_BATCH_SIZErS   rN   __annotations__dictr   r   rT   classmethodr/   r   r1   r3   rd   rs   r-   r-   r-   r.   r      s   
 
3r   c                 C   s(   t d| }|s
dS |d|dfS )z>Split off all file extensions.

    Returns base, allext.
    z^((?:.*/|)[^.]+)[.]([^/]*)$)NNrG      )rematchgroup)ri   r}   r-   r-   r.   r      s   r   )?blpbmpdibbufrcurpcxdcxddspsepsfitfitsfliflcftcftugbrgifgribh5hdfpngapngjp2j2kjpcjpfjpxj2cicnsicoimiimtiftiffjfifjpejpgjpegmpgmpegmsppcdpxrpbmpgmppmpnmpsdbwrgbrgbasgirastgaicbvdavstwebpwmfemfxbmxpm)aiffauavrcafflachtksvxmat4mat5mpc2koggpafpvfrawrf64sd2sdsircamvocw64wavnistwavexwveximp3opus)z.mkvz.mp4z.aviz.mpegz.movdatac                 C   s
   |  dS )Nzutf-8)decoder   r-   r-   r.   
text_loads  s   
r   c                 C   s   ddl m} || S )NrG   )_tenbin) r   decode_buffer)r   r   r-   r-   r.   tenbin_loads  s   
r   c                 C      dd l }|| S Nr   )msgpackunpackb)r   r   r-   r-   r.   msgpack_loads$     
r   c                 C   s$   dd l }t| }|jjj|ddS )Nr   Fallow_pickle)numpy.lib.formatioBytesIOlibformat
read_array)r   numpystreamr-   r-   r.   	npy_loads*  s   
r   c                 C   s   t jt| ddS )NFr   )nploadr   r   r   r-   r-   r.   	npz_loads1  s   r   c                 C   r   r   )cborloads)r   r   r-   r-   r.   
cbor_loads5  r   r   c                 C   s   dd l }|jt| ddS )Nr   T)weights_only)torchr   r   r   )r   r   r-   r-   r.   torch_loads;  s   r   txttext
transcriptr!   cls2indexinxidjsonjsntentbmpmsgnpynpzr   pth))r   r  r|   	itertoolsr   typingr   r   r   r   r   pyarrowrB   r   datasets.features.featuresr   datasets.utils.file_utilsr   r   utilslogging
get_loggerrt   loggerGeneratorBasedBuilderr   r   r
   r   r   rj   r   r   r   r   r   r   r   intr   r   r-   r-   r-   r.   <module>   s    uA	

