o
    wi$                     @   s  d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlmZmZ ddlmZ dZdZdZd	d
 Zdeeef defddZddddZefdeeeef  deegef deeef fddZdeddfdejde
e deegef de
eegef  de
eegef  de	eeef  fddZeddi fdeeeef  deegef de
eegef  de
eegef  de
e de	eeef  fdd Zed!defdeeeef  d"eegeeef f d#ed$e
ee  deegef de	eeef  fd%d&Z eddfd'eeeef  deegef de
eegef  de
eegef  deeeef  f
d(d)Z!e"e!Z#dS )*z/Low level iteration functions for tar archives.    N)AnyCallableDictIterableIteratorOptionalSetTuple   )filtersgopen)reraise_exceptionF__c                 C   s(   t d| }|s
dS |d|dfS )zSplit off all file extensions.

    Args:
        path: Path with extensions.

    Returns:
        Tuple containing the base path and all extensions.
    z^((?:.*/|)[^.]+)[.]([^/]*)$)NNr
      )rematchgroup)pathr    r   T/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/webdataset/tariterators.pybase_plus_ext   s   	r   samplereturnc                 C   s4   | duot | tott|  dko| dd S )zCheck whether a sample is valid.

    Args:
        sample: A dictionary representing a sample.

    Returns:
        Boolean indicating whether the sample is valid.
    Nr   __bad__F)
isinstancedictlenlistkeysget)r   r   r   r   valid_sample(   s   
r    )shufflec                c   sH    t | trt| } nt| } |rt|  | D ]}t|dV  qdS )zGenerate a list of URLs, possibly shuffled.

    Args:
        urls: A string or list of URLs.
        shuffle: Whether to shuffle the URLs.

    Yields:
        Dictionary containing the URL.
    )urlN)r   strbraceexpandr   randomr!   r   )urlsr!   r"   r   r   r   	shardlist:   s   


r'   datahandlerkwc                 k   s    | D ]N}t |tsJ |d|v sJ |d }ztj|fi |}|j|d |V  W q tyQ } z|j|f |_||rEW Y d}~qW Y d}~ dS d}~ww dS )zOpen URLs and yield a stream of url+stream pairs.

    Args:
        data: Iterator over dict(url=...).
        handler: Exception handler.
        **kw: Keyword arguments for gopen.gopen.

    Yields:
        A stream of url+stream pairs.
    r"   )streamN)r   r   r   update	Exceptionargs)r(   r)   r*   r   r"   r+   exnr   r   r   
url_openerN   s"   
r0   z__[^/]*__($|/)fileobj	skip_metaselect_filesrename_filesc                 c   sB   t j| dd}|D ]}|j}zM| sW q
|du rW q
d|vr,|tr,|tr,W q
|dur8t	||r8W q
|r>||}|durH||sHW q
|
| }t||d}	|	V  g |_W q
 ty }
 z5t|
drt|
jdkrt|
jd d t|  f|
jd	d  |
_||
rW Y d}
~
q
W Y d}
~
 ~dS d}
~
ww ~dS )
ax  Iterate over tar file, yielding filename, content pairs for the given tar stream.

    Args:
        fileobj: The tar file stream.
        skip_meta: Regexp for keys that are skipped entirely.
        handler: Exception handler.
        select_files: Predicate for selecting files.
        rename_files: Function to rename files.

    Yields:
        A stream of samples.
    zr|*)r1   modeN/)fnamer(   r.   r   z @ r
   )tarfileopennameisreg
startswithmeta_prefixendswithmeta_suffixr   r   extractfilereadr   membersr-   hasattrr   r.   r#   )r1   r2   r)   r3   r4   r+   tarinfor7   r(   resultr/   r   r   r   tar_file_iteratorm   s>   
,rF   	eof_valuec           
      c   s    | D ]x}|d }| d}zAt|tsJ d|v sJ t|d |||dD ] }t|tr5d|v r5d|v s7J ||d< |durC||d	< |V  q&|durN|V  W q ty{ }	 z |	j| d| df |	_||	roW Y d}	~	qW Y d}	~	 dS d}	~	ww dS )
ac  Expand tar files.

    Args:
        data: Iterator over opened tar file streams.
        handler: Exception handler.
        select_files: Select files from tarfiles by name (permits skipping files).
        rename_files: Function to rename files.
        eof_value: Value to yield at the end of each shard.

    Yields:
        A stream of samples.
    r"   
local_pathr+   r)   r3   r4   r(   r7   __url__N__local_path__)r   r   r   rF   r-   r.   )
r(   r)   r3   r4   rG   sourcer"   rH   r   r/   r   r   r   tar_file_expander   s:   

rM   Tr   lcasesuffixesc                 c   s   d}| D ]}zt |tsJ |i krt|r|V  d}W q|d |d }}||\}	}
tr>t|	|
t |tr;| nd |	du rDW q|rJ|
 }
|du sT|	|d krct|r[|V  t|	|d d}|
|v rut| d|
 d|  |du s}|
|v r|||
< |d	}|dur||d	< W q t	y } z|j
|d
|df |_
||rW Y d}~qW Y d}~ nd}~ww t|r|V  dS dS )a  Group tarfile contents by keys and yield samples.

    Args:
        data: Iterator over tarfile contents.
        keys: Function that takes a file name and returns a key and a suffix.
        lcase: Whether to lowercase the suffix.
        suffixes: List of suffixes to keep.
        handler: Exception handler.

    Raises:
        ValueError: If there are duplicate file names in the tar file.

    Yields:
        Iterator over samples.
    Nr7   r(   __key__rJ   )rP   rJ   z": duplicate file name in tar file  rK   r+   r"   )r   r   r    traceprintr   lower
ValueErrorr   r-   r.   )r(   r   rN   rO   r)   current_sample
filesampler7   valueprefixsuffixrH   r/   r   r   r   group_by_keys   sX   

r[   srcc                 C   s,   t | |d}t||||d}t||d}|S )a  Generate samples from a stream of tar files.

    Args:
        src: Stream of tar files.
        handler: Exception handler.
        select_files: Function that selects files to be included.
        rename_files: Function to rename files.

    Returns:
        Stream of samples.
    )r)   rI   )r0   rM   r[   )r\   r)   r3   r4   streamsfilessamplesr   r   r   tarfile_samples  s   r`   )$__doc__r%   r   r8   typingr   r   r   r   r   r   r   r	   r$    r   r   handlersr   rR   r=   r?   r   r#   boolr    r'   r-   r0   TarFilerF   rM   r[   r`   pipelinefiltertarfile_to_samplesr   r   r   r   <module>   s   (

!
4
2

B
