o
    2wi                     @   sJ   d dl Z d dlZd dlmZ d dlmZmZ d dlmZ G dd dZ	dS )    N)BytesIO)ListOptional)	open_bestc                   @   s   e Zd ZdZ	ddedee defddZed	e	fd
dZ
dd Zdd Zdd Zdd Zdd Zed	ee fddZd dedede	fddZdS )!	TarWritera  
    TarWriter is a convenience wrapper over :class:`tarfile.TarFile` that
    allows writing binary data into tar files that are automatically segmented.
    Each segment is a separate tar file called a "shard."

    Shards are useful in training of deep learning models that require a substantial
    amount of data. Each shard can be read sequentially, which allows faster reads
    from magnetic disks, NFS, or otherwise slow storage.

    Example::

        >>> with TarWriter("some_dir/data.%06d.tar", shard_size=100) as w:
        ...     w.write("blob1", binary_blob1)
        ...     w.write("blob2", binary_blob2)  # etc.

    It would create files such as ``some_dir/data.000000.tar``, ``some_dir/data.000001.tar``, etc.
    The starting shard offset can be set using ``shard_offset`` parameter. The writer starts from 0 by default.

    It's also possible to use ``TarWriter`` with automatic sharding disabled::

        >>> with TarWriter("some_dir/data.tar", shard_size=None) as w:
        ...     w.write("blob1", binary_blob1)
        ...     w.write("blob2", binary_blob2)  # etc.

    This class is heavily inspired by the WebDataset library:
    https://github.com/webdataset/webdataset
      r   pattern
shard_sizeshard_offsetc                 C   s^   t || _| jr|d u rtd| js|d urtd || _|| _| jd| _	| 
  d S )Nz\shard_size must be specified when sharding is enabled via a formatting marker such as '%06d'zSharding is disabled because `pattern` doesn't contain a formatting marker (e.g., '%06d'), but shard_size is not None - ignoring shard_size.z.gz)strr   sharding_enabledRuntimeErrorloggingwarningr	   initial_shard_offsetendswithgzipreset)selfr   r	   r
    r   T/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/shar/writers/tar.py__init__&   s   
zTarWriter.__init__returnc                 C   s
   d| j v S )N%r   r   r   r   r   r   8   s   
zTarWriter.sharding_enabledc                 C   s*   d | _ d | _d | _| j| _d| _d| _d S )Nr   )fnamestream	tarstreamr   
num_shards	num_itemsnum_items_totalr   r   r   r   r   <   s   
zTarWriter.resetc                 C   s   |    | S N)r   r   r   r   r   	__enter__D   s   zTarWriter.__enter__c                 O   s   |    d S r"   )close)r   argskwargsr   r   r   __exit__H   s   zTarWriter.__exit__c                 C   s0   | j d ur
| j   | jd ur| j  d S d S r"   )r   r$   r   r   r   r   r   r$   K   s
   


zTarWriter.closec                 C   sh   |    | jr| j| j | _|  jd7  _n| j| _t| jd| _tj| j| j	r*dndd| _
d| _d S )N   wbzw|gzzw|)fileobjmoder   )r$   r   r   r   r   r   r   tarfileopenr   r   r    r   r   r   r   _next_streamQ   s   
zTarWriter._next_streamc                    s*    j r fddt j jD S  jgS )Nc                    s   g | ]} j | qS r   r   ).0ir   r   r   
<listcomp>d   s    z*TarWriter.output_paths.<locals>.<listcomp>)r   ranger   r   r   r   r   r   r   output_pathsa   s
   
zTarWriter.output_pathsTkeydatacountc                 C   s   |r| j dks| jr| jdkr| j| j dkr|   t|}|d t|	 |_
| j|| |rE|  jd7  _|  j d7  _ d S d S )Nr   r(   )r!   r   r    r	   r.   r,   TarInfoseeklengetvaluesizer   addfile)r   r4   r5   r6   tir   r   r   writej   s   



zTarWriter.writeN)r   r   )T)__name__
__module____qualname____doc__r   r   intr   propertyboolr   r   r#   r'   r$   r.   r   r3   r   r>   r   r   r   r   r   	   s(    
r   )
r   r,   ior   typingr   r   lhotse.serializationr   r   r   r   r   r   <module>   s    