o
    Sij,                     @   s  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ ed Ze
eef Z e	e  Z!G dd dZ"de#dee!e#f fddZ$de#de#de#fddZ%dd Z&dS )    N)partial)DictListLiteralOptionalTupleTypeUnion)fastcopy)ArrayTemporalArray)	Recording)Cut)to_shar_placeholder)ArrayTarWriter)AudioTarWriter)JsonlShardWriter)Pathlikeifnone)wavflacmp3lilcomnumpyjsonlc                   @   s   e Zd ZdZ					ddedeeef dee d	e	d
e	dee deddfddZ
ede	fddZedeeee f fddZdd Zdd Zdd ZdeddfddZdS )
SharWritera  
    SharWriter writes cuts and their corresponding data into multiple shards,
    also recognized as the Lhotse Shar format.
    Each shard is numbered and represented as a collection of one text manifest and
    one or more binary tarfiles.
    Each tarfile contains a single type of data, e.g., recordings, features, or custom fields.

    The main idea behind Lhotse Shar format is to optimize dataloading with sequential reads,
    while keeping the data composition more flexible than e.g. WebDataset tar archives do.
    To achieve this, Lhotse Shar keeps each data type in a separate archive, along a single
    CutSet JSONL manifest.
    This way, the metadata can be investigated without iterating through the binary data.
    The format also allows iteration over a subset of fields, or extension of existing data
    with new fields.

    The user has to specify which fields should be saved, and what compression to use for each of them.
    Currently we support ``wav``, ``flac``, ``opus``, and ``mp3`` compression for ``recording`` and custom audio fields,
    and ``lilcom`` or ``numpy`` for ``features`` and custom array fields.

    Example::

        >>> cuts = CutSet(...)  # cuts have 'recording' and 'features'
        >>> with SharWriter("some_dir", shard_size=100, fields={"recording": "opus", "features": "lilcom"}) as w:
        ...     for cut in cuts:
        ...         w.write(cut)

    .. note:: Different audio backends in Lhotse may use different encoders for the same audio formats.
        It is advisable to use the same audio backend for saving and loading audio data in Shar and other formats.
        See: :class:`lhotse.audio.recording.Recording`.

    It would create a directory ``some_dir`` with files such as ``some_dir/cuts.000000.jsonl.gz``,
    ``some_dir/recording.000000.tar``, ``some_dir/features.000000.tar``,
    and then the same names but numbered with ``000001``, etc.
    The starting shard offset can be set using ``shard_offset`` parameter. The writer starts from 0 by default.

    When ``shard_size`` is set to ``None``, we will disable automatic sharding and the
    shard number suffix will be omitted from the file names.

    The option ``warn_unused_fields`` will emit a warning when cuts have some data attached to them
    (e.g., recording, features, or custom arrays) but saving it was not specified via ``fields``.

    The option ``include_cuts`` controls whether we store the cuts alongside ``fields`` (true by default).
    Turning it off is useful when extending existing dataset with new fields/feature types,
    but the original cuts do not require any modification.

    See also: :class:`~lhotse.shar.writers.tar.TarWriter`, :class:`~lhotse.shar.writers.audio.AudioTarWriter`,
        :class:`~lhotse.shar.writers.array.ArrayTarWriter`.
      TNr   
output_dirfields
shard_sizewarn_unused_fieldsinclude_cutsshard_suffixshard_offsetreturnc                 C   s   t || _|| _|| _|| _|| _| jr$|d u s J d| dd| _nt|d| _|| _	i | _
|rCtt| j| j| j| j	d| j
d< | j D ]!\}}	t|	\}
}|
| j d| | j | | j| j	d| j
|< qHd S )Nz>shard_suffix must be None when shard_size is specified (got: 'z').z.%06d )patternr   r#   cuts/)strr   r   r   r    r!   sharding_enabledr"   r   initial_shard_offsetwritersr   _create_cuts_output_urlitemsresolve_writer)selfr   r   r   r    r!   r"   r#   fieldwriter_typemake_writer_fnext r5   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/shar/writers/shar.py__init__F   s6   



zSharWriter.__init__c                 C   s   | j d uo	| j dkS )Nr   )r   r0   r5   r5   r6   r*   m      zSharWriter.sharding_enabledc                 C   s   dd | j  D S )Nc                 S   s   i | ]\}}||j qS r5   )output_paths).0kwr5   r5   r6   
<dictcomp>s   s    z+SharWriter.output_paths.<locals>.<dictcomp>)r,   r.   r8   r5   r5   r6   r:   q   r9   zSharWriter.output_pathsc                 C   s   | j  D ]}|  q| S N)r,   values	__enter__r0   r=   r5   r5   r6   rA   u   s   
zSharWriter.__enter__c                 C   s   |    d S r?   )close)r0   exc_typeexc_valexc_tbr5   r5   r6   __exit__z   s   zSharWriter.__exit__c                 C   s   | j  D ]}|  qd S r?   )r,   r@   rC   rB   r5   r5   r6   rC   }   s   
zSharWriter.closecutc                 C   s  d| j v rG|jr=| }t|j|}t|j}|j|kr%||jd _	||_| j
d j|j||j||jjd t||d}n| j
d |j n|jrR| jrRtd d| j v r|jrw| }t|j|}| j
d j|j||d t||d}n| j
d |j n|jr| jrtd	 | j D ]}|d
v rq||r#t||}t|tttfst| j
| tsJ d| j |  d| d| j
| d|j||i q||}t||}| d}	i }
t|tr|j|
d< ||	r|j|	 |jd _	|j|	 |_| j
| j|j|fd|i|
 t||j  d}|j!|	d  t"||| q| j
| |j qt#|ji $ D ]%\}}t|tttfsDq5|| j vrY| jrWtd| d q5q5t|dd}d| j
v rq| j
d | d S d S )N	recordingr   )manifestoriginal_format)rI   zHFound cut with 'recording' field that is not specified for Shar writing.features)rJ   )rL   zGFound cut with 'features' field that is not specified for Shar writing.)rI   rL   z#Expected writer type 'jsonl' (got 'z') for non-data field 'z'.cut_id_channel_selectorsampling_raterJ   )customzFound cut with 'z/' field that is not specified for Shar writing.)startr'   )%r   has_recording
load_audior   rI   _aslistchannelchannel_idssourceschannelsr,   writeidrO   source_formatr
   write_placeholderr    warningswarnhas_featuresload_featuresrL   
has_customgetattr
isinstancer   r   r   r   load_customrP   copypopsetattrr   r.   )r0   rH   datarI   cut_channelsrL   keyvalplaceholder_objchannel_selector_keykwargsr5   r5   r6   rY      s   















	zSharWriter.write)r   TTNr   )__name__
__module____qualname____doc__r   r   r)   r   intboolr7   propertyr*   r   r:   rA   rG   rC   r   rY   r5   r5   r5   r6   r      s@    5
	
'r   namer$   c              
   C   s   t tdddft tdddft tdddft tdddft tdddft tdd	dft td
d	dftdfd}| |v sHJ d|  dd| ||  S )Nr   )formatz.tarr   r   opusoriginalr   )compressionr   	.jsonl.gz)r   r   r   rx   ry   r   r   r   zUnknown field type (got: 'z', we support only: z, )r   r   r   r   join)rv   optsr5   r5   r6   r/      s   
r/   base_output_urlr"   c                 C   s&   |  dr| dd} |  d| dS )Nzpipe:zpipe:gzip -c | z/cutsr{   )
startswithreplace)r~   r"   r5   r5   r6   r-      s   
r-   c                 C   s   t | tr| S | gS r?   )rc   list)xr5   r5   r6   rT      s   
rT   )'r]   	functoolsr   typingr   r   r   r   r   r   r	   lhotser
   lhotse.arrayr   r   lhotse.audior   
lhotse.cutr   lhotse.shar.utilsr   lhotse.shar.writers.arrayr   lhotse.shar.writers.audior   lhotse.shar.writers.cutr   lhotse.utilsr   r   
WriterNameFieldWriterInstanceFieldWriterr   r)   r/   r-   rT   r5   r5   r5   r6   <module>   s(    $ S	