o
    Sio                     @   sh  U d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlZd dlmZ d d	lmZm Z m!Z!m"Z"m#Z#m$Z$ d d
l%m&Z& eZ'dydede(fddZ)dZ*ede+fddZ,edd Z-dededdfddZ.dede/fddZ0G dd dZ1dededdfddZ2dedee/e3f fdd Z4G d!d" d"Z5deee(ef  deddfd#d$Z6dedeee(ef ddf fd%d&Z7G d'd( d(Z8G d)d* d*e9Z:G d+d, d,Z;G d-d. d.Z<G d/d0 d0Z=d1d2 Z>d3e(dede+fd4d5Z?dzded6ee de'fd7d8Z@dedee' fd9d:ZA	dzdedee' fd;d<ZBd=d> ZCG d?d@ d@eDZEdAe'deddfdBdCZFG dDdE dEe5e<e=e1ZGde/defdFdGZHdee/ dee/ fdHdIZIe"dJrd dlJZJdKdL ZKnejLZKG dMdN dNZMG dOdP dPZNG dQdR dReNZOG dSdT dTeNZPG dUdV dVeNZQG dWdX dXeNZRG dYdZ dZeNZSG d[d\ d\eNZTdefd]d^ZUdefd_d`ZVdefdadbZWdcZXG ddde deeNZYG dfdg dgeNZZda[edP e\dh< dee( fdidjZ]e	dkedPe(f dedl fdmdnZ^d{dodpZ_dkedPe(f ddPfdqdrZ`edsdtd{dudvZaG dwdx dxeNZbdS )|    N)StreamReaderStreamWriter)contextmanager)	lru_cache)BytesIOStringIO)Path)AnyDict	GeneratorIterableListOptionalTypeUnion)parse)PathlikePipe	SmartOpenis_module_availableis_valid_url replace_bucket_with_profile_namegzip_open_robustrpathmodec              	   C   s~   t | ttttfr| S t | ttfsJ dt|  d|  dzt 	| |W S  t
y>   t| r=td|  dt d w )a  
    Auto-determine the best way to open the input path or URI.
    Uses ``smart_open`` when available to handle URLs and URIs.

    Supports providing "-" as input to read from stdin or save to stdout.

    If the input is prefixed with "pipe:", it will open a subprocess and redirect
    either stdin or stdout depending on the mode.
    The concept is similar to Kaldi's "generalized pipes", but uses WebDataset syntax.
    zUnexpected identifier type z for object z. Expected str or pathlib.Path.z.Error trying to open what seems to be a URI: 'z'
In order to open URLs/URIs please run 'pip install smart_open' (if you're trying to use AIStore, either the Python SDK is not installed (pip install aistore) or z is not defined.)
isinstancer   r   r   r   strr   typeget_current_io_backendopen	Exceptionr   
ValueErrorAIS_ENDPOINT_ENVVAR)r   r    r%   H/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/serialization.py	open_best   s$   r'   AIS_ENDPOINTreturnc                   C   s   t tjv ottjt  S N)r$   osenvironr   r%   r%   r%   r&   is_aistore_available?   s   r-   c                  C   sN   t dstdt stddd l} tjd }t| j}| j	|ddd|fS )	NaistorezDPlease run 'pip install aistore' in order to read data from AIStore.zWSet a valid URL as AIS_ENDPOINT environment variable's value to read data from AIStore.r   r(   )      2   )timeoutmax_pool_size)
r   ImportErrorr-   r#   r.   r+   r,   parse_version__version__Client)r.   endpoint_urlversionr%   r%   r&   get_aistore_clientF   s   

r:   datac              	   C   s|   t |d/}ztj| |tjd W n ty#   tj| |tjd Y n	w W d    d S W d    d S 1 s7w   Y  d S )Nw)streamDumper)r'   yamldumpCSafeDumperAttributeError
SafeDumperr;   r   fr%   r%   r&   save_to_yamlW   s   "rF   c              	   C   sr   t | d*}ztj|tjdW W  d    S  ty.   tj|tjd Y W  d    S w 1 s2w   Y  d S )Nr   )r=   Loader)r'   r?   loadCSafeLoaderrB   
SafeLoaderr   rE   r%   r%   r&   	load_yaml`   s   rL   c                   @   4   e Zd ZdeddfddZededefddZdS )	YamlMixinr   r)   Nc                 C      t t|  | d S r*   )rF   listto_dictsselfr   r%   r%   r&   to_yamlj      zYamlMixin.to_yamlc                 C      t |}| |S r*   )rL   
from_dictsclsr   r;   r%   r%   r&   	from_yamlm      
zYamlMixin.from_yaml)__name__
__module____qualname__r   rT   classmethodManifestrZ   r%   r%   r%   r&   rN   i       rN   c                 C   sB   t |d}tj| |ddd W d   dS 1 sw   Y  dS )eSave the data to a JSON file. Will use GZip to compress it if the path ends with a ``.gz`` extension.r<      F)indentensure_asciiN)r'   jsonr@   rD   r%   r%   r&   save_to_jsons   s   "rg   c                 C   s8   t | d}t|W  d   S 1 sw   Y  dS zILoad a JSON file. Also supports compressed JSON with a ``.gz`` extension.r   N)r'   rf   rH   rK   r%   r%   r&   	load_jsony   s   $ri   c                   @   rM   )	JsonMixinr   r)   Nc                 C   rO   r*   )rg   rP   rQ   rR   r%   r%   r&   to_json   rU   zJsonMixin.to_jsonc                 C   rV   r*   )ri   rW   rX   r%   r%   r&   	from_json   r[   zJsonMixin.from_json)r\   r]   r^   r   rk   r_   r`   rl   r%   r%   r%   r&   rj      ra   rj   c                 C   sP   t |d}| D ]}ttj|dd|d qW d   dS 1 s!w   Y  dS )rb   r<   Fre   fileN)r'   printrf   dumps)r;   r   rE   itemr%   r%   r&   save_to_jsonl   s
   "rs   c                 c   sJ    t | d}|D ]	}t|}|V  q	W d   dS 1 sw   Y  dS rh   )r'   decode_json_line)r   rE   lineretr%   r%   r&   
load_jsonl   s   "rw   c                   @   s   e Zd ZdZddededdfddZdd	d
ZdddZde	e
ef defddZdd Zdd Zde	e
ef defddZd dededdfddZdee fddZdS )!SequentialJsonlWritera  
    SequentialJsonlWriter allows to store the manifests one by one,
    without the necessity of storing the whole manifest set in-memory.
    Supports writing to JSONL format (``.jsonl``), with optional gzip
    compression (``.jsonl.gz``).

    Example:

        >>> from lhotse import RecordingSet
        ... recordings = [...]
        ... with RecordingSet.open_writer('recordings.jsonl.gz') as writer:
        ...     for recording in recordings:
        ...         writer.write(recording)

    This writer can be useful for continuing to write files that were previously
    stopped -- it will open the existing file and scan it for item IDs to skip
    writing them later. It can also be queried for existing IDs so that the user
    code may skip preparing the corresponding manifests.

    Example:

        >>> from lhotse import RecordingSet, Recording
        ... with RecordingSet.open_writer('recordings.jsonl.gz', overwrite=False) as writer:
        ...     for path in Path('.').rglob('*.wav'):
        ...         recording_id = path.stem
        ...         if writer.contains(recording_id):
        ...             # Item already written previously - skip processing.
        ...             continue
        ...         # Item doesn't exist yet - run extra work to prepare the manifest
        ...         # and store it.
        ...         recording = Recording.from_file(path, recording_id=recording_id)
        ...         writer.write(recording)
    Tr   	overwriter)   Nc                 C   s   || _ d | _d| _t | _t| j  r?|sAd| _t| j d}dd dd |D D | _W d    d S 1 s8w   Y  d S d S d S )Nr<   ar   c                 S   s   h | ]
}d |v r|d  qS )idr%   ).0r;   r%   r%   r&   	<setcomp>   s
    z1SequentialJsonlWriter.__init__.<locals>.<setcomp>c                 s   s    | ]}t |V  qd S r*   )rt   )r|   ru   r%   r%   r&   	<genexpr>       z1SequentialJsonlWriter.__init__.<locals>.<genexpr>)r   ro   r   set
ignore_idsr   is_filer'   )rS   r   ry   rE   r%   r%   r&   __init__   s   
"zSequentialJsonlWriter.__init__c                 C   s   |    | S r*   )_maybe_openrS   r%   r%   r&   	__enter__   s   zSequentialJsonlWriter.__enter__c                 O   s   |    d S r*   )closerS   argskwargsr%   r%   r&   __exit__      zSequentialJsonlWriter.__exit__rr   c                 C   s8   t |tr
|| jv S z|j| jv W S  ty   Y dS w NF)r   r   r   r{   rB   rS   rr   r%   r%   r&   __contains__   s   

z"SequentialJsonlWriter.__contains__c                 C   s"   | j d u rt| j| j| _ d S d S r*   )ro   r'   r   r   r   r%   r%   r&   r      s   
z!SequentialJsonlWriter._maybe_openc                 C   s"   | j d ur| j   d | _ d S d S r*   )ro   r   r   r%   r%   r&   r      s   


zSequentialJsonlWriter.closec                 C      || v S r*   r%   r   r%   r%   r&   contains      zSequentialJsonlWriter.containsFmanifestflushc                 C   sr   z|j | jv r
W dS W n	 ty   Y nw |   t|ts"| }ttj	|dd| j
d |r7| j
  dS dS )az  
        Serializes a manifest item (e.g. :class:`~lhotse.audio.Recording`,
        :class:`~lhotse.cut.Cut`, etc.) to JSON and stores it in a JSONL file.

        :param manifest: the manifest to be written.
        :param flush: should we flush the file after writing (ensures the changes
            are synced with the disk and not just buffered for later writing).
        NFrm   rn   )r{   r   rB   r   r   dictto_dictrp   rf   rq   ro   r   rS   r   r   r%   r%   r&   write   s   	
zSequentialJsonlWriter.writec                 C   s8   t | j s	dS | jdur| jjs| j  t| jS )z
        Opens the manifest that this writer has been writing to.
        The manifest is opened in a lazy mode.
        Returns ``None`` when the manifest is empty.
        N)r   r   existsro   closedr   load_manifest_lazyr   r%   r%   r&   open_manifest   s
   

z#SequentialJsonlWriter.open_manifestT)r)   rx   r)   NF)r\   r]   r^   __doc__r   boolr   r   r   r   r   r	   r   r   r   r   r   r   r`   r   r%   r%   r%   r&   rx      s    "


rx   c                   @      e Zd ZdS )InvalidPathExtensionNr\   r]   r^   r%   r%   r%   r&   r         r   c                   @   sz   e Zd ZdZdd ZdddZddd	Zdefd
dZde	e
ef defddZddeddfddZdee fddZdS )InMemoryWriterz
    Mimics :class:`.SequentialJsonlWriter` API but doesn't actually perform any I/O.
    It is used internally to create manifest sets without writing them to disk.
    c                 C   s   g | _ t | _d S r*   )items	frozensetr   r   r%   r%   r&   r     s   zInMemoryWriter.__init__r)   c                 C   s   | S r*   r%   r   r%   r%   r&   r        zInMemoryWriter.__enter__Nc                 O      d S r*   r%   r   r%   r%   r&   r     r   zInMemoryWriter.__exit__c                 C      dS r   r%   r   r%   r%   r&   r      r   zInMemoryWriter.__contains__rr   c                 C   r   r*   r%   r   r%   r%   r&   r   #  r   zInMemoryWriter.containsFr   c                 C   s   | j | d S r*   )r   appendr   r%   r%   r&   r   &  s   zInMemoryWriter.writec                 C   s$   | j sdS t| j d }|| j S )z
        Return a manifest set. Resolves the proper manifest set class by itself.
        Returns ``None`` when the manifest is empty.
        Nr   )r   resolve_manifest_set_class
from_itemsrS   rY   r%   r%   r&   r   )  s   zInMemoryWriter.open_manifest)r)   r   r   r   )r\   r]   r^   r   r   r   r   r   r   r   r   r	   r   r   r   r`   r   r%   r%   r%   r&   r     s    

r   c                
   @   sb   e Zd ZdeddfddZededefddZe	ddeedf d	e	dee
ef fd
dZdS )
JsonlMixinr   r)   Nc                 C   s   t |  | d S r*   )rs   rQ   rR   r%   r%   r&   to_jsonl5  s   zJsonlMixin.to_jsonlc                 C   rV   r*   )rw   rW   rX   r%   r%   r&   
from_jsonl8  r[   zJsonlMixin.from_jsonlTry   c                 C   s   |du rt  S t||dS )a$  
        Open a sequential writer that allows to store the manifests one by one,
        without the necessity of storing the whole manifest set in-memory.
        Supports writing to JSONL format (``.jsonl``), with optional gzip
        compression (``.jsonl.gz``).

        .. note:: when ``path`` is ``None``, we will return a :class:`.InMemoryWriter`
            instead has the same API but stores the manifests in memory.
            It is convenient when you want to make disk saving optional.

        Example:

            >>> from lhotse import RecordingSet
            ... recordings = [...]
            ... with RecordingSet.open_writer('recordings.jsonl.gz') as writer:
            ...     for recording in recordings:
            ...         writer.write(recording)

        This writer can be useful for continuing to write files that were previously
        stopped -- it will open the existing file and scan it for item IDs to skip
        writing them later. It can also be queried for existing IDs so that the user
        code may skip preparing the corresponding manifests.

        Example:

            >>> from lhotse import RecordingSet, Recording
            ... with RecordingSet.open_writer('recordings.jsonl.gz', overwrite=False) as writer:
            ...     for path in Path('.').rglob('*.wav'):
            ...         recording_id = path.stem
            ...         if writer.contains(recording_id):
            ...             # Item already written previously - skip processing.
            ...             continue
            ...         # Item doesn't exist yet - run extra work to prepare the manifest
            ...         # and store it.
            ...         recording = Recording.from_file(path, recording_id=recording_id)
            ...         writer.write(recording)
        N)ry   )r   rx   )rY   r   ry   r%   r%   r&   open_writer=  s   )zJsonlMixin.open_writerr   )r\   r]   r^   r   r   r_   r`   r   r   r   rx   r   r   r%   r%   r%   r&   r   4  s    

r   c                   @   sp   e Zd ZdefddZedeeee	f ee	 f fddZ
dd Zedefd	d
ZededefddZdS )	LazyMixinr;   c                 C      t )z
        Function to be implemented by every sub-class of this mixin.
        It's expected to create a sub-class instance out of an iterable of items
        that are held by the sub-class (e.g., ``CutSet.from_items(iterable_of_cuts)``).
        NotImplemented)rS   r;   r%   r%   r&   r   l  s   zLazyMixin.from_itemsr)   c                 C   r   )z
        Property to be implemented by every sub-class of this mixin.
        It can either be a regular Python dict holding manifests for eager mode,
        or a special iterator class for lazy mode.
        r   r   r%   r%   r&   r;   t  s   zLazyMixin.datac                 C   s   | j s| S t| }|| S )z
        Evaluates all lazy operations on this manifest, if any, and returns a copy
        that keeps all items in memory.
        If the manifest was "eager" already, this is a no-op and won't copy anything.
        )is_lazyr   r   r   r%   r%   r&   to_eager}  s   
zLazyMixin.to_eagerc                 C   s   t | jtttf S )zc
        Indicates whether this manifest was opened in lazy (read-on-the-fly) mode or not.
        )r   r;   r   rP   tupler   r%   r%   r&   r     s   zLazyMixin.is_lazyr   c                 C   s   ddl m} | ||S )a0  
        Read a JSONL manifest in a lazy manner, which opens the file but does not
        read it immediately. It is only suitable for sequential reads and iteration.

        .. warning:: Opening the manifest in this way might cause some methods that
            rely on random access to fail.
        r   )LazyManifestIterator)lhotse.lazyr   )rY   r   r   r%   r%   r&   from_jsonl_lazy  s   	zLazyMixin.from_jsonl_lazyN)r\   r]   r^   r   r   propertyr   r
   r   r	   r;   r   r   r   r_   r   r`   r   r%   r%   r%   r&   r   k  s    $r   c                 c   s,    t |}	 tt|| }|sdS |V  q)zXhttps://stackoverflow.com/questions/8991506/iterate-an-iterator-by-chunks-of-n-in-pythonTN)iterr   	itertoolsislice)niterableitchunkr%   r%   r&   grouper  s   r   extc                    s   t  fddt|jD S )Nc                 3   s    | ]} |kV  qd S r*   r%   )r|   sfxr   r%   r&   r~     r   z%extension_contains.<locals>.<genexpr>)anyr   suffixes)r   r   r%   r   r&   extension_contains  s   r   manifest_clsc           
   	   C   s   ddl m}m}m}m} td| rt| }|du rt|}ntd| r(t| }ntd| r2t	| }nt
d|  d}|durC|g}n||||g}|D ]}	z|	|}t|dkr\t W  n	 tyg   Y qKw |du rst
d|  |S )	z2Generic utility for reading an arbitrary manifest.r   )CutSet
FeatureSetRecordingSetSupervisionSet.jsonlN.json.yamlz-Not a valid manifest (does the path exist?): zUnknown type of manifest: )lhotser   r   r   r   r   rw   rP   ri   rL   r#   rW   lenRuntimeErrorr"   )
r   r   r   r   r   r   raw_datadata_set
candidatesmanifest_typer%   r%   r&   load_manifest  s6   





r   c                 C   sb   t d| st| dksJ tt| }zt|}W n
 ty#   Y dS w t|}t|}|| S )z{
    Generic utility for reading an arbitrary manifest from a JSONL file.
    Returns None when the manifest is empty.
    r   -N)	r   r   r   rw   nextStopIterationdeserialize_itemr   r   )r   r   firstrr   rY   r%   r%   r&   r     s   
r   c                 C   s*   t d| st| dkrt| S t| |dS )z
    Generic utility for reading an arbitrary manifest.
    If possible, opens the manifest lazily, otherwise reads everything into memory.
    r   r   r   )r   r   r   r   )r   r   r%   r%   r&   load_manifest_lazy_or_eager  s   r   c           	      C   sz   ddl m}m}m}m}m}m} ddlm}m	} t
| |r|S t
| |r&|S t
| |r-|S t
| |r4|S tdt|  )zLReturns the right *Set class for a manifest, e.g. Recording -> RecordingSet.r   )Featuresr   	Recordingr   SupervisionSegmentr   )Cutr   z8No corresponding 'Set' class is known for item of type: )r   r   r   r   r   r   r   
lhotse.cutr   r   r   NotALhotseManifestr   )	rr   r   r   r   r   r   r   r   r   r%   r%   r&   r     s    



r   c                   @   r   )r   Nr   r%   r%   r%   r&   r   	  r   r   r   c                 C   sb   t d|st|dkr| | d S t d|r| | d S t d|r*| | d S td| )Nr   r   r   r   z"Unknown serialization format for: )r   r   r   rk   rT   r#   )r   r   r%   r%   r&   store_manifest  s   

r   c                   @   s4   e Zd ZededefddZdeddfddZdS )Serializabler   r)   c                 C   s   t || dS )Nr   )r   )rY   r   r%   r%   r&   	from_file  s   zSerializable.from_fileNc                 C   s   t | | d S r*   )r   rR   r%   r%   r&   to_file     zSerializable.to_file)r\   r]   r^   r_   r   r`   r   r   r%   r%   r%   r&   r     s    r   c           
      C   s  ddl m}m}m}m}m}m} ddlm} ddl	m
} d| v r%|| S d| v s-d| v r1|| S d| v r:|| S d	| v rC|| S d
| vrL|| S | d
}	|	dkrZ|| S |	dkrc|| S |	dkrqtd || S |	dkrz|| S td|	 d)Nr   )r   ImageMonoCutMultiCutr   r   deserialize_array)MixedCutwidthshapearraysourcesnum_featuresr   r   r   r   a   Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. Please re-generate it with Lhotse v0.8 as it might stop working in a future version (using manifest.from_file() and then manifest.to_file() should be sufficient).r   z-Unexpected cut type during deserialization: '')r   r   r   r   r   r   r   lhotse.arrayr   r   r   	from_dictpopwarningswarnr#   )
r;   r   r   r   r   r   r   r   r   cut_typer%   r%   r&   r   !  s4    








r   c                    s   | du rdS ddl m} ddlm} ddlm} |  D ]5\} t trQt	 fdddD r8|
 | |< qd	 v rC|
 | |< z| | |< W q   Y qq| S )
a}  
    Handles deserialization of manifests inside the custom field dictionary
    (e.g. from :class:`~lhotse.SupervisionSegment` or :class:`~lhotse.MonoCut`).

    Mutates the input in-place, and returns it.

    :param data: the contents of ``manifest.custom`` field.
    :return: ``custom`` field dict with deserialized manifests (if any),
        or None when input is None.
    Nr   )r   r   )r   c                 3   s    | ]}| v V  qd S r*   r%   )r|   kvaluer%   r&   r~   [  r   z+deserialize_custom_field.<locals>.<genexpr>)r{   r   sampling_rater   )r   r   r   r   lhotse.audior   r   r   r   allr   )r;   r   r   r   keyr%   r  r&   deserialize_custom_fieldC  s$   
r  orjsonc                 C   s"   zt | W S    t|  Y S r*   )r	  loadsrf   )ru   r%   r%   r&   rt   k  s   rt   c                   @   s:   e Zd Zdd Zdd Zdd Zdd Zd	efd
dZdS )StdStreamWrapperc                 C   
   || _ d S r*   r=   )rS   r=   r%   r%   r&   r   v     
zStdStreamWrapper.__init__c                 C   r   r*   r%   r   r%   r%   r&   r   y  r   zStdStreamWrapper.closec                 C   s   | j S r*   r  r   r%   r%   r&   r   |     zStdStreamWrapper.__enter__c                 C   r   r*   r%   )rS   exc_typeexc_valexc_tbr%   r%   r&   r     r   zStdStreamWrapper.__exit__rr   c                 C   s   |dkr| j S t| j|S )Nr   )r   getattrr=   r   r%   r%   r&   __getattr__  s   zStdStreamWrapper.__getattr__N)	r\   r]   r^   r   r   r   r   r   r  r%   r%   r%   r&   r  u  s    r  c                       s   e Zd ZdZi Z fddZededd fddZde	d	efd
dZ
edefddZde	defddZde	defddZ  ZS )	IOBackendax  
    Base class for IO backends supported by Lhotse.
    An IO backend supports open() operations for reads and/or writes to file-like objects.
    Deriving classes are auto-registered under their class name, and auto-discoverable
    through functions:

    * :func:`~lhotse.serialization.available_io_backends`

    * :func:`~lhotse.serialization.get_current_io_backend`

    * :func:`~lhotse.serialization.set_current_io_backend`

    The default composite backend that tries to figure out the best solution
    can be obtained via :func:`~lhotse.serialization.get_default_io_backend`.

    New IO backends are expected to define the following methods:

    * `open(identifier: str, mode: str)` which returns a file-like object.
        Must be implemented.

    * `is_applicable(identifier: str) -> bool` returns `True` if a given
        backend can be used for a given identifier. True by default.

    * `is_available(identifier: str) -> bool` Class method. Only define it when
        the availability of the backend depends on some special actions,
        such as installing an option dependency.

    * `handles_special_case(identifier: str) -> bool` defined ONLY when
        a given IO Backend MUST be selected for a specific identifier.
        For example, only :class:`~lhotse.serialization.PipeIOBackend` handles
        piped commands like `"pipe:gunzip -c manifest.jsonl.gz"`.
    c                    s.   | j tjvr| tj| j < t jdi | d S )Nr%   )r\   r  KNOWN_BACKENDSsuper__init_subclass__)rY   r   	__class__r%   r&   r    s   zIOBackend.__init_subclass__namer)   c                 C   s$   || j vrtd| | j |  S )NzUnknown IO backend name: )r  r   )rY   r  r%   r%   r&   new  s   
zIOBackend.new
identifierr   c                 C   s   t  r*   )NotImplementedErrorrS   r  r   r%   r%   r&   r!     r  zIOBackend.openc                 C   r   NTr%   rY   r%   r%   r&   is_available  s   zIOBackend.is_availablec                 C   r   r   r%   rS   r  r%   r%   r&   handles_special_case  r   zIOBackend.handles_special_casec                 C   r   r   r%   r#  r%   r%   r&   is_applicable  r   zIOBackend.is_applicable)r\   r]   r^   r   r  r  r_   r   r  r   r!   r   r"  r$  r%  __classcell__r%   r%   r  r&   r    s    !r  c                   @   s4   e Zd ZdZdedefddZdedefddZd	S )
BuiltinIOBackendzCalls Python's built-in `open`.r  r   c                 C   s   t ||dS )N)r   )r!   r  r%   r%   r&   r!     r   zBuiltinIOBackend.openr)   c                 C   s
   t | S r*   r   r#  r%   r%   r&   r%    r  zBuiltinIOBackend.is_applicableN)	r\   r]   r^   r   r   r   r!   r   r%  r%   r%   r%   r&   r'    s    r'  c                   @   F   e Zd ZdZdedefddZdedefddZdedefd	d
Z	dS )RedirectIOBackendz"Opens a stream to stdin or stdout.r  r   c                 C   s4   |dkr	t tjS |dkrt tjS td| d)Nr   r<   z=Cannot open stream for '-' with mode other 'r' or 'w' (got: 'z'))r  sysstdinstdoutr#   r  r%   r%   r&   r!     s   


zRedirectIOBackend.openr)   c                 C   s   |dkS )Nr   r%   r#  r%   r%   r&   r$    r   z&RedirectIOBackend.handles_special_casec                 C   
   |  |S r*   r$  r#  r%   r%   r&   r%    r  zRedirectIOBackend.is_applicableN
r\   r]   r^   r   r   r   r!   r   r$  r%  r%   r%   r%   r&   r*    s
    	r*  c                   @   r)  )PipeIOBackendzJExecutes the provided command / pipe and wraps it into a file-like object.r  r   c                 C   s   t t|dd |dddS )z
        Runs the command and redirects stdin/stdout depending on the mode.
        Returns a file-like object that can be read from or written to.
        r/   NTi  )r   shellbufsize)r   r   r  r%   r%   r&   r!     s   zPipeIOBackend.openr)   c                 C      t |dS )Nzpipe:r   
startswithr#  r%   r%   r&   r$    r   z"PipeIOBackend.handles_special_casec                 C   r.  r*   r/  r#  r%   r%   r&   r%    r  zPipeIOBackend.is_applicableNr0  r%   r%   r%   r&   r1    s
    r1  c                   @   r)  )GzipIOBackendz-Uses gzip.open to automatically (de)compress.r  r   c                 C   s"   d|vrd|vr|d }t ||S )Ntbr   r  r%   r%   r&   r!     s   
zGzipIOBackend.openr)   c                 C   s   t |}|dot| S )N.gz)r   endswithr   r#  r%   r%   r&   r$    s   z"GzipIOBackend.handles_special_casec                 C   r.  r*   r/  r#  r%   r%   r&   r%     r  zGzipIOBackend.is_applicableNr0  r%   r%   r%   r&   r7    s
    r7  c                   @   s4   e Zd ZdZdedefddZedefddZ	d	S )
SmartOpenIOBackendzQUses `smart_open` library (if installed) to auto-determine how to handle the URI.r  r   c                 C   s   t ||S r*   )r   r!   r  r%   r%   r&   r!     r   zSmartOpenIOBackend.openr)   c                 C      t dS )N
smart_openr   r!  r%   r%   r&   r"  
     zSmartOpenIOBackend.is_availableN)
r\   r]   r^   r   r   r   r!   r_   r   r"  r%   r%   r%   r&   r<    s
    r<  c                   @   sX   e Zd ZdZdedefddZedefddZde	defd	d
Z
de	defddZdS )AIStoreIOBackendz
    Uses `aistore` client (if installed and enabled via AIS_ENDPOINT env var)
    to download data from AIStore if the identifier is a URL/URI.
    r  r   c                 C   s   t  \}}||}d|v r<z| }W n ty!   | }Y nw |tdkr-| }n| }|dr:t	|}|S d|v rS|tdksMJ d| |
  S d S )Nr   z1.9.1r:  r<   z1.10.0z_Writing to AIStore requires at least version 1.10.0 of AIStore Python SDK, but your version is )r:   fetch_object_by_url
get_readerrB   getr5   as_filerawr;  r   
get_writer)rS   r  r   clientr9   objectrequestfileobjr%   r%   r&   r!     s,   



zAIStoreIOBackend.openr)   c                 C   s    t dottjv ottjt S )Nr.   )r   r$   r+   r,   r   r!  r%   r%   r&   r"  .  s
   zAIStoreIOBackend.is_availablec                 C   r4  )Nzais://r5  r#  r%   r%   r&   r$  6  r   z%AIStoreIOBackend.handles_special_casec                 C   s   t |S r*   r(  r#  r%   r%   r&   r%  9  r   zAIStoreIOBackend.is_applicableN)r\   r]   r^   r   r   r!   r_   r   r"  r   r$  r%  r%   r%   r%   r&   rA    s    rA  c                   C      t dd S )NLHOTSE_MSC_OVERRIDE_PROTOCOLSr+   getenvr%   r%   r%   r&   !get_lhotse_msc_override_protocols=  r   rP  c                   C   rL  )NLHOTSE_MSC_PROFILErN  r%   r%   r%   r&   get_lhotse_msc_profileA  r   rR  c                  C   s   t dd} |  dkS )zU
    If set to True, the MSC backend will be forced to be used for regular URLs.
    LHOTSE_MSC_BACKEND_FORCEDFalsetrue)r+   rO  lower)valr%   r%   r&   get_lhotse_msc_backend_forcedE  s   rX  mscc                   @   sn   e Zd ZdZdedefddZedefddZde	defd	d
Z
de	defddZededefddZdS )MSCIOBackenda_  
    Uses Multi-Storage Client to download data from object store.

    Multi-Storage Client (MSC) is a Python library aims at providing a unified interface to object and file
    storage backends, including S3, GCS, AIStore, and more.  With no code change, user can seamlessly switch
    between different storage backends with corresponding MSC urls.

    To use MSCIOBackend, user will need

    1)
    MSC config file that specifies the storage backend information. Please refer to the MSC documentation
    for more details: https://nvidia.github.io/multi-storage-client/user_guide/quickstart.html#configuration

    2)
    Provide MSC URLs, OR
    Set env `LHOTSE_MSC_BACKEND_FORCED` to True to force the use of MSC backend for regular URLs.

    To learn more about MSC, please check out the GitHub repo: https://github.com/NVIDIA/multi-storage-client
    r  r   c           
   
   C   s   t dstdddl}t|r|||S t }|r;d|v r&|d}n|g}|D ]}||r:|	|t
} nq+t }|rEt||}z	|||}W |S  tyf }	 ztd|	 d|  |	d}	~	ww )ao  
        Convert identifier if is not prefixed with msc, and use msc.open to access the file
        For paths that are prefixed with msc, e.g. msc://profile/path/to/my/object1

        For paths are yet to migrate to msc-compatible url, e.g. protocol://bucket/path/to/my/object2
        1. override protocols provided by env LHOTSE_MSC_OVERRIDE_PROTOCOLS to msc: msc://bucket/path/to/my/object2
        2. override the profile/bucket name by env LHOTSE_MSC_PROFILE if provided: msc://profile/path/to/my/object2,
        if bucket name is not provided, then we expect the msc profile name to match with bucket name
        multistorageclientzIPlease run 'pip install multistorageclient' in order to use MSCIOBackend.r   N,zexception: z, identifier: )r   r   r[  rZ  
is_msc_urlr!   rP  splitr6  replace
MSC_PREFIXrR  r   r"   rp   )
rS   r  r   rY  lhotse_msc_override_protocolsoverride_protocol_listoverride_protocollhotse_msc_profilero   er%   r%   r&   r!   e  s<   


zMSCIOBackend.openr)   c                 C   r=  Nr[  r?  r!  r%   r%   r&   r"    r@  zMSCIOBackend.is_availablec                 C   s
   t |S r*   )rZ  r]  r#  r%   r%   r&   r$    r  z!MSCIOBackend.handles_special_casec                 C   s    t dot|pt ot|S rf  )r   rZ  r]  rX  r   r#  r%   r%   r&   r%    s   
zMSCIOBackend.is_applicablec                 C   s   t | t dS )Nz://)r   r6  r`  r  r%   r%   r&   r]    s   zMSCIOBackend.is_msc_urlN)r\   r]   r^   r   r   r!   r_   r   r"  r   r$  r%  staticmethodr	   r]  r%   r%   r%   r&   rZ  P  s    0rZ  c                   @   sX   e Zd ZdZdee fddZdedefddZ	ded	e
fd
dZded	e
fddZdS )CompositeIOBackenda  
    Composes multiple IO backends together.
    Uses `handles_special_case` and `is_applicable` of sub-backends to auto-detect
    which backend to select.

    In case of `handles_special_case`, if multiple backends could have worked,
    we'll use the first one in the list.
    backendsc                 C   r  r*   rj  )rS   rj  r%   r%   r&   r     r  zCompositeIOBackend.__init__r  r   c                 C   s\   | j D ]}||r|||  S q| j D ]}||r%|||  S qtd| d)Nz.Couldn't find a suitable IOBackend for input 'r   )rj  r$  r!   r%  r   )rS   r  r   r9  r%   r%   r&   r!     s   




zCompositeIOBackend.openr)   c                       t  fdd| jD S )Nc                 3       | ]}|  V  qd S r*   r/  r|   r9  rg  r%   r&   r~         z:CompositeIOBackend.handles_special_case.<locals>.<genexpr>r   rj  r#  r%   rg  r&   r$       z'CompositeIOBackend.handles_special_casec                    rl  )Nc                 3   rm  r*   )r%  rn  rg  r%   r&   r~     ro  z3CompositeIOBackend.is_applicable.<locals>.<genexpr>rp  r#  r%   rg  r&   r%    rq  z CompositeIOBackend.is_applicableN)r\   r]   r^   r   r   r  r   r   r   r!   r   r$  r%  r%   r%   r%   r&   ri    s    	ri  CURRENT_IO_BACKENDc                   C   s   dgt dd tjD  S )zO
    Return a list of names of available IO backends, including "default".
    defaultc                 s   s"    | ]}t j|  r|V  qd S r*   )r  r  r"  rn  r%   r%   r&   r~     s    
z(available_io_backends.<locals>.<genexpr>)sortedr  r  r%   r%   r%   r&   available_io_backends  s   
ru  backend)r  NNc                 c   s"    t  }t| }|V  t| dS )a"  
    Context manager that sets Lhotse's IO backend to the specified value
    and restores the previous IO backend at the end of its scope.

    Example::

        >>> with io_backend("AIStoreIOBackend"):
        ...     cuts = CutSet.from_file(...)  # forced open() via AIStore client
    N)r    set_current_io_backend)rv  previousr9  r%   r%   r&   
io_backend  s
   ry  c                  C   s0   t durt S tjd} | durt| S tdS )zC
    Return the backend currently set by the user, or default.
    NLHOTSE_IO_BACKENDrs  )rr  r+   r,   rD  rw  )maybe_backendr%   r%   r&   r      s   r    c                 C   sb   | dkrt  } | atS t| trt| } | atS t| tr!|  } t| ts-J d|  | atS )a  
    Force Lhotse to use a specific IO backend to open every path/URL/URI,
    overriding the default behaviour of "educated guessing".

    Example forcing Lhotse to use ``aistore`` library for every IO open() operation::

        >>> set_current_io_backend(AIStoreIOBackend())
    rs  z Expected str or IOBackend, got: )get_default_io_backendr   r   r  r  r   rr  )rv  r%   r%   r&   rw    s"   
	


rw     )maxsizec                  C   sd   t  t t g} t r| t  t r| t  t r&| t  | t t	 g7 } t
| S )a+  
    Return a composite backend that auto-infers which internal backend can support reading
    from a given identifier.

    It first looks for special cases that need very specific handling
    (such as: stdin/stdout redirects, pipes)
    and tries to match them against relevant IO backends.
    )r*  r1  TarAsDirBackendrZ  r"  r   rA  r<  r7  r'  ri  rk  r%   r%   r&   r|    s   r|  c                   @   s:   e Zd ZdddZdedefddZdedefdd	Zd
S )r  r   c                 K   s   ddl }ddl}t|}dD ]Y}|d |v rg||d d}t|dkrg|d | }	|d }
tj|	r_z||	d}|	|
W   S  |j
tfy^ } ztd|
 d	|	 d
| d}~ww td|	 dq|j||fi |S )z
        Enhanced open() function that supports accessing files inside tar archives
        using the syntax: /path/to/archive.tar/internal/file.txt
        r   Nz.tarz.tar.gzz.tar.bz2z.tar.xz/r}  rc   r   zFile 'z' not found in 'z': z
Tar file 'z' not found)builtinstarfiler   r^  r   r+   r   r   r!   extractfileTarErrorKeyErrorFileNotFoundError)rS   ro   r   r   r  r  	file_pathtar_extpartstar_pathinternal_pathtarre  r%   r%   r&   r!   2  s,   zTarAsDirBackend.openr  r)   c                 C   s(   t |}dD ]}|d |v r dS qdS )Nr  r  TF)r   )rS   r  r  r%   r%   r&   r$  W  s   z$TarAsDirBackend.handles_special_casec                 C   r.  r*   r/  r#  r%   r%   r&   r%  ^  r  zTarAsDirBackend.is_applicableNr   )r\   r]   r^   r!   r   r   r$  r%  r%   r%   r%   r&   r  1  s    
%r  r  r*   )r)   r  )cr   rf   r+   r+  r   codecsr   r   
contextlibr   	functoolsr   ior   r   pathlibr   typingr	   r
   r   r   r   r   r   r   r?   packaging.versionr   r5   lhotse.utilsr   r   r   r   r   r   lhotse.workaroundsr   r`   r   r'   r$   r   r-   r:   rF   r   rL   rN   rg   rP   ri   rj   rs   rw   rx   r#   r   r   r   r   r   r   r   r   r   r   r"   r   r   r   r   r  r	  rt   r
  r  r  r'  r*  r1  r7  r<  rA  rP  rR  rX  r`  rZ  ri  rr  __annotations__ru  ry  r    rw  r|  r  r%   r%   r%   r&   <module>   s   
 ( 
		
"
$	r%72
(
	"
%
=
.W! 
