o
    Si                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZmZmZmZ d dlZd dlZd d	lmZmZ d d
lmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z% G dd dedZ&G dd dedZ'i Z(i Z)dee* fddZ+dd Z,dd Z-de*dee' fddZ.de*dee& fddZ/G dd dZ0	 e,G d d! d!e'Z1e-G d"d# d#e&Z2	 e,G d$d% d%e'Z3e-G d&d' d'e&Z4	 d(d) Z5edd*d+e*fd,d-Z6edd*de7fd.d/Z8dad0d1Z9e,G d2d3 d3e'Z:e-G d4d5 d5e&Z;	 e,G d6d7 d7e'Z<e-G d8d9 d9e&Z=	 d:Z>e,G d;d< d<e'Z?e-G d=d> d>e&Z@	 d?ZAe,G d@dA dAe'ZBe-G dBdC dCe&ZC	 e,G dDdE dEe'ZDe-G dFdG dGe&ZE	 dHdI ZFedd*d+e*fdJdKZGe,G dLdM dMe'ZHe-G dNdO dOe&ZI	 dPe*deJfdQdRZKde*fdSdTZLe,G dUdV dVe'ZMe-G dWdX dXe&ZNe,G dYdZ dZe'ZOe-G d[d\ d\e&ZPe,G d]d^ d^e'ZQe,G d_d` d`e'ZRdS )b    N)ABCMetaabstractmethod)contextmanager)	lru_cache)BytesIO)ceilfloor)Path)	GeneratorListOptionalTypeUnion)ArrayTemporalArray)dynamic_lru_cache)	open_best)PathlikeSeconds	SmartOpenis_module_availableis_valid_urlpairwisec                   @   s   e Zd ZdZeedefddZeedefddZedede	j
defd	d
Z			ddede	j
dee dee dedeeef fddZdd Zdd ZdS )FeaturesWritera}  
    ``FeaturesWriter`` defines the interface of how to store numpy arrays in a particular storage backend.
    This backend could either be:

    - separate files on a local filesystem;
    - a single file with multiple arrays;
    - cloud storage;
    - etc.

    Each class inheriting from ``FeaturesWriter`` must define:

    - the ``write()`` method, which defines the storing operation
        (accepts a ``key`` used to place the ``value`` array in the storage);
    - the ``storage_path()`` property, which is either a common directory for the files,
        the name of the file storing multiple arrays, name of the cloud bucket, etc.
    - the ``name()`` property that is unique to this particular storage mechanism -
        it is stored in the features manifests (metadata) and used to automatically deduce
        the backend when loading the features.

    Each :class:`.FeaturesWriter` can also be used as a context manager, as some implementations
    might need to free a resource after the writing is finalized. By default nothing happens
    in the context manager functions, and this can be modified by the inheriting subclasses.

    Example::

        >>> with MyWriter('some/path') as storage:
        ...     extractor.extract_from_recording_and_store(recording, storage)

    The features loading must be defined separately in a class inheriting from :class:`FeaturesReader`.
    returnc                 C      d S N selfr   r   F/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/features/io.pyname:      zFeaturesWriter.namec                 C   r   r   r   r   r   r   r    storage_path?   r"   zFeaturesWriter.storage_pathkeyvaluec                 C   r   r   r   r   r$   r%   r   r   r    writeD      zFeaturesWriter.writeNr   frame_shifttemporal_dimstartc           	      C   s~   |duo|du}|s t dd ||fD s J d| d| d| ||}t| j| j|t|jd}|s7|S t||||dS )	aD  
        Store a numpy array in the underlying storage and return a manifest
        describing how to retrieve the data.

        If the array contains a temporal dimension (e.g. it represents the
        frame-level features, alignment, posteriors, etc. of an utterance)
        then ``temporal_dim`` and ``frame_shift`` may be specified to enable
        downstream padding, truncating, and partial reads of the array.

        :param key: An ID that uniquely identifies the array.
        :param value: The array to be stored.
        :param frame_shift: Optional float, when the array has a temporal dimension
            it indicates how much time has passed between the starts of consecutive frames
            (expressed in seconds).
        :param temporal_dim: Optional int, when the array has a temporal dimension,
            it indicates which dim to interpret as temporal.
        :param start: Float, when the array is temporal, it indicates what is the offset
            of the array w.r.t. the start of recording. Useful for reading subsets
            of an array when it represents something computed from long recordings.
            Ignored for non-temporal arrays.
        :return: A manifest of type :class:`~lhotse.array.Array` or
            :class:`~lhotse.array.TemporalArray`, depending on the input arguments.
        Nc                 s   s    | ]}|d u V  qd S r   r   ).0argr   r   r    	<genexpr>i   s    z-FeaturesWriter.store_array.<locals>.<genexpr>zOframe_shift and temporal_dim have to be both None or both set (got frame_shift=z, temporal_dim=z).)storage_typer#   storage_keyshape)arrayr*   r)   r+   )allr'   r   r!   r#   listr1   r   )	r   r$   r%   r)   r*   r+   is_temporalr0   r2   r   r   r    store_arrayH   s0   zFeaturesWriter.store_arrayc                 C      | S r   r   r   r   r   r    	__enter__      zFeaturesWriter.__enter__c                 O   r   r   r   r   argskwargsr   r   r    __exit__   r9   zFeaturesWriter.__exit__)NNr   )__name__
__module____qualname____doc__propertyr   strr!   r#   npndarrayr'   r   r   intr   r   r   r6   r8   r=   r   r   r   r    r      s8    

8r   )	metaclassc                
   @   sP   e Zd ZdZeedefddZe		ddeded	e	e de
jfd
dZdS )FeaturesReadera  
    ``FeaturesReader`` defines the interface of how to load numpy arrays from a particular storage backend.
    This backend could either be:

    - separate files on a local filesystem;
    - a single file with multiple arrays;
    - cloud storage;
    - etc.

    Each class inheriting from ``FeaturesReader`` must define:

    - the ``read()`` method, which defines the loading operation
        (accepts the ``key`` to locate the array in the storage and return it).
        The read method should support selecting only a subset of the feature matrix,
        with the bounds expressed as arguments ``left_offset_frames`` and ``right_offset_frames``.
        It's up to the Reader implementation to load only the required part or trim it to that
        range only after loading. It is assumed that the time dimension is always the first one.
    - the ``name()`` property that is unique to this particular storage mechanism -
        it is stored in the features manifests (metadata) and used to automatically deduce
        the backend when loading the features.

    The features writing must be defined separately in a class inheriting from ``FeaturesWriter``.
    r   c                 C   r   r   r   r   r   r   r    r!      r"   zFeaturesReader.namer   Nr$   left_offset_framesright_offset_framesc                 C   r   r   r   r   r$   rI   rJ   r   r   r    read   s   zFeaturesReader.readr   N)r>   r?   r@   rA   rB   r   rC   r!   rF   r   rD   rE   rL   r   r   r   r    rH      s"    rH   r   c                   C   s   t tttS r   )sortedsetREADER_BACKENDSintersectionWRITER_BACKENDSr   r   r   r    available_storage_backends   s   rS   c                 C      | t | j< | S )z
    Decorator used to add a new ``FeaturesReader`` to Lhotse's registry.

    Example::

        @register_reader
        class MyFeatureReader(FeatureReader):
            ...
    )rP   r!   clsr   r   r    register_reader      

rW   c                 C   rT   )z
    Decorator used to add a new ``FeaturesWriter`` to Lhotse's registry.

    Example::

        @register_writer
        class MyFeatureWriter(FeatureWriter):
            ...
    )rR   r!   rU   r   r   r    register_writer   rX   rY   r!   c                 C   
   t | S )z
    Find a ``FeaturesReader`` sub-class that corresponds to the provided ``name`` and return its type.

    Example:

        reader_type = get_reader("lilcom_files")
        reader = reader_type("/storage/features/")
    )rP   getr!   r   r   r    
get_reader      
	r]   c                 C   rZ   )z
    Find a ``FeaturesWriter`` sub-class that corresponds to the provided ``name`` and return its type.

    Example:

        writer_type = get_writer("lilcom_files")
        writer = writer_type("/storage/features/")
    )rR   r[   r\   r   r   r    
get_writer   r^   r_   c                       sR   e Zd ZdZdef fddZe	ddededed	e	e
d
d
f fddZ  ZS )FileIOaG  
    Helper util for opening a file object for reading or writing in a directory on the local filesystem,
    or a URL to supported object store (S3, AIStore, etc.).
    ``storage_path`` corresponds to the directory path or base URL prefix;
    ``storage_key`` for each utterance is the name of the file in that directory.
    r#   c                    sL   t    t|| _t|| _| jr"| jdr$| jd d | _d S d S d S )N/)super__init__rC   r#   r   is_urlendswith)r   r#   	__class__r   r    rd      s   


zFileIO.__init__Fr$   mode
add_subdirr   Nc           	      c   sZ   d|v rd|v rJ dd|v rH| dr#t| jdkr#|dd }| j d| }t|d}||fV  W d   dS 1 sAw   Y  dS d|v r| jrc| drZ|dd }| j d| }n%t| j}|jd	d	d
 |r||dd  }|jd	d || }n|| }t|d}||fV  W d   dS 1 sw   Y  dS td| d)aA  
        Open a file for reading or writing on local disk or URL to object store.
        Arg "key" should contain the extension for the file.
        Mode is either "r" or "w".
        Arg "add_subdir" can be set to True, in which case on the local filesystem it will create
            an extra subdirectory of ``self.storage_path`` with the first three letters of ``key``,
            preventing big datasets from exhausting the filesystem with one big directory.
            This arg is ignored for URLs.

        Yields a tuple of (open_file_object, path_or_url).
        rwz6Opening for both reading and writing is not supported.ra   r      NrbT)exist_okparents   )ro   wbz)Unsupported file mode (missing r or w): '')
startswithlenr#   r   re   r	   mkdir
ValueError)	r   r$   ri   rj   
input_pathfoutput_pathpsubdirr   r   r    open_fileobj   s6   "


"zFileIO.open_fileobj)F)r>   r?   r@   rA   r   rd   r   rC   boolr
   tupler}   __classcell__r   r   rg   r    r`      s    r`   c                
       T   e Zd ZdZdZdef fddZe		dded	e	d
e
e	 dejfddZ  ZS )LilcomFilesReadera  
    Reads Lilcom-compressed files from a directory on the local filesystem,
    or a URL to supported object store (S3, AIStore, etc.).
    ``storage_path`` corresponds to the directory path;
    ``storage_key`` for each utterance is the name of the file in that directory.
    lilcom_filesr#   c                       t    t|| _d S r   rc   rd   r`   ior   r#   r;   r<   rg   r   r    rd   7     
zLilcomFilesReader.__init__r   Nr$   rI   rJ   r   c                 C   sN   | j j|dd\}}t| }W d    n1 sw   Y  ||| S )Nrk   ri   )r   r}   lilcom
decompressrL   r   r$   rI   rJ   ry   rx   arrr   r   r    rL   ;     zLilcomFilesReader.readrM   r>   r?   r@   rA   r!   r   rd   r   rC   rF   r   rD   rE   rL   r   r   r   rg   r    r   ,       r   c                       sZ   e Zd ZdZdZddedef fddZede	fd	d
Z
de	dejde	fddZ  ZS )LilcomFilesWritera  
    Writes Lilcom-compressed files to a directory on the local filesystem,
    or a URL to supported object store (S3, AIStore, etc.).
    ``storage_path`` corresponds to the directory path;
    ``storage_key`` for each utterance is the name of the file in that directory.
    r   r#   
tick_powerc                    s   t    t|| _|| _d S r   )rc   rd   r`   r   r   )r   r#   r   r;   r<   rg   r   r    rd   R  s   


zLilcomFilesWriter.__init__r   c                 C      | j jS r   r   r#   r   r   r   r    r#   W     zLilcomFilesWriter.storage_pathr$   r%   c                 C   s   | ds	|d }tj|| jd}| jj|ddd(\}}|| | jjs:dt	|j
dd  }W d    |S W d    |S 1 sEw   Y  |S )Nz.llcr   rl   Trj   ra   )rf   r   compressr   r   r}   r'   re   joinr	   parts)r   r$   r%   serialized_featsry   rz   r   r   r    r'   [  s   



zLilcomFilesWriter.write)r   )r>   r?   r@   rA   r!   r   rF   rd   rB   rC   r#   rD   rE   r'   r   r   r   rg   r    r   G  s     r   c                
       r   )NumpyFilesReadera$  
    Reads non-compressed numpy arrays from files in a directory on the local filesystem,
    or a URL to supported object store (S3, AIStore, etc.).
    ``storage_path`` corresponds to the directory path;
    ``storage_key`` for each utterance is the name of the file in that directory.
    numpy_filesr#   c                    r   r   r   r   rg   r   r    rd   v  r   zNumpyFilesReader.__init__r   Nr$   rI   rJ   r   c                 C   sN   | j j|dd\}}tj|dd}W d    n1 sw   Y  ||| S )Nrk   r   Fallow_pickle)r   r}   rD   loadr   r   r   r    rL   z  r   zNumpyFilesReader.readrM   r   r   r   rg   r    r   k  r   r   c                       sT   e Zd ZdZdZdef fddZedefddZ	d	ed
e
jdefddZ  ZS )NumpyFilesWritera#  
    Writes non-compressed numpy arrays to files in a directory on the local filesystem,
    or a URL to supported object store (S3, AIStore, etc.).
    ``storage_path`` corresponds to the directory path;
    ``storage_key`` for each utterance is the name of the file in that directory.
    r   r#   c                    r   r   r   r   rg   r   r    rd     r   zNumpyFilesWriter.__init__r   c                 C   r   r   r   r   r   r   r    r#     r   zNumpyFilesWriter.storage_pathr$   r%   c                 C   s   | ds	|d }| jj|ddd+\}}tj||dd | jjs5dt|jdd  }W d    |S W d    |S 1 s@w   Y  |S )	Nz.npyrl   Tr   Fr   ra   r   )	rf   r   r}   rD   savere   r   r	   r   )r   r$   r%   ry   rz   r   r   r    r'     s   


zNumpyFilesWriter.write)r>   r?   r@   rA   r!   r   rd   rB   rC   r#   rD   rE   r'   r   r   r   rg   r    r     s     r   c                   C      t dstdd S )Nh5pyzETo read and write HDF5 file formats, please 'pip install h5py' first.r   rw   r   r   r   r    check_h5py_installed  
   r   )maxsizer#   c                 C   s   t   ddl}|| dS )aq  
    Helper internal function used in HDF5 readers.
    It opens the HDF files and keeps their handles open in a global program cache
    to avoid excessive amount of syscalls when the Reader class is instantiated
    and destroyed in a loop repeatedly (frequent use-case).

    The file handles can be freed at any time by calling ``close_cached_file_handles()``.
    r   Nrk   )r   r   File)r#   r   r   r   r    lookup_cache_or_open  s   
r   c                 C   s   | t  d S )z
    Helper internal function to retrieve the chunk size from an HDF5 file.
    Helps avoid unnecessary repeated disk reads.
    r   )CHUNK_SIZE_KEY)h5_file_handler   r   r    lookup_chunk_size  s   r   c                   C   s   t   t  t  dS )z
    Closes the cached file handles in ``lookup_cache_or_open`` and
    ``lookup_reader_cache_or_open`` (see respective docs for more details).
    N)r   cache_clearr   lookup_reader_cache_or_openr   r   r   r    close_cached_file_handles  s   r   c                
       r   )NumpyHdf5Readera[  
    Reads non-compressed numpy arrays from a HDF5 file with a "flat" layout.
    Each array is stored as a separate HDF ``Dataset`` because their shapes (numbers of frames) may vary.
    ``storage_path`` corresponds to the HDF5 file path;
    ``storage_key`` for each utterance is the key corresponding to the array (i.e. HDF5 "Group" name).
    
numpy_hdf5r#   c                    r   r   rc   rd   r   hdfr   rg   r   r    rd     r   zNumpyHdf5Reader.__init__r   Nr$   rI   rJ   r   c                 C   s   | j | || S r   )r   rK   r   r   r    rL     s   	zNumpyHdf5Reader.readrM   r   r   r   rg   r    r     r   r   c                       st   e Zd ZdZdZddedef fddZedefd	d
Z	dede
jdefddZdddZdd Zdd Z  ZS )NumpyHdf5Writera
  
    Writes non-compressed numpy arrays to a HDF5 file with a "flat" layout.
    Each array is stored as a separate HDF ``Dataset`` because their shapes (numbers of frames) may vary.
    ``storage_path`` corresponds to the HDF5 file path;
    ``storage_key`` for each utterance is the key corresponding to the array (i.e. HDF5 "Group" name).

    Internally, this class opens the file lazily so that this object can be passed between processes
    without issues. This simplifies the parallel feature extraction code.
    r   rl   r#   ri   c                    sV   t    t  ddl}t|}||jdkr|jd nd| _|j| j	|d| _
dS )a  
        :param storage_path: Path under which we'll create the HDF5 file.
            We will add a ``.h5`` suffix if it is not already in ``storage_path``.
        :param mode: Modes supported by h5py:
            w        Create file, truncate if exists (default)
            w- or x  Create file, fail if exists
            a        Read/write if exists, create otherwise
        r   N.h5r   )rc   rd   r   r   r	   with_suffixsuffixstorage_path_r   r#   r   )r   r#   ri   r;   r<   r   r{   rg   r   r    rd     s   
	zNumpyHdf5Writer.__init__r   c                 C   
   t | jS r   rC   r   r   r   r   r    r#        
zNumpyHdf5Writer.storage_pathr$   r%   c                 C   s   | j j||d |S )Ndata)r   create_datasetr&   r   r   r    r'     s   zNumpyHdf5Writer.writeNc                 C   
   | j  S r   r   closer   r   r   r    r        
zNumpyHdf5Writer.closec                 C   r7   r   r   r   r   r   r    r8     r9   zNumpyHdf5Writer.__enter__c                 C      |    d S r   r   r   exc_typeexc_valexc_tbr   r   r    r=        zNumpyHdf5Writer.__exit__)rl   r   N)r>   r?   r@   rA   r!   r   rC   rd   rB   r#   rD   rE   r'   r   r8   r=   r   r   r   rg   r    r     s    

r   c                
       r   )LilcomHdf5Readera^  
    Reads lilcom-compressed numpy arrays from a HDF5 file with a "flat" layout.
    Each array is stored as a separate HDF ``Dataset`` because their shapes (numbers of frames) may vary.
    ``storage_path`` corresponds to the HDF5 file path;
    ``storage_key`` for each utterance is the key corresponding to the array (i.e. HDF5 "Group" name).
    lilcom_hdf5r#   c                    r   r   r   r   rg   r   r    rd   0  r   zLilcomHdf5Reader.__init__r   Nr$   rI   rJ   r   c                 C   s$   t | j| d  }||| S )Nr   )r   r   r   tobytesr   r$   rI   rJ   r   r   r   r    rL   4  s   zLilcomHdf5Reader.readrM   r   r   r   rg   r    r   %  r   r   c                       s|   e Zd ZdZdZ		ddededef fdd	Ze	d
efddZ
dedejd
efddZdddZdd Zdd Z  ZS )LilcomHdf5Writera]  
    Writes lilcom-compressed numpy arrays to a HDF5 file with a "flat" layout.
    Each array is stored as a separate HDF ``Dataset`` because their shapes (numbers of frames) may vary.
    ``storage_path`` corresponds to the HDF5 file path;
    ``storage_key`` for each utterance is the key corresponding to the array (i.e. HDF5 "Group" name).
    r   r   rl   r#   r   ri   c                    s\   t    t  ddl}t|}||jdkr|jd nd| _|j| j	|d| _
|| _dS )a  
        :param storage_path: Path under which we'll create the HDF5 file.
            We will add a ``.h5`` suffix if it is not already in ``storage_path``.
        :param tick_power: Determines the lilcom compression accuracy;
            the input will be compressed to integer multiples of 2^tick_power.
        :param mode: Modes supported by h5py:
            w        Create file, truncate if exists (default)
            w- or x  Create file, fail if exists
            a        Read/write if exists, create otherwise
        r   Nr   r   )rc   rd   r   r   r	   r   r   r   r   r#   r   r   )r   r#   r   ri   r;   r<   r   r{   rg   r   r    rd   N  s   

zLilcomHdf5Writer.__init__r   c                 C   r   r   r   r   r   r   r    r#   k  r   zLilcomHdf5Writer.storage_pathr$   r%   c                 C   s*   t j|| jd}| jj|t|d |S )Nr   r   )r   r   r   r   r   rD   void)r   r$   r%   r   r   r   r    r'   o  s   zLilcomHdf5Writer.writeNc                 C   r   r   r   r   r   r   r    r   t  r   zLilcomHdf5Writer.closec                 C   r7   r   r   r   r   r   r    r8   w  r9   zLilcomHdf5Writer.__enter__c                 C   r   r   r   r   r   r   r    r=   z  r   zLilcomHdf5Writer.__exit__)r   rl   r   r>   r?   r@   rA   r!   r   rF   rC   rd   rB   r#   rD   rE   r'   r   r8   r=   r   r   r   rg   r    r   C  s$    
r   __LHOTSE_INTERNAL_CHUNK_SIZE__c                
       r   )ChunkedLilcomHdf5Readera  
    Reads lilcom-compressed numpy arrays from a HDF5 file with chunked lilcom storage.
    Each feature matrix is stored in an array of chunks - binary data compressed with lilcom.
    Upon reading, we check how many chunks need to be retrieved to avoid excessive I/O.

    ``storage_path`` corresponds to the HDF5 file path;
    ``storage_key`` for each utterance is the key corresponding to the array (i.e. HDF5 "Group" name).
    chunked_lilcom_hdf5r#   c                    r   r   r   r   rg   r   r    rd     r   z ChunkedLilcomHdf5Reader.__init__r   Nr$   rI   rJ   r   c                 C   s   t | j}t|| }|d urt|| }nd }dd | j| || D }|r0tj|dd}ntg }|| }	||	 }
|d urF||	 }nd }||
| S )Nc                 S   s   g | ]	}t | qS r   )r   r   r   r,   r   r   r   r    
<listcomp>  s    z0ChunkedLilcomHdf5Reader.read.<locals>.<listcomp>r   axis)r   r   r   r   rD   concatenater2   )r   r$   rI   rJ   
chunk_sizeleft_chunk_idxright_chunk_idxdecompressed_chunksr   shift_framesleft_offset_shiftright_offset_shiftr   r   r    rL     s"   


zChunkedLilcomHdf5Reader.readrM   r   r   r   rg   r    r     s     	r   c                	       s   e Zd ZdZdZ			ddededed	ef fd
dZe	defddZ
dedejdefddZdddZdd Zdd Z  ZS )ChunkedLilcomHdf5Writera  
    Writes lilcom-compressed numpy arrays to a HDF5 file with chunked lilcom storage.
    Each feature matrix is stored in an array of chunks - binary data compressed with lilcom.
    Upon reading, we check how many chunks need to be retrieved to avoid excessive I/O.

    ``storage_path`` corresponds to the HDF5 file path;
    ``storage_key`` for each utterance is the key corresponding to the array (i.e. HDF5 "Group" name).
    r   r   d   rl   r#   r   r   ri   c           
         s   t    t  ddl}t|}||jdkr|jd nd| _|| _|| _	|j
| j|d| _t| jv rM| jt d }	|	tksKJ d| j	 d|	 ddS | jjt| j	d	 dS )
a  
        :param storage_path: Path under which we'll create the HDF5 file.
            We will add a ``.h5`` suffix if it is not already in ``storage_path``.
        :param tick_power: Determines the lilcom compression accuracy;
            the input will be compressed to integer multiples of 2^tick_power.
        :param chunk_size: How many frames to store per chunk.
            Too low a number will require many reads for long feature matrices,
            too high a number will require to read more redundant data.
        :param mode: Modes supported by h5py:
            w        Create file, truncate if exists (default)
            w- or x  Create file, fail if exists
            a        Read/write if exists, create otherwise
        r   Nr   r   r   z*Error: attempted to write with chunk size z2 to an h5py file that was created with chunk size .r   )rc   rd   r   r   r	   r   r   r   r   r   r   r#   r   r   r   )
r   r#   r   r   ri   r;   r<   r   r{   retrieved_chunk_sizerg   r   r    rd     s$   



z ChunkedLilcomHdf5Writer.__init__r   c                 C   r   r   r   r   r   r   r    r#     r   z$ChunkedLilcomHdf5Writer.storage_pathr$   r%   c           	      C   sz   t   dd l}ddlm} ||| j| jd}| jj||t	
dt|fd}t|D ]\}}t	j|t	jd||< q,|S )Nr   lilcom_compress_chunkedr   r   uint8)dtyper1   )r   )r   r   lhotse.features.compressionr   r   r   r   r   
vlen_dtyperD   r   ru   	enumerate
frombufferr   )	r   r$   r%   r   r   r   dsetidxfeatr   r   r    r'     s   
zChunkedLilcomHdf5Writer.writeNc                 C   r   r   r   r   r   r   r    r   
  r   zChunkedLilcomHdf5Writer.closec                 C   r7   r   r   r   r   r   r    r8     r9   z!ChunkedLilcomHdf5Writer.__enter__c                 C   r   r   r   r   r   r   r    r=     r   z ChunkedLilcomHdf5Writer.__exit__)r   r   rl   r   r   r   r   rg   r    r     s*    	*
r   i  c                
       sX   e Zd ZdZdZeZdef fddZe			dde
d	ed
ee dejfddZ  ZS )LilcomChunkyReadera`  
    Reads lilcom-compressed numpy arrays from a binary file with chunked lilcom storage.
    Each feature matrix is stored in an array of chunks - binary data compressed with lilcom.
    Upon reading, we check how many chunks need to be retrieved to avoid excessive I/O.

    ``storage_path`` corresponds to the binary file path.

    ``storage_key`` for each utterance is a comma separated list of offsets in the file.
    The first number is the offset for the whole array,
    and the following numbers are relative offsets for each chunk.
    These offsets are relative to the previous chunk start.
    lilcom_chunkyr#   c                    s   t    || _d S r   )rc   rd   r#   r   rg   r   r    rd   /  s   

zLilcomChunkyReader.__init__r   Nr$   rI   rJ   r   c                 C   s  t || j }|d urt|| j d }nd }ttt|d}t|}||| }g }t	| j
d }t|D ]\}	}
||	 |||
|	  q9W d    n1 sWw   Y  dd |D }|rmtj|dd}ntg }| j| }|| }|d ur|| }nd }||| S )Nrm   ,rn   c                 S   s   g | ]}t |qS r   r   r   r   r   r   r    r   M  s    z+LilcomChunkyReader.read.<locals>.<listcomp>r   r   )r   
CHUNK_SIZEr   r4   maprF   splitrD   cumsumopenr#   r   seekappendrL   r   r2   )r   r$   rI   rJ   r   r   chunk_offsets
chunk_datafileoffsetendr   r   r   r   r   r   r   r    rL   3  s0   




zLilcomChunkyReader.readrM   )r>   r?   r@   rA   r!   CHUNKY_FORMAT_CHUNK_SIZEr   r   rd   r   rC   rF   r   rD   rE   rL   r   r   r   rg   r    r     s"    r   c                       s   e Zd ZdZdZeZ		ddedede	f fdd	Z
ed
e	fddZde	dejd
e	fddZdddZdd Zdd Z  ZS )LilcomChunkyWritera_  
    Writes lilcom-compressed numpy arrays to a binary file with chunked lilcom storage.
    Each feature matrix is stored in an array of chunks - binary data compressed with lilcom.
    Upon reading, we check how many chunks need to be retrieved to avoid excessive I/O.

    ``storage_path`` corresponds to the binary file path.

    ``storage_key`` for each utterance is a comma separated list of offsets in the file.
    The first number is the offset for the whole array,
    and the following numbers are relative offsets for each chunk.
    These offsets are relative to the previous chunk start.
    r   r   rr   r#   r   ri   c                    st   t    d|vr|d }|dv sJ t|}||jdkr#|jd nd| _|| _t| j|d| _	| j	
 | _dS )a)  
        :param storage_path: Path under which we'll create the binary file.
        :param tick_power: Determines the lilcom compression accuracy;
            the input will be compressed to integer multiples of 2^tick_power.
        :param chunk_size: How many frames to store per chunk.
            Too low a number will require many reads for long feature matrices,
            too high a number will require to read more redundant data.
        :param mode: Modes, one of: "w" (write) or "a" (append); can be "wb" and "ab", "b" is implicit
        b)rr   abz.lcar   N)rc   rd   r	   r   r   r   r   r   r#   r  tellcurr_offset)r   r#   r   ri   r;   r<   r{   rg   r   r    rd   q  s   
zLilcomChunkyWriter.__init__r   c                 C   r   r   r   r   r   r   r    r#     r   zLilcomChunkyWriter.storage_pathr$   r%   c           	      C   sl   ddl m} ||| j| jd}| jg}t|D ]\}}| j|}|| |  j|7  _qd	t
t|S )Nr   r   r   r   )r   r   r   r   r	  r   r  r'   r   r   r   rC   )	r   r$   r%   r   r   offsetsr   r   nbytesr   r   r    r'     s   

zLilcomChunkyWriter.writeNc                 C   s   | j   d S r   )r  r   r   r   r   r    r        zLilcomChunkyWriter.closec                 C   r7   r   r   r   r   r   r    r8     r9   zLilcomChunkyWriter.__enter__c                 C   r   r   r   r   r   r   r    r=     r   zLilcomChunkyWriter.__exit__)r   rr   r   )r>   r?   r@   rA   r!   r  r   r   rF   rC   rd   rB   r#   rD   rE   r'   r   r8   r=   r   r   r   rg   r    r  _  s&     
r  c                
       sN   e Zd ZdZdZ fddZe		ddeded	e	e d
e
jfddZ  ZS )LilcomURLReaderao  
    Downloads Lilcom-compressed files from a URL (S3, GCP, Azure, HTTP, etc.).
    ``storage_path`` corresponds to the root URL (e.g. "s3://my-data-bucket")
    ``storage_key`` will be concatenated to ``storage_path`` to form a full URL (e.g. "my-feature-file.llc")

    .. caution::
        Requires ``smart_open`` to be installed (``pip install smart_open``).
    
lilcom_urlc                       t    t|i || _d S r   )rc   rd   r   _innerr:   rg   r   r    rd        
zLilcomURLReader.__init__r   Nr$   rI   rJ   r   c                 C   s   | j |||S r   )r  rL   rK   r   r   r    rL     s   zLilcomURLReader.readrM   )r>   r?   r@   rA   r!   rd   r   rC   rF   r   rD   rE   rL   r   r   r   rg   r    r    s     	r  c                       sN   e Zd ZdZdZ fddZedefddZded	e	j
defd
dZ  ZS )LilcomURLWriteraj  
    Writes Lilcom-compressed files to a URL (S3, GCP, Azure, HTTP, etc.).
    ``storage_path`` corresponds to the root URL (e.g. "s3://my-data-bucket")
    ``storage_key`` will be concatenated to ``storage_path`` to form a full URL (e.g. "my-feature-file.llc")

    .. caution::
        Requires ``smart_open`` to be installed (``pip install smart_open``).
    r  c                    r  r   )rc   rd   r   r  r:   rg   r   r    rd     r  zLilcomURLWriter.__init__r   c                 C   r   r   )r  r#   r   r   r   r    r#     r   zLilcomURLWriter.storage_pathr$   r%   c                 C   s   | j ||S r   )r  r'   r&   r   r   r    r'     r  zLilcomURLWriter.write)r>   r?   r@   rA   r!   rd   rB   rC   r#   rD   rE   r'   r   r   r   rg   r    r    s    	 r  c                   C   r   )Nkaldi_native_ioDTo read Kaldi feats.scp, please 'pip install kaldi_native_io' first.r   r   r   r   r    check_kaldi_native_io_installed  r   r  c                 C   s   t   ddl}|d|  S )ar  
    Helper internal function used in KaldiReader.
    It opens kaldi scp files and keeps their handles open in a global program cache
    to avoid excessive amount of syscalls when the Reader class is instantiated
    and destroyed in a loop repeatedly (frequent use-case).

    The file handles can be freed at any time by calling ``close_cached_file_handles()``.
    r   Nzscp:)r  r  RandomAccessFloatMatrixReader)r#   r  r   r   r    r     s   
r   c                
       r   )KaldiReadera&  
    Reads Kaldi's "feats.scp" file using kaldi_native_io.
    ``storage_path`` corresponds to the path to ``feats.scp``.
    ``storage_key`` corresponds to the utterance-id in Kaldi.

    .. caution::
        Requires ``kaldi_native_io`` to be installed (``pip install kaldi_native_io``).
    kaldiior#   c                    sJ   t    || _|drt| j| _d S t  dd l}d | _|j| _	d S )Nz.scpr   )
rc   rd   r#   rf   r   storager  r  FloatMatrixreader)r   r#   r;   r<   r  rg   r   r    rd     s   

zKaldiReader.__init__r   Nr$   rI   rJ   r   c                 C   s:   | j d urt| j | }n	| j| j }||| S r   )r  rD   copyr  rL   r#   numpyr   r   r   r    rL     s   
zKaldiReader.readrM   r   r   r   rg   r    r    s     	r  c                       sv   e Zd ZdZdZ	ddedef fddZede	fd	d
Z
de	dejde	fddZdddZdd Zdd Z  ZS )KaldiWritera  
    Write data to Kaldi's "feats.scp" and "feats.ark" files using kaldi_native_io.
    ``storage_path`` corresponds to a directory where we'll create "feats.scp"
    and "feats.ark" files.
    ``storage_key`` corresponds to the utterance-id in Kaldi.

    The following ``compression_method`` values are supported by kaldi_native_io::

        kAutomaticMethod = 1
        kSpeechFeature = 2
        kTwoByteAuto = 3
        kTwoByteSignedInteger = 4
        kOneByteAuto = 5
        kOneByteUnsignedInteger = 6
        kOneByteZeroOne = 7

    .. note:: Setting compression_method works only with 2D arrays.

    Example::

        >>> data = np.random.randn(131, 80)
        >>> with KaldiWriter('featdir') as w:
        ...     w.write('utt1', data)
        >>> reader = KaldiReader('featdir/feats.scp')
        >>> read_data = reader.read('utt1')
        >>> np.testing.assert_equal(data, read_data)

    .. caution::
        Requires ``kaldi_native_io`` to be installed (``pip install kaldi_native_io``).
    r  rm   r#   compression_methodc                    sz   t dstddd l}t   t|| _| jjddd t| jd | _	|
d| j d| j d	| _||| _d S )
Nr  r  r   T)rp   ro   z	feats.scpzark,scp:z/feats.ark,z
/feats.scp)r   rw   r  rc   rd   r	   storage_dirrv   rC   r   CompressedMatrixWriterr  CompressionMethodr  )r   r#   r  r;   r<   r  rg   r   r    rd   Q  s   

zKaldiWriter.__init__r   c                 C   s   | j S r   )r   r   r   r   r    r#   g  s   zKaldiWriter.storage_pathr$   r%   c                 C   s   | j ||| j |S r   )r  r'   r  r&   r   r   r    r'   k  s   zKaldiWriter.writeNc                 C   r   r   )r  r   r   r   r   r    r   o  r   zKaldiWriter.closec                 C   r7   r   r   r   r   r   r    r8   r  r9   zKaldiWriter.__enter__c                 C   r   r   r   r   r   r   r    r=   u  r   zKaldiWriter.__exit__)rm   r   )r>   r?   r@   rA   r!   r   rF   rd   rB   rC   r#   rD   rE   r'   r   r8   r=   r   r   r   rg   r    r  .  s    
r  r/   c                 C   s   d| v S Nmemoryr   )r/   r   r   r    is_in_memory~  s   r%  c                 C   s   d| v sJ t | S r#  )r_   r\   r   r   r    get_memory_writer  s   r&  c                
   @   F   e Zd ZdZdZdd Ze		ddeded	e	e d
e
jfddZdS )MemoryLilcomReader memory_lilcomc                 O   r   r   r   r:   r   r   r    rd     r9   zMemoryLilcomReader.__init__r   Nraw_datarI   rJ   r   c                 C      t |}||| S r   r   r   r+  rI   rJ   r   r   r   r    rL        
zMemoryLilcomReader.readrM   r>   r?   r@   rA   r!   rd   r   bytesrF   r   rD   rE   rL   r   r   r   r    r(         r(  c                   @   sl   e Zd ZdZdZdddeddfdd	Zedd
dZde	de
jdefddZdddZdd Zdd ZdS )MemoryLilcomWriterr)  r*  r   lilcom_tick_powerr4  r   Nc                O   s
   || _ d S r   r3  )r   r4  r;   r<   r   r   r    rd     s   
zMemoryLilcomWriter.__init__c                 C   r   r   r   r   r   r   r    r#     r(   zMemoryLilcomWriter.storage_pathr$   r%   c                 C   s(   t |jt jsJ dtj|| jdS )Nz7Lilcom compression supports only floating-point arrays.r   )rD   
issubdtyper   floatingr   r   r4  r&   r   r   r    r'     s   zMemoryLilcomWriter.writec                 C   r   r   r   r   r   r   r    r     r9   zMemoryLilcomWriter.closec                 C   r7   r   r   r   r   r   r    r8     r9   zMemoryLilcomWriter.__enter__c                 C   r   r   r   r   r   r   r    r=     r9   zMemoryLilcomWriter.__exit__r   )r>   r?   r@   rA   r!   rF   rd   rB   r#   rC   rD   rE   r0  r'   r   r8   r=   r   r   r   r    r2    s    

r2  c                
   @   r'  )MemoryRawReaderr)  
memory_rawc                 O   r   r   r   r:   r   r   r    rd     r9   zMemoryRawReader.__init__r   Nr+  rI   rJ   r   c                 C   r,  r   )pickleloadsr-  r   r   r    rL     r.  zMemoryRawReader.readrM   r/  r   r   r   r    r7    r1  r7  c                   @   s\   e Zd ZdZdZdd ZedddZd	ed
e	j
defddZdddZdd Zdd ZdS )MemoryRawWriterr)  r8  c                 O   r   r   r   r:   r   r   r    rd     r9   zMemoryRawWriter.__init__r   Nc                 C   r   r   r   r   r   r   r    r#     r(   zMemoryRawWriter.storage_pathr$   r%   c                 C   s
   t |S r   )r9  dumpsr&   r   r   r    r'     r   zMemoryRawWriter.writec                 C   r   r   r   r   r   r   r    r     r9   zMemoryRawWriter.closec                 C   r7   r   r   r   r   r   r    r8     r9   zMemoryRawWriter.__enter__c                 C   r   r   r   r   r   r   r    r=     r9   zMemoryRawWriter.__exit__r   )r>   r?   r@   rA   r!   rd   rB   r#   rC   rD   rE   r0  r'   r   r8   r=   r   r   r   r    r;    s    
r;  c                
   @   r'  )MemoryNpyReaderr)  
memory_npyc                 O   r   r   r   r:   r   r   r    rd     r9   zMemoryNpyReader.__init__r   Nr+  rI   rJ   r   c                 C   s   t |}t|}||| S r   )r   rD   r   )r   r+  rI   rJ   streamr   r   r   r    rL     s   
zMemoryNpyReader.readrM   r/  r   r   r   r    r=    r1  r=  c                   @   s,   e Zd ZdZdZdd ZdejfddZdS )	DummySharReaderr)  sharc                 O   r   r   r   r:   r   r   r    rd     r9   zDummySharReader.__init__r   c                 O   s   t d)NzeInconsistent state: found a Lhotse Shar placeholder array that was not filled during deserialization.)RuntimeErrorr:   r   r   r    rL   	  s   zDummySharReader.readN)	r>   r?   r@   rA   r!   rd   rD   rE   rL   r   r   r   r    r@     s
    r@  r   )Sr9  abcr   r   
contextlibr   	functoolsr   r   r   mathr   r   pathlibr	   typingr
   r   r   r   r   r   r  rD   lhotse.arrayr   r   lhotse.cachingr   lhotse.serializationr   lhotse.utilsr   r   r   r   r   r   r   rH   rP   rR   rC   rS   rW   rY   r]   r_   r`   r   r   r   r   r   r   rF   r   r   r   r   r   r   r   r   r   r  r   r  r  r  r  r   r  r  r~   r%  r&  r(  r2  r7  r;  r=  r@  r   r   r   r    <module>   s     
m(<

2:5VAQ'J!