o
    Sij                      @   s  d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ G dd dej j!j"Z#G dd de#Z$G dd de#Z%G dd de
Z&dee fddZ'dd Z(dS )    N)AnyDictListOptional)IterableDataset)RecordingSetSecondscompute_num_samplesvalidate)torchaudio_supports_ffmpeg)suppress_audio_loading_errors)	AugmentFn)CutSet)collate_audiocollate_featurescollate_matrices)FeatureExtractor)_get_strided_batch_streamingc                       sN   e Zd ZdZd fddZdedeeef fddZ	deddfd	d
Z
  ZS )UnsupervisedDatasetz
    Dataset that contains no supervision - it only provides the features extracted from recordings.

    .. code-block::

        {
            'features': (B x T x F) tensor
            'features_lens': (B, ) tensor
        }
    returnNc                    s   t    d S N)super__init__self	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/unsupervised.pyr      s   zUnsupervisedDataset.__init__cutsc                 C   s"   |  | t|\}}|||dS )N)r   featuresfeatures_lens)	_validater   )r   r   r    r!   r   r   r   __getitem__!   s   
zUnsupervisedDataset.__getitem__c                 C   "   t | tdd |D sJ d S )Nc                 s       | ]}|j V  qd S r   )has_features.0cutr   r   r   	<genexpr>,       z0UnsupervisedDataset._validate.<locals>.<genexpr>r
   allr   r   r   r   r   r"   *      zUnsupervisedDataset._validater   N)__name__
__module____qualname____doc__r   r   r   strr   r#   r"   __classcell__r   r   r   r   r      s
    	r   c                       sX   e Zd ZdZddeddf fddZdedeee	f fd	d
Z
deddfddZ  ZS )UnsupervisedWaveformDataseta  
    A variant of UnsupervisedDataset that provides waveform samples instead of features.
    The output is a tensor of shape (C, T), with C being the number of channels and T the number of audio samples.
    In this implementation, there will always be a single channel.

    Returns:

    .. code-block::

        {
            'audio': (B x NumSamples) float tensor
            'audio_lens': (B, ) int tensor
        }
    Tcollater   Nc                    s   t    || _d S r   )r   r   r8   )r   r8   r   r   r   r   ?   s   

z$UnsupervisedWaveformDataset.__init__r   c              	   C   s   |  | | jrt|\}}|||dS g }g }|D ]!}t  ||  || W d    n1 s6w   Y  qt||dS )N)r   audio
audio_lens)r   r9   )r"   r8   r   r   append
load_audior   	from_cuts)r   r   r9   r:   remain_cutsremain_audioscr   r   r   r#   C   s    
z'UnsupervisedWaveformDataset.__getitem__c                 C   r$   )Nc                 s   r%   r   has_recordingr'   r   r   r   r*   X   r+   z8UnsupervisedWaveformDataset._validate.<locals>.<genexpr>r,   r.   r   r   r   r"   V   r/   z%UnsupervisedWaveformDataset._validate)T)r1   r2   r3   r4   boolr   r   r   r5   r   r#   r"   r6   r   r   r   r   r7   /   s
    r7   c                       sX   e Zd ZdZ	ddedee f fddZdede	j
fd	d
ZdeddfddZ  ZS )DynamicUnsupervisedDataseta(  
    An example dataset that shows how to use on-the-fly feature extraction in Lhotse.
    It accepts two additional inputs - a FeatureExtractor and an optional WavAugmenter for time-domain data augmentation..
    The output is approximately the same as that of the ``UnsupervisedDataset`` -
    there might be slight differences for ``MixedCut``s, because this dataset mixes them in the time domain,
    and ``UnsupervisedDataset`` does that in the feature domain.
    Cuts that are not mixed will yield identical results in both dataset classes.
    Nfeature_extractor
augment_fnc                    s   t    || _|| _d S r   )r   r   rE   rF   )r   rE   rF   r   r   r   r   e   s   

z#DynamicUnsupervisedDataset.__init__r   r   c                    s,     | dtf fdd}t||}|S )Nr   c              	   3   sJ    | D ]}t   |j j jdV  W d    n1 sw   Y  qd S )N)	extractorrF   )r   compute_featuresrE   rF   )r   r)   r   r   r   generate_cutq   s   
z<DynamicUnsupervisedDataset.__getitem__.<locals>.generate_cut)r"   r   r   )r   r   rI   r    r   r   r   r#   n   s   
z&DynamicUnsupervisedDataset.__getitem__c                 C   r$   )Nc                 s   r%   r   rA   r'   r   r   r   r*   ~   r+   z7DynamicUnsupervisedDataset._validate.<locals>.<genexpr>r,   r.   r   r   r   r"   |   r/   z$DynamicUnsupervisedDataset._validater   )r1   r2   r3   r4   r   r   r   r   r   torchTensorr#   r"   r6   r   r   r   r   rD   [   s    	rD   c                   @   s<   e Zd ZdZdedededdfddZdd	d
Zdd ZdS )RecordingChunkIterableDataseta  
    This dataset iterates over chunks of a recording, for each recording provided.
    It supports setting a chunk_shift < chunk_size to run model predictions on
    overlapping audio chunks.

    The format of yielded items is the following::

        {
            "recording_id": str
            "begin_time": tensor with dtype=float32 shape=(1,)
            "end_time": tensor with dtype=float32 shape=(1,)
            "audio": tensor with dtype=float32 shape=(chunk_size_in_samples,)
        }

    Unlike most other datasets in Lhotse, this dataset does not yield batched items,
    and should be used like the following::

        >>> recordings = RecordingSet.from_file("my-recordings.jsonl.gz")
        ... dataset = RecordingChunkIterableDataset(recordings, chunk_size=30.0, chunk_shift=25.0)
        ... dloader = torch.utils.data.DataLoader(
        ...     dataset,
        ...     batch_size=32,
        ...     collate_fn=audio_chunk_collate,
        ...     worker_init_fn=audio_chunk_worker_init_fn,
        ... )

    
recordings
chunk_sizechunk_shiftr   Nc                 C   s4   t || _|| _|| _d| _t| j| _|   d S )Nr   )listrM   rN   rO   startlenendr
   )r   rM   rN   rO   r   r   r   r      s   
z&RecordingChunkIterableDataset.__init__c                 C   s   | j D ]B}t|jdksJ dt|j d|j d|jd jdks3J d|jd j d|j d|jdksEJ d	|j d
|j dqd S )N   zCWe currently don't support multi-source audio in this dataset (got z sources in recording z).r   filezHWe currently only support 'file' AudioSource type in this dataset (got: z in recording zDWe currently only support single-channel audio in this dataset (got z channels in recording )rM   rR   sourcesidtypenum_channels)r   rr   r   r   r
      s   
z&RecordingChunkIterableDataset.validatec           	   	   c   s    dd l }| j| j| j D ]b}t| j|j}t| j| j |j}d}| j}||j	d j
d8}|j||tjdD ]&}t|}|jtj|tjdtj|tjd|dV  || j7 }|| j }q:W d    n1 skw   Y  qd S )Nr   rb)overlapdtype)r]   )recording_id
begin_timeend_timer9   )	soundfilerM   rQ   rS   r	   rN   sampling_raterO   	SoundFilerV   sourceblocksnpfloat32rJ   	as_tensorrW   )	r   sfrZ   rN   chunk_overlapr_   r`   streamchunkr   r   r   __iter__   s2   


z&RecordingChunkIterableDataset.__iter__r0   )	r1   r2   r3   r4   r   r   r   r
   rm   r   r   r   r   rL      s    

rL   batchc                 C   s   ddl m} dd | D }|| }tdd |D }tt||f}t|D ]\}}t|||d |jd f< q'||d< |S )Nr   )default_collatec                 S   s   g | ]}| d qS )r9   )pop)r(   dr   r   r   
<listcomp>   s    z'audio_chunk_collate.<locals>.<listcomp>c                 s   s    | ]}|j d  V  qdS )r   N)shape)r(   ar   r   r   r*      s    z&audio_chunk_collate.<locals>.<genexpr>r9   )	torch.utils.dataro   maxrJ   zerosrR   	enumeraterh   rs   )rn   ro   audiosoutmaxlenr9   irt   r   r   r   audio_chunk_collate   s   r}   c                 C   sd   t jj }|j}|j}|j}tt	|| t
|j }|j} || |  |_t|j| ||_d S r   )rJ   utilsdataget_worker_infodatasetrQ   rS   intmathceilfloatnum_workersrW   min)	worker_idworker_infor   overall_startoverall_end
per_workerr   r   r   audio_chunk_worker_init_fn   s   r   ))r   typingr   r   r   r   numpyrf   rJ   ru   r   lhotser   r   r	   r
   lhotse.audio.backendr   lhotse.audio.utilsr   lhotse.augmentationr   
lhotse.cutr   lhotse.dataset.collationr   r   r   lhotse.featuresr   lhotse.features.kaldi.layersr   r~   r   Datasetr   r7   rD   rL   r}   r   r   r   r   r   <module>   s&    ,&N