o
    Si)                     @   s   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d d	lmZ G d
d deZdS )    )DictOptionalN)CrossEntropyLoss)Dataset)validate)RecordingSet)CutSet)collate_featurescollate_matrices)SupervisionSetc                       sb   e Zd ZdZ			ddedee dee deddf
 fd	d
Z	dede
eejf fddZ  ZS )DiarizationDatasetaB  
    A PyTorch Dataset for the speaker diarization task.
    Our assumptions about speaker diarization are the following:

    * we assume a single channel input (for now), which could be either a true mono signal
        or a beamforming result from a microphone array.
    * we assume that the supervision used for model training is a speech activity matrix, with one
        row dedicated to each speaker (either in the current cut or the whole dataset,
        depending on the settings). The columns correspond to feature frames. Each row is effectively
        a Voice Activity Detection supervision for a single speaker. This setup is somewhat inspired by
        the TS-VAD paper: https://arxiv.org/abs/2005.07272

    Each item in this dataset is a dict of:

    .. code-block::

        {
            'features': (B x T x F) tensor
            'features_lens': (B, ) tensor
            'speaker_activity': (B x num_speaker x T) tensor
        }

    .. note: In cases when padding needs to be performed during collation,
        the cuts are silence-padded, and the speaker activity tensor is padded
        with CrossEntropyLoss().ignore_index.

    Constructor arguments:

    :param cuts: a ``CutSet`` used to create the dataset object.
    :param uem: a ``SupervisionSet`` used to set regions for diarization
    :param min_speaker_dim: optional int, when specified it will enforce that the matrix shape is at least
        that value (useful for datasets like CHiME 6 where the number of speakers is always 4, but some cuts
        might have less speakers than that).
    :param global_speaker_ids: a bool, indicates whether the same speaker should always retain the same row index
        in the speaker activity matrix (useful for speaker-dependent systems)
    :param root_dir: a prefix path to be attached to the feature files paths.
    NFcutsuemmin_speaker_dimglobal_speaker_idsreturnc           	         s   t    t| |s|| _nEtdd |D }tj||d }g }|  D ]\} ||vr;|dd  D 7 }q)| fdd|| D 7 }q)tj|t	
|d| _|radd t| jjD nd | _|| _d S )	Nc                 S   s   i | ]}|j r|jj|jqS  )has_recording	recordingid).0cr   r   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/diarization.py
<dictcomp>D   s    z/DiarizationDataset.__init__.<locals>.<dictcomp>)
recordingssupervisionsc                 S   s   g | ]}|j qS r   )data)r   itr   r   r   
<listcomp>M   s    z/DiarizationDataset.__init__.<locals>.<listcomp>c                    s8   h | ]} j |j|jd D ]}|jj|j|jdqqS ))beginend)start)overlapr   r    r   trim)r   uem_itr   treer   r   	<setcomp>O   s    z.DiarizationDataset.__init__.<locals>.<setcomp>c                 S   s   i | ]\}}||qS r   r   )r   idxspkr   r   r   r   Y   s    )super__init__r   r   r   r   from_manifestsindex_supervisionsitemsr   from_segments	enumeratespeakersr   )	selfr   r   r   r   r   uem_intervalsr   cut_id	__class__r%   r   r+   5   s<   


zDiarizationDataset.__init__c                    s2   t |\}}||t fdd|D t jddS )Nc                 3   s"    | ]}|j  j jd V  qdS ))r   speaker_to_idx_mapN)speakers_feature_maskr   r1   )r   cutr2   r   r   	<genexpr>e   s    
z1DiarizationDataset.__getitem__.<locals>.<genexpr>)padding_value)featuresfeatures_lensspeaker_activity)r	   r
   r   ignore_index)r2   r   r=   r>   r   r:   r   __getitem___   s   
	zDiarizationDataset.__getitem__)NNF)__name__
__module____qualname____doc__r   r   r   intboolr+   r   strtorchTensorrA   __classcell__r   r   r5   r   r      s"    )$*r   )typingr   r   rI   torch.nnr   torch.utils.datar   lhotser   lhotse.audior   
lhotse.cutr   lhotse.dataset.collationr	   r
   lhotse.supervisionr   r   r   r   r   r   <module>   s    