o
    SiB$                     @   s   d dl mZmZmZmZ d dlZd dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZmZ d dlmZ G d	d
 d
ejjjZdeddfddZdS )    )CallableDictListUnionN)
DataLoaderdefault_collate)validate)CutSet)BatchIOPrecomputedFeatures)compute_num_framesifnone)Hdf5MemoryIssueFixc                
       s   e Zd ZdZddde fdedeeegef  deee	j
ge	j
f  def fdd	Zd
edeeee	j
ee f f fddZ  ZS )K2SpeechRecognitionDataseta  
    The PyTorch Dataset for the speech recognition task using k2 library.

    This dataset expects to be queried with lists of cut IDs,
    for which it loads features and automatically collates/batches them.

    To use it with a PyTorch DataLoader, set ``batch_size=None``
    and provide a :class:`SimpleCutSampler` sampler.

    Each item in this dataset is a dict of:

    .. code-block::

        {
            'inputs': float tensor with shape determined by :attr:`input_strategy`:
                      - single-channel:
                        - features: (B, T, F)
                        - audio: (B, T)
                      - multi-channel: currently not supported
            'supervisions': [
                {
                    'sequence_idx': Tensor[int] of shape (S,)
                    'text': List[str] of len S

                    # For feature input strategies
                    'start_frame': Tensor[int] of shape (S,)
                    'num_frames': Tensor[int] of shape (S,)

                    # For audio input strategies
                    'start_sample': Tensor[int] of shape (S,)
                    'num_samples': Tensor[int] of shape (S,)

                    # Optionally, when return_cuts=True
                    'cut': List[AnyCut] of len S
                }
            ]
        }

    Dimension symbols legend:
    * ``B`` - batch size (number of Cuts)
    * ``S`` - number of supervision segments (greater or equal to B, as each Cut may have multiple supervisions)
    * ``T`` - number of frames of the longest Cut
    * ``F`` - number of features

    The 'sequence_idx' field is the index of the Cut used to create the example in the Dataset.
    FNreturn_cutscut_transformsinput_transformsinput_strategyc                    s>   t    || _t|g | _t|g | _|| _tdd| _dS )a  
        k2 ASR IterableDataset constructor.

        :param return_cuts: When ``True``, will additionally return a "cut" field in each batch with the Cut
            objects used to create that batch.
        :param cut_transforms: A list of transforms to be applied on each sampled batch,
            before converting cuts to an input representation (audio/features).
            Examples: cut concatenation, noise cuts mixing, etc.
        :param input_transforms: A list of transforms to be applied on each sampled batch,
            after the cuts are converted to audio/features.
            Examples: normalization, SpecAugment, etc.
        :param input_strategy: Converts cuts into a collated batch of audio/features.
            By default, reads pre-computed features from disk.
        d   )reset_intervalN)	super__init__r   r   r   r   r   r   hdf5_fix)selfr   r   r   r   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/speech_recognition.pyr   =   s   
z#K2SpeechRecognitionDataset.__init__cutsreturnc                    s  t | | j  |jdd}| jD ]}||}q|jdd}| |}t|dkr0|\}}}n|\}}| j|}tj	t
| dd}| jD ]}|||d}qH|tdd t|D d	}|d
 | | jrtdd |D |d
 d< tdd |D }	|	rg g g }
}}|d j |d j du rz| jjj W n ty   tdw |D ]5}|jD ]/}|
dd |jd D  | fdd|jd D  | fdd|jd D  qq|
|d
 d< ||d
 d< ||d
 d< |S )z
        Return a new batch, with the batch size automatically determined using the constraints
        of max_duration and max_cuts.
        F)	ascending      )dim)supervision_segmentsc                 S   s&   g | ]\}}|j D ]}d |jiq	qS )text)supervisionsr%   ).0sequence_idxcutsupervisionr   r   r   
<listcomp>   s    z:K2SpeechRecognitionDataset.__getitem__.<locals>.<listcomp>)inputsr&   r&   c                 S   s   g | ]
}|j D ]}|qqS r   )r&   )r'   r)   supr   r   r   r+      s
    r)   c                 s   s0    | ]}|j D ]}|jd uod|jv V  qqd S )Nword)r&   	alignment)r'   csr   r   r   	<genexpr>   s    z9K2SpeechRecognitionDataset.__getitem__.<locals>.<genexpr>r   Nz[Can't determine the frame_shift -- it is not present either in cuts or the input_strategy. c                 S   s   g | ]}|j qS r   )symbolr'   aliwordr   r   r   r+      s    r.   c                       g | ]
}t |j d qS frame_shiftsampling_rate)r   startr4   r8   r   r   r+          c                    r6   r7   )r   endr4   r8   r   r   r+      r<   
word_startword_end)validate_for_asrr   updatesort_by_durationr   r   lensupervision_intervalstorchstacklistvaluesr   r   	enumerater   allr9   r:   	extractorAttributeError
ValueErrorr&   appendr/   )r   r   tnfm	input_tplr,   _rD   segmentsbatchhas_word_alignmentswordsstartsendsr0   r1   r   r8   r   __getitem__^   sv   








z&K2SpeechRecognitionDataset.__getitem__)__name__
__module____qualname____doc__r   boolr   r   r	   rE   Tensorr
   r   r   strr   rX   __classcell__r   r   r   r   r      s     10!r   r   r   c                 C   sv   t |  d}| D ]0}|jD ]*}|j| ks"J d|j d|j d|j|j| ks7J d|j d|j dqqd S )NgMb`?zHSupervisions starting before the cut are not supported for ASR (sup id: z
, cut id: )zESupervisions ending after the cut are not supported for ASR (sup id: )r   r&   r;   idr=   duration)r   tolr)   r*   r   r   r   r@      s(   
	r@   )typingr   r   r   r   rE   torch.utils.data.dataloaderr   r   lhotser   
lhotse.cutr	   lhotse.dataset.input_strategiesr
   r   lhotse.utilsr   r   lhotse.workaroundsr   utilsdataDatasetr   r@   r   r   r   r   <module>   s     @