o
    Si"                     @   s   d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZ d dlmZ G d	d
 d
ejjjZdS )    )CallableDictListUnionN)default_collate)CutSet)BatchIOPrecomputedFeatures)validate_for_asr)compute_num_framesifnone)Hdf5MemoryIssueFixc                
       s   e Zd ZdZddde fdedeeegef  deee	j
ge	j
f  def fdd	Zd
edeeee	j
ee f f fddZ  ZS )K2Speech2TextTranslationDataseta  
    The PyTorch Dataset for the speech translation task using k2 library.

    This dataset expects to be queried with lists of cut IDs,
    for which it loads features and automatically collates/batches them.

    To use it with a PyTorch DataLoader, set ``batch_size=None``
    and provide a :class:`SimpleCutSampler` sampler.

    Each item in this dataset is a dict of:

    .. code-block::

        {
            'inputs': float tensor with shape determined by :attr:`input_strategy`:
                      - single-channel:
                        - features: (B, T, F)
                        - audio: (B, T)
                      - multi-channel: currently not supported
            'supervisions': [
                {
                    'sequence_idx': Tensor[int] of shape (S,)
                    'src_text': List[str] of len S
                    'tgt_text': List[str] of len S

                    # For feature input strategies
                    'start_frame': Tensor[int] of shape (S,)
                    'num_frames': Tensor[int] of shape (S,)

                    # For audio input strategies
                    'start_sample': Tensor[int] of shape (S,)
                    'num_samples': Tensor[int] of shape (S,)

                    # Optionally, when return_cuts=True
                    'cut': List[AnyCut] of len S
                }
            ]
        }

    Dimension symbols legend:
    * ``B`` - batch size (number of Cuts)
    * ``S`` - number of supervision segments (greater or equal to B, as each Cut may have multiple supervisions)
    * ``T`` - number of frames of the longest Cut
    * ``F`` - number of features

    The 'sequence_idx' field is the index of the Cut used to create the example in the Dataset.
    FNreturn_cutscut_transformsinput_transformsinput_strategyc                    s>   t    || _t|g | _t|g | _|| _tdd| _dS )a2  
        K2 Speech2TextTranslation IterableDataset constructor.

        :param return_cuts: When ``True``, will additionally return a "cut" field in each batch with the Cut
            objects used to create that batch.
        :param cut_transforms: A list of transforms to be applied on each sampled batch,
            before converting cuts to an input representation (audio/features).
            Examples: cut concatenation, noise cuts mixing, etc.
        :param input_transforms: A list of transforms to be applied on each sampled batch,
            after the cuts are converted to audio/features.
            Examples: normalization, SpecAugment, etc.
        :param input_strategy: Converts cuts into a collated batch of audio/features.
            By default, reads pre-computed features from disk.
        d   )reset_intervalN)	super__init__r   r   r   r   r   r   hdf5_fix)selfr   r   r   r   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/speech_translation.pyr   @   s   
z(K2Speech2TextTranslationDataset.__init__cutsreturnc                    s  t | | j  |jdd}| jD ]}||}q|jdd}| |}t|dkr0|\}}}n|\}}| j|}tj	t
| dd}| jD ]}|||d}qH|tdd t|D d	}|d
 | | jrtdd |D |d
 d< tdd |D }	|	rg g g }
}}|d j |d j du rz| jjj W n ty   tdw |D ]5}|jD ]/}|
dd |jd D  | fdd|jd D  | fdd|jd D  qq|
|d
 d< ||d
 d< ||d
 d< |S )z
        Return a new batch, with the batch size automatically determined using the constraints
        of max_duration and max_cuts.
        F)	ascending      )dim)supervision_segmentsc                 S   s.   g | ]\}}|j D ]}|j|jd  dq	qS )translated_text)texttgt_text)supervisionsr%   custom).0sequence_idxcutsupervisionr   r   r   
<listcomp>   s    z?K2Speech2TextTranslationDataset.__getitem__.<locals>.<listcomp>)inputsr'   r'   c                 S   s   g | ]
}|j D ]}|qqS r   )r'   )r)   r+   supr   r   r   r-      s
    r+   c                 s   s0    | ]}|j D ]}|jd uod|jv V  qqd S )Nword)r'   	alignment)r)   csr   r   r   	<genexpr>   s    z>K2Speech2TextTranslationDataset.__getitem__.<locals>.<genexpr>r   Nz[Can't determine the frame_shift -- it is not present either in cuts or the input_strategy. c                 S   s   g | ]}|j qS r   )symbolr)   aliwordr   r   r   r-      s    r0   c                       g | ]
}t |j d qS frame_shiftsampling_rate)r   startr6   r:   r   r   r-          c                    r8   r9   )r   endr6   r:   r   r   r-      r>   
word_startword_end)r
   r   updatesort_by_durationr   r   lensupervision_intervalstorchstacklistvaluesr   r   	enumerater   allr;   r<   	extractorAttributeError
ValueErrorr'   appendr1   )r   r   tnfm	input_tplr.   _rE   segmentsbatchhas_word_alignmentswordsstartsendsr2   r3   r   r:   r   __getitem__a   sv   








z+K2Speech2TextTranslationDataset.__getitem__)__name__
__module____qualname____doc__r	   boolr   r   r   rF   Tensorr   r   r   strr   rY   __classcell__r   r   r   r   r      s     20!r   )typingr   r   r   r   rF   torch.utils.data.dataloaderr   
lhotse.cutr   lhotse.dataset.input_strategiesr   r	   !lhotse.dataset.speech_recognitionr
   lhotse.utilsr   r   lhotse.workaroundsr   utilsdataDatasetr   r   r   r   r   <module>   s   