o
    wi
                     @   sl   d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZ G dd	 d	eZdS )
    N)LightningDataModule)CombinedLoader)
DictConfig	OmegaConf	open_dict)FallbackDataset)!get_lhotse_dataloader_from_config)TokenizerSpecc                       sx   e Zd ZdZdedejjjddf fddZ	dd	 Z
d
d Zdd ZdedejjjeB fddZdd Zdd Z  ZS )
DataModulea  
    A Lightning DataModule specialized for Lhotse dataloading.
    It takes care of setting up the proper DP ranks for dataloaders, and instantiating them.
    Keep in mind the actual dataset paths and blend are defined by the YAML config, not Python code.

    The typical structure of the YAML config used to initialize this module looks like the following:

    .. code-block:: yaml

        data:
          train_ds:
            input_cfg: path/to/input_cfg.yaml
            num_workers: 2
            batch_size: 4
            # ... Other settings, see nemo/collections/common/data/lhotse/dataloader.py

          validation_ds:
            # The entries under 'datasets' are a list of separate dataloaders.
            # The structure is <dataset-name>: {<dataloader-dict-config>}
            # They inherit all settings from validation_ds, but can individually override them.
            datasets:
              val_set_0:  # rename to your dataset name, add more as needed
                cuts_path: ???  # needs to be specified
            batch_size: 4
            # ... Other settings, see nemo/collections/common/data/lhotse/dataloader.py

    See also the examples in ``examples/speechlm2/conf``.

    Args:
        cfg: a DictConfig instance, typically corresponding to `data` namespace in YAML configs.
        tokenizer: a tokenizer instance, typically NeMo's AutoTokenizer wrapping HF's AutoTokenizer.
        dataset: a torch.utils.data.Dataset instance, expected to define __getitem__ that accepts
            a lhotse.CutSet. It converts metadata + raw data to a batch of PyTorch tensors.
            The data sampling is controlled by Lhotse samplers rather than the dataset.
    	tokenizerdatasetreturnNc                    sz   t    || _t| j  dD ]}|| jv r%dt| j|_dt| j|_qW d    n1 s0w   Y  || _|| _d S )N)validation_dstest_dsT)	super__init__cfgr   getattrforce_finiteforce_map_datasetr   r   )selfr   r   r   k	__class__ g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm2/data/datamodule.pyr   =   s   


zDataModule.__init__c                 C   s4   d| j vrd S t| j j|  |  t| j| jdS )Ntrain_dsconfigglobal_rank
world_sizer   r   )r   r   r   _get_dp_rank_get_world_sizer   r   r   r   r   r   r   train_dataloaderH   s   
zDataModule.train_dataloaderc                 C       d| j vrd S | j j}| |S )Nr   )r   r   _build_test_dataloaderr   r   r   r   r   val_dataloaderS      

zDataModule.val_dataloaderc                 C   r%   )Nr   )r   r   r&   r'   r   r   r   test_dataloaderY   r)   zDataModule.test_dataloaderr   c              	   C   s   d|vr-t | d|_d|_W d    n1 sw   Y  t||  |  | j| jdS | }t |
 |`	W d    n1 sBw   Y  i }|j	
 D ]%\}}t | t||}W d    n1 sgw   Y  | |||< qNt|ddS )NdatasetsTr   max_size)mode)r   r   r   r   r!   r"   r   r   copyr+   itemsr   merger&   r   )r   r   base_cfgdloadersnameitemr   r   r   r&   _   s.   


z!DataModule._build_test_dataloaderc                 C   s\   t j r,t j r,t| jdr't| jjdr'| jjjd ur'| jjj d S t j	 S dS )Nmodeldevice_meshr   )
torchdistributedis_availableis_initializedhasattrtrainerr5   r6   get_coordinateget_rankr#   r   r   r   r!      s   

zDataModule._get_dp_rankc                 C   sZ   t j r+t j r+t| jdr&t| jjdr&| jjjd ur&| jjjjd S t j	 S dS )Nr5   r6   r      )
r7   r8   r9   r:   r;   r<   r5   r6   shapeget_world_sizer#   r   r   r   r"      s   

zDataModule._get_world_size)__name__
__module____qualname____doc__r	   r7   utilsdataDatasetr   r$   r(   r*   r   
DataLoaderr   r&   r!   r"   __classcell__r   r   r   r   r
      s     $$r
   )r7   	lightningr   lightning.pytorch.utilitiesr   	omegaconfr   r   r   %nemo.collections.common.data.fallbackr   #nemo.collections.common.data.lhotser   "nemo.collections.common.tokenizersr	   r
   r   r   r   r   <module>   s   