o
    }oi                     @   s   d dl mZ d dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZ ddlmZ G dd	 d	ZG d
d dejjjZG dd dejZdS )    N)EVAL_DATALOADERSTRAIN_DATALOADERS)
DataLoader)	DiTConfig)MegatronDataSampler   )	pos_id_3dc                   @   s2   e Zd ZdZddddddZdd Zd	d
 ZdS )PosEmb3Dz?Generates and provides 3D positional embeddings for video data.`   i  )max_tmax_hmax_wc                C   s   || _ || _|| _|   d S N)r   r   r   generate_pos_id)selfr   r   r    r   m/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/data/diffusion_fake_datamodule.py__init__   s   zPosEmb3D.__init__c              	   C   sB   t jt t j| jddt j| jddt j| jdddd| _dS )zBGenerates the positional ID grid based on max_t, max_h, and max_w.cpu)device)dimN)torchstackmeshgridaranger   r   r   gridr   r   r   r   r   #   s   zPosEmb3D.generate_pos_idc                C   sl   || j ks|| jks|| jkr(t| j || _ t| j|| _t| j|| _|   | jd|d|d|f S )a<  Retrieves a subset of the positional IDs for the specified dimensions.

        Parameters:
            t (int): Number of time frames.
            h (int): Height dimension.
            w (int): Width dimension.

        Returns:
            torch.Tensor: The positional IDs tensor with shape (t, h, w, 3).
        N)r   r   r   maxr   r   )r   thwr   r   r   get_pos_id_3d.   s   zPosEmb3D.get_pos_id_3dN)__name__
__module____qualname____doc__r   r   r"   r   r   r   r   r	      s
    r	   c                   @   s>   e Zd ZdZ		dddZdd Zdd	 Zd
d Zdd ZdS )DiTVideoLatentFakeDatasetz:A fake dataset for generating synthetic video latent data.       c	           	      C   s4   || _ || _|| _|| _|| _|| _|| _|| _d S r   )r   
max_height	max_width
patch_sizein_channelstext_dimtext_seqlen
seq_length)	r   n_framesr   r   r,   r-   crossattn_emb_sizemax_text_seqlenr0   r   r   r   r   D   s   
z"DiTVideoLatentFakeDataset.__init__c                 C   s   dS )z$Returns the total number of samples.i r   r   r   r   r   __len__X   s   z!DiTVideoLatentFakeDataset.__len__c           
   	   C   s   | j }| j}| j}| j}| j}tj| j||d  tjdd }tj	| j
| jtjd}tj||| || ddd}	||tj|jd gtjd tj| j
gtjd tj| jdftjdtj|jd tjddS )	zGenerates a single sample of data.

        Parameters:
            idx (int): Index of the data sample.

        Returns:
            dict: A dictionary containing video latent data and related information.
           )dtypeg      ?)r   r    r!   r      r   )videot5_text_embeddings	seq_len_q
seq_len_kvpos_ids	loss_mask)r   r*   r+   r,   r-   r   onesr0   bfloat16randnr/   r.   r   r"   reshapetensorshapeint32squeezezeros)
r   idxr   r    r!   pcvideo_latenttext_embeddingpos_embr   r   r   __getitem__\   s   	  z%DiTVideoLatentFakeDataset.__getitem__c                 C   s   t jjj|S )zA default implementation of a collation function.

        Users should override this method to define custom data loaders.
        )r   utilsdata
dataloaderdefault_collater   batchr   r   r   _collate_fnx   s   z%DiTVideoLatentFakeDataset._collate_fnc                 C   s
   |  |S )a  Method that user passes as a functor to DataLoader.

        The method optionally performs neural type checking and adds types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        Usage:
            dataloader = torch.utils.data.DataLoader(
                    ....,
                    collate_fn=dataset.collate_fn,
                    ....
            )

        Returns:
            Collated batch, with or without types.
        )rT   rR   r   r   r   
collate_fn   s   
z$DiTVideoLatentFakeDataset.collate_fnN)r(   r)   )	r#   r$   r%   r&   r   r4   rM   rT   rU   r   r   r   r   r'   A   s    

r'   c                       s   e Zd ZdZ							dded	ed
edededededdf fddZddeddfddZ	de
fddZdefddZdefddZ  ZS )VideoLatentFakeDataModulezIA LightningDataModule for generating fake video latent data for training.   r      TNFmodel_configr0   micro_batch_sizeglobal_batch_sizenum_workers
pin_memoryuse_train_split_for_valreturnc	           	         s>   t    || _|| _|| _|| _|| _t| j||d| _d S )N)seq_lenrZ   r[   )	superr   r0   rZ   r[   r\   rY   r   data_sampler)	r   rY   r0   rZ   r[   r\   r]   task_encoderr^   	__class__r   r   r      s   
z"VideoLatentFakeDataModule.__init__ stagec                 C   s2   t | jj| jj| jj| jj| jj| jjd| _dS )zSets up the dataset for training and validation.

        Parameters:
            stage (str): Optional stage argument (unused).
        )r1   r   r   r,   r-   r2   N)	r'   rY   
max_frames	max_img_h	max_img_wpatch_spatialr-   r2   	_train_ds)r   rg   r   r   r   setup   s   zVideoLatentFakeDataModule.setupc                 C      t | ds	|   | | jS )z Returns the training DataLoader.rl   hasattrrm   _create_dataloaderrl   r   r   r   r   train_dataloader      
z*VideoLatentFakeDataModule.train_dataloaderc                 C   rn   )z"Returns the validation DataLoader.rl   ro   r   r   r   r   val_dataloader   rs   z(VideoLatentFakeDataModule.val_dataloaderc                 K   s   t |f| jdd|jd|S )zCreates a DataLoader for the given dataset.

        Parameters:
            dataset (Dataset): The dataset to load.
            **kwargs: Additional arguments for DataLoader.

        Returns:
            DataLoader: The DataLoader instance.
        T)r\   r]   persistent_workersrU   )r   r\   rU   )r   datasetkwargsr   r   r   rq      s   
z,VideoLatentFakeDataModule._create_dataloader)rW   r   rX   r   TNF)rf   )r#   r$   r%   r&   r   intboolr   strrm   r   rr   r   rt   r   rq   __classcell__r   r   rd   r   rV      s<    	
rV   )lightning.pytorchpytorchplr   !lightning.pytorch.utilities.typesr   r   torch.utils.datar   'nemo.collections.diffusion.models.modelr   nemo.lightning.pytorch.pluginsr   diffusion_taskencoderr   r	   rN   rO   Datasetr'   LightningDataModulerV   r   r   r   r   <module>   s   'R