o
    }oi"                     @   s   d dl mZmZmZmZ d dlmZ d dlZ	d dl
Z
d dlmZmZ d dlmZ d dlmZmZ d dlmZ G dd dejZG d	d
 d
eZdS )    )DictListOptionalTupleN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)MegatronDataSamplerc                       s   e Zd ZdZ																
d)dedededeeef dededeee  dedededededededef fddZ	d*de
ddfdd Zdefd!d"Zdefd#d$Zdefd%d&Zdefd'd(Z  ZS )+MockDataModulea8  
    Mock DataModule for testing and development.
    Generates synthetic data for training, validation, and testing purposes.

    Args:
        seq_length (int): Sequence length for the generated data.
        decoder_seq_length (Optional[int]): Decoder sequence length if applicable, used in pp.
        vocab_size (int): Size of the vocabulary of tokenizer.
        crop_size (Tuple[int, int]): Image crop size (height, width).
        micro_batch_size (int): Micro batch size for data loading.
        global_batch_size (int): Global batch size across all processes.
        rampup_batch_size (Optional[List[int]]): Batch size ramp-up configuration.
        num_train_samples (int): Number of training samples to generate.
        num_val_samples (int): Number of validation samples to generate.
        num_test_samples (int): Number of test samples to generate.
        num_workers (int): Number of workers for data loading.
        pin_memory (bool): Whether to pin memory for data loading.
        persistent_workers (bool): Whether workers should remain persistent.
       N  0  r         '  TF
seq_lengthdecoder_seq_length
vocab_size	crop_sizemicro_batch_sizeglobal_batch_sizerampup_batch_size	tokenizerimage_processornum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workersc                    sz   t    || _|| _|| _|| _|
| _|| _|| _|| _	|| _
|| _|| _|| _|| _|	| _t| j| j|||d| _d S )N)seq_lendecoder_seq_lenr   r   r   )super__init__r   r   r   r   r   r   r   r    r!   r"   r   r   r   r   r   data_sampler)selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/mllama/data/mock.pyr&   0   s,   
zMockDataModule.__init__ stagereturnc                 C   sR   t | j| jd| j| j| _t | j| jd| j| j| _t | j| jd| j| j| _	dS )z(Set up datasets for the specified stage.trainvalidtestN)
_MockMLlamaDatasetr   r   r   r   	_train_dsr   _validation_dsr   _test_ds)r(   r.   r+   r+   r,   setupZ   s   
zMockDataModule.setupc                 C      t | ds	|   | | jS )z$Returns the DataLoader for training.r4   )hasattrr7   _create_dataloaderr4   r(   r+   r+   r,   train_dataloaderf      
zMockDataModule.train_dataloaderc                 C   r8   )z&Returns the DataLoader for validation.r5   )r9   r7   r:   r5   r;   r+   r+   r,   val_dataloaderl   r=   zMockDataModule.val_dataloaderc                 C   r8   )z#Returns the DataLoader for testing.r6   )r9   r7   r:   r6   r;   r+   r+   r,   test_dataloaderr   r=   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )z/Creates a DataLoader for the specified dataset.)r    r!   r"   
collate_fn)r	   r    r!   r"   r@   )r(   datasetkwargsr+   r+   r,   r:   x   s   z!MockDataModule._create_dataloader)r   Nr   r   r   r   NNNr   r   r   r   TF)r-   )__name__
__module____qualname____doc__intr   r   r   boolr&   strr7   r   r<   r   r>   r?   r	   r:   __classcell__r+   r+   r)   r,   r      sj    

	
*r   c                       s   e Zd ZdZ	ddededededdf
 fd	d
ZdefddZdedej	fddZ
deeejf fddZdd Zdd Z  ZS )r3   a  
    Mock dataset for generating synthetic data with text and image components.

    Args:
        vocab_size (int): Vocabulary size for text data.
        crop_size (Tuple[int, int]): Image crop size (height, width).
        name (str): Name of the dataset split ('train', 'valid', 'test').
        num_samples (int): Number of samples in the dataset.
        seq_length (int): Sequence length for the text data.
        seed (int): Seed for random number generation.
    *   namenum_samplesr   seedr/   Nc                    s`   t    || _|| _|| _|\| _| _|| _|| _t	j
| jt	jd| _t	j| jt	jd| _d S )N)dtype)r%   r&   rL   r   r   image_heightimage_widthlengthrN   torchonesfloat	loss_maskarangeint64position_ids)r(   r   r   rL   rM   r   rN   r)   r+   r,   r&      s   
	z_MockMLlamaDataset.__init__c                 C   s   | j S )z-Returns the number of samples in the dataset.)rR   r;   r+   r+   r,   __len__   s   z_MockMLlamaDataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )zAGenerates a random sequence of integers representing text tokens.rN   sizerO   )nprandomdefault_rngrN   integersr   r   rX   )r(   r[   np_genr+   r+   r,   	_get_text   s   z_MockMLlamaDataset._get_textc              	   C   s   t jj| j| d}t|j| j| jd gt j	d}t|
ddd| j| jf}t|jddgt j	dd }| }|dd }|dd }|td	d
ggtdg||| j| j|dS )zGenerates a single data sample.r\      r]   r      r   N   i   )imagesmasks
num_chunkstokensaspect_ratio_idsrV   rY   labels)r_   r`   ra   rN   rS   
from_numpyrb   r   r   rX   standard_normalrP   rQ   clonetensorrV   rY   )r(   r[   rc   rl   ri   rm   rn   r+   r+   r,   __getitem__   s    "
z_MockMLlamaDataset.__getitem__c                 C   sB   i }dd |D |d< d|d< | tj| |d|d< |S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        c                 S   s   g | ]}| d qS )rj   )pop).0sampler+   r+   r,   
<listcomp>   s    z2_MockMLlamaDataset._collate_fn.<locals>.<listcomp>batch_masksNattention_maskri   batch_images)updater   
dataloaderdefault_collatert   )r(   batchcollated_batchr+   r+   r,   _collate_fn   s   z_MockMLlamaDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   )r(   r~   r+   r+   r,   r@      s   
z_MockMLlamaDataset.collate_fn)rK   )rC   rD   rE   rF   rI   rG   r&   rZ   r_   ndarrayrd   r   rS   Tensorrs   r   r@   rJ   r+   r+   r)   r,   r3      s&    r3   )typingr   r   r   r   lightning.pytorchpytorchplnumpyr_   rS   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar	   r
   nemo.lightning.pytorch.pluginsr   LightningDataModuler   r3   r+   r+   r+   r,   <module>   s   i