o
    }oiE9                     @   s   d dl mZmZmZ d dlmZ d dlZd dl	Z	d dl
mZmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZ G d	d
 d
ejZG dd deZdS )    )DictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)AutoProcessor)
AudioToken
ImageToken)MegatronDataSamplerc                       s   e Zd ZdZ																	
d'dedee dededededeee  dedededededef fddZd(de	ddfddZ
defdd Zdefd!d"Zdefd#d$Zdefd%d&Z  ZS ))MockDataModulez
    A mock data module for AVLM training, validation, and testing.

    Provides datasets and data loaders for training, validation, and testing phases.
    Includes data sampling and preprocessing for multimodal tasks.
        N@       逖    TF
seq_lengthdecoder_seq_lengthimage_embedding_tokensaudio_embedding_tokensmicro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workersc                    s   t    || _|| _|| _|| _|| _|| _|| _|| _	|| _
|	| _|| _d}t|j| _|| _|| _|| _t| j| j||	|
d| _dS )a  
        Initializes the mock data module with data sampling and preprocessing configurations.

        Args:
            seq_length (int): Maximum sequence length for tokens.
            decoder_seq_length (Optional[int]): Sequence length for the decoder.
            tokenizer: Tokenizer for text processing.
            image_processor: Processor for image preprocessing.
            audio_processor: Processor for audio preprocessing.
            image_embedding_tokens: Number of image embedding tokens for one image.
            audio_embedding_tokens: Number of audio embedding tokens for one audio.
            micro_batch_size (int): Batch size per GPU.
            global_batch_size (int): Total batch size across GPUs.
            rampup_batch_size (Optional[List[int]]): Batch size ramp-up schedule.
            num_train_samples (int): Number of training samples.
            num_val_samples (int): Number of validation samples.
            num_test_samples (int): Number of testing samples.
            num_workers (int): Number of workers for data loading.
            pin_memory (bool): Whether to pin memory for data loaders.
            persistent_workers (bool): Whether to keep workers alive after the first iteration.
        zllava-hf/llava-1.5-7b-hf)seq_lendecoder_seq_lenr   r   r   N)super__init__r   r#   r   r   r   r   r    r!   r   r   	tokenizerr
   from_pretrainedimage_processoraudio_processorr   r   r   data_sampler)selfr   r   r&   r(   r)   r   r   r   r   r   r   r   r   r   r    r!   
model_name	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/avlm/data/mock.pyr%   %   s0   
(zMockDataModule.__init__ stagereturnc              
   C   s|   t | j| j| j| j| jd| j| jd| _t | j| j| j| j| jd| j	| jd| _
t | j| j| j| j| jd| j| jd| _dS )z
        Sets up the training, validation, and testing datasets.

        Args:
            stage (str): Stage of the setup ('train', 'valid', 'test').
        train)r&   r(   r)   r   r   namenum_samplesr   validtestN)_MockAVLMDatasetr&   r(   r)   r   r   r   r   	_train_dsr   _validation_dsr   _test_ds)r+   r2   r/   r/   r0   setupf   s<   

zMockDataModule.setupc                 C      t | ds	|   | | jS )zz
        Creates the training data loader.

        Returns:
            TRAIN_DATALOADERS: Training data loader.
        r:   )hasattrr=   _create_dataloaderr:   r+   r/   r/   r0   train_dataloader      
zMockDataModule.train_dataloaderc                 C   r>   )z}
        Creates the validation data loader.

        Returns:
            EVAL_DATALOADERS: Validation data loader.
        r;   )r?   r=   r@   r;   rA   r/   r/   r0   val_dataloader   rC   zMockDataModule.val_dataloaderc                 C   r>   )zw
        Creates the testing data loader.

        Returns:
            TEST_DATALOADERS: Testing data loader.
        r<   )r?   r=   r@   r<   rA   r/   r/   r0   test_dataloader   rC   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )a  
        Creates a generic data loader for the given dataset.

        Args:
            dataset: The dataset for which the data loader is created.
            **kwargs: Additional arguments for the DataLoader.

        Returns:
            DataLoader: The created data loader.
        )r   r    r!   
collate_fn)r   r   r    r!   rF   )r+   datasetkwargsr/   r/   r0   r@      s   z!MockDataModule._create_dataloader)r   NNNNr   r   r   r   Nr   r   r   r   TF)r1   )__name__
__module____qualname____doc__intr   r   boolr%   strr=   r   rB   r   rD   rE   r   r@   __classcell__r/   r/   r-   r0   r      sd    		

A&r   c                       s   e Zd ZdZ	ddededededdf
 fd	d
ZdefddZdedej	fddZ
deeejf fddZdd Zdd Z  ZS )r9   a  
    A mock dataset for AVLM, generating synthetic multimodal data.

    Attributes:
        tokenizer: Tokenizer for text inputs.
        image_processor: Processor for image inputs.
        audio_processor: Processor for audio inputs.
        name (str): Name of the dataset ('train', 'valid', 'test').
        num_samples (int): Number of samples in the dataset.
        seq_length (int): Sequence length for text tokens.
        seed (int): Random seed for reproducibility.
    *   r5   r6   r   seedr3   Nc
                    s   t    || _|| _|j| _|j}
|
d |
d | _| _|| _|	| _	t
j| jt
jd| _t
j| jt
jd| _|| _|| _|| _|| _|| _dS )a  
        Initializes the mock dataset with synthetic multimodal data.

        Args:
            tokenizer: Tokenizer for text inputs.
            image_processor: Processor for image inputs.
            audio_processor: Processor for audio inputs.
            name (str): Dataset name ('train', 'valid', 'test').
            num_samples (int): Total number of samples in the dataset.
            seq_length (int): Sequence length for text tokens.
            seed (int): Random seed for data generation.
        heightwidthdtypeN)r$   r%   r5   r   
vocab_size	crop_sizeimage_heightimage_widthlengthrR   torchonesfloat	loss_maskarangeint64position_idsr&   r(   r)   r   r   )r+   r&   r(   r)   r   r   r5   r6   r   rR   rX   r-   r/   r0   r%      s   

z_MockAVLMDataset.__init__c                 C   s   | j S )zy
        Returns the length of the dataset.

        Returns:
            int: Number of samples in the dataset.
        )r[   rA   r/   r/   r0   __len__   s   z_MockAVLMDataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )z
        Generates synthetic text data.

        Args:
            idx (int): Index of the sample.

        Returns:
            np.ndarray: Synthetic text token IDs.
        rR   sizerV   )nprandomdefault_rngrR   integersrW   r   ra   )r+   rd   np_genr/   r/   r0   	_get_text  s   
z_MockAVLMDataset._get_textc                 C   s  t jj| j| d}d}d}t|j| j| jd gt j	d}dd| j
 gdd| j
 gg}tj||d d |d d < tj||d d |d d < dd| j gd	d	| j gg}tj||d d |d d < tj||d d |d d < | }|d
d }|dd
 }tjt|tjd}	|t|jd| j| jgt jdg }
g }g }|
D ]}| jj|dddd d }|| ||jd  qt|}tj|tjd}tj|| j| jgg tjd}td}t|jdd||gd}t|jd||gd}||| j|	| j|||||d
S )z
        Generates a synthetic multimodal sample.

        Args:
            idx (int): Index of the sample.

        Returns:
            Dict[str, torch.Tensor]: A dictionary containing synthetic tokens, images, and metadata.
        re   r      rf      i  r   i  i  NrU      ptF)return_tensors
do_rescalepixel_valuesgKAg      g      ?)rg   i>  )
tokenslabelsr_   attention_maskrb   imagesimage_sizesnum_image_tilesaudiosaudio_lengths) rh   ri   rj   rR   r\   
from_numpyrk   rW   r   ra   r   r   token_idr   r   cloner]   lenlongrY   rZ   float32r(   
preprocessappendshapestacktensorrM   uniformr_   rb   )r+   rd   rl   
num_images
num_audiosrv   images_tokens_indexaudios_tokens_indexrw   rx   ry   processed_imagesr{   imageprocessed_imagerz   audio_max_lengthr|   r}   r/   r/   r0   __getitem__  sX   "

z_MockAVLMDataset.__getitem__c                 C   s   t j|}|d  jdg|d jdd R  |d< |d  |d< |d  jdg|d jdd R  |d< |d  jdg|d jdd R  |d< |d  |d< |S )	z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        ry   rp   r   Nr{   rz   r|   r}   )r   
dataloaderdefault_collate
contiguousviewr   flatten)r+   batchcollated_batchr/   r/   r0   _collate_fnR  s   ,&,z_MockAVLMDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   )r+   r   r/   r/   r0   rF   a  s   
z_MockAVLMDataset.collate_fn)rQ   )rI   rJ   rK   rL   rO   rM   r%   rc   rh   ndarrayrm   r   r\   Tensorr   r   rF   rP   r/   r/   r-   r0   r9      s&    	
,	@r9   )typingr   r   r   lightning.pytorchpytorchplnumpyrh   r\   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar   r	   transformersr
   /nemo.collections.multimodal.data.energon.configr   r   nemo.lightning.pytorch.pluginsr   LightningDataModuler   r9   r/   r/   r/   r0   <module>   s    &