o
    }oi[/                     @   s   d dl mZmZmZ d dlmZ d dlZd dl	Z	d dl
mZmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ G dd dejZ G dd deZ!dS )    )DictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)AutoProcessorLlavaNextConfig)AutoTokenizer)get_number_of_features)IMAGE_TOKEN_INDEX)MegatronDataSampler)loggingc                       s   e Zd ZdZ													d#d	ed
ee dededeee  dedededededef fddZd$de	ddfddZ
defddZdefddZdefdd Zdefd!d"Z  ZS )%MockDataModulez
    A mock data module for LLaVA-Next training, validation, and testing.

    Provides datasets and data loaders for training, validation, and testing phases.
    Includes data sampling and preprocessing for multimodal tasks.
       N      逖 TF
seq_lengthdecoder_seq_lengthmicro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workersc                    s   t    || _|| _|| _|	| _|
| _|| _|| _|| _	|| _
|| _d}d}|du s/|du r;td d}t|}|p@t|| _|pF|j| _t| j| j|||d| _dS )a  
        Initializes the mock data module with data sampling and preprocessing configurations.

        Args:
            seq_length (int): Maximum sequence length for tokens.
            decoder_seq_length (Optional[int]): Sequence length for the decoder.
            tokenizer: Tokenizer for text processing.
            image_processor: Processor for image preprocessing.
            micro_batch_size (int): Batch size per GPU.
            global_batch_size (int): Total batch size across GPUs.
            rampup_batch_size (Optional[List[int]]): Batch size ramp-up schedule.
            num_train_samples (int): Number of training samples.
            num_val_samples (int): Number of validation samples.
            num_test_samples (int): Number of testing samples.
            num_workers (int): Number of workers for data loading.
            pin_memory (bool): Whether to pin memory for data loaders.
            persistent_workers (bool): Whether to keep workers alive after the first iteration.
         NzYProcessor or tokenizer are not provided! Fall back to `llava-hf/llava-v1.6-vicuna-7b-hf`.z llava-hf/llava-v1.6-vicuna-7b-hf)seq_lendecoder_seq_lenr   r   r   )super__init__r   r#   r   r   r   r   r   r    r   r   r   warningr
   from_pretrainedr   	tokenizerimage_processorr   data_sampler)selfr   r   r(   r)   r   r   r   r   r   r   r   r   r    
model_name	processor	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llava_next/data/mock.pyr%   (   s8   
"
zMockDataModule.__init__r!   stagereturnc                 C   sR   t | j| jd| j| j| _t | j| jd| j| j| _t | j| jd| j| j| _	dS )z
        Sets up the training, validation, and testing datasets.

        Args:
            stage (str): Stage of the setup ('train', 'valid', 'test').
        trainvalidtestN)
_MockLlavaNextDatasetr(   r)   r   r   	_train_dsr   _validation_dsr   _test_ds)r+   r2   r0   r0   r1   setupi   s   
zMockDataModule.setupc                 C      t | ds	|   | | jS )zz
        Creates the training data loader.

        Returns:
            TRAIN_DATALOADERS: Training data loader.
        r8   )hasattrr;   _create_dataloaderr8   r+   r0   r0   r1   train_dataloaderz      
zMockDataModule.train_dataloaderc                 C   r<   )z}
        Creates the validation data loader.

        Returns:
            EVAL_DATALOADERS: Validation data loader.
        r9   )r=   r;   r>   r9   r?   r0   r0   r1   val_dataloader   rA   zMockDataModule.val_dataloaderc                 C   r<   )zw
        Creates the testing data loader.

        Returns:
            TEST_DATALOADERS: Testing data loader.
        r:   )r=   r;   r>   r:   r?   r0   r0   r1   test_dataloader   rA   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )a  
        Creates a generic data loader for the given dataset.

        Args:
            dataset: The dataset for which the data loader is created.
            **kwargs: Additional arguments for the DataLoader.

        Returns:
            DataLoader: The created data loader.
        )r   r   r    
collate_fn)r   r   r   r    rD   )r+   datasetkwargsr0   r0   r1   r>      s   z!MockDataModule._create_dataloader)r   NNNr   r   Nr   r   r   r   TF)r!   )__name__
__module____qualname____doc__intr   r   boolr%   strr;   r   r@   r   rB   rC   r   r>   __classcell__r0   r0   r.   r1   r       sV    	
	
Ar   c                       s   e Zd ZdZ	ddededededdf
 fd	d
ZdefddZdedej	fddZ
deeejf fddZdd Zdd Z  ZS )r7   a  
    A mock dataset for LLaVA-Next, generating synthetic multimodal data.

    Attributes:
        tokenizer: Tokenizer for text inputs.
        image_processor: Processor for image inputs.
        name (str): Name of the dataset ('train', 'valid', 'test').
        num_samples (int): Number of samples in the dataset.
        seq_length (int): Sequence length for text tokens.
        seed (int): Random seed for reproducibility.
    *   namenum_samplesr   seedr3   Nc                    s^   t    || _|| _|j| _|j}|d |d | _| _|| _|| _	|| _
|| _t | _dS )a  
        Initializes the mock dataset with synthetic multimodal data.

        Args:
            tokenizer: Tokenizer for text inputs.
            image_processor: Processor for image inputs.
            name (str): Dataset name ('train', 'valid', 'test').
            num_samples (int): Total number of samples in the dataset.
            seq_length (int): Sequence length for text tokens.
            seed (int): Random seed for data generation.
        heightwidthN)r$   r%   rP   r   
vocab_size	crop_sizeimage_heightimage_widthlengthrR   r(   r)   r   	hf_config)r+   r(   r)   rP   rQ   r   rR   rV   r.   r0   r1   r%      s   
z_MockLlavaNextDataset.__init__c                 C   s   | j S )zy
        Returns the length of the dataset.

        Returns:
            int: Number of samples in the dataset.
        )rY   r?   r0   r0   r1   __len__   s   z_MockLlavaNextDataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )z
        Generates synthetic text data.

        Args:
            idx (int): Index of the sample.

        Returns:
            np.ndarray: Synthetic text token IDs.
        rR   sizedtype)nprandomdefault_rngrR   integersrU   r   int64)r+   r\   np_genr0   r0   r1   	_get_text   s   
z_MockLlavaNextDataset._get_textc              	   C   sX  t jj| j| d}t|j| j| jd gt j	d}t
| j| j| j| j| jj| jjj}t|dd ttg| |dd gd}| }t|jd| j| jgt jd}|dd }|dd }t|}tj|tjd	}tj|tj	d	}	tjt|tjd	}
tj| j| jggtjd	}| jj|d
ddd d }|jd }|||||	|||
dS )z
        Generates a synthetic multimodal sample.

        Args:
            idx (int): Index of the sample.

        Returns:
            Dict[str, torch.Tensor]: A dictionary containing synthetic tokens, images, and metadata.
        r]      r^   N   r      )r`   ptF)return_tensors
do_rescalepixel_values)mediatokenslabels	loss_maskposition_idsimage_sizesnum_media_tilesattention_mask)ra   rb   rc   rR   torch
from_numpyrd   rU   r   re   r   rW   rX   rZ   image_grid_pinpointsvision_config
patch_sizeconcatenatetensorr   clonefloat32lenonesfloatarangelongr)   
preprocessshape)r+   r\   rf   rq   num_image_tokensrr   imagesr   rs   rt   rw   ru   image_arrayrv   r0   r0   r1   __getitem__   s>   ". 
z!_MockLlavaNextDataset.__getitem__c                 C   sh   t j|}|d  jdg|d jdd R  |d< |d  jdg|d jdd R  |d< |S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        rp   rk   ri   Nru   )r   
dataloaderdefault_collate
contiguousviewr   )r+   batchcollated_batchr0   r0   r1   _collate_fn&  s
   ,&z!_MockLlavaNextDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   )r+   r   r0   r0   r1   rD   3  s   
z _MockLlavaNextDataset.collate_fn)rO   )rG   rH   rI   rJ   rM   rK   r%   r[   ra   ndarrayrg   r   rx   Tensorr   r   rD   rN   r0   r0   r.   r1   r7      s&    $	/r7   )"typingr   r   r   lightning.pytorchpytorchplnumpyra   rx   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar   r	   transformersr
   r   =nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   +nemo.collections.vlm.llava_next.model.utilsr   0nemo.collections.vlm.neva.data.multimodal_tokensr   nemo.lightning.pytorch.pluginsr   
nemo.utilsr   LightningDataModuler   r7   r0   r0   r0   r1   <module>   s     