o
    }oi"                     @   s   d dl mZmZmZmZ d dlmZ d dlZ	d dl
Z
d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ G dd	 d	ejZG d
d deZdS )    )AnyDictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)MegatronDataSampler)loggingc                       s   e Zd ZdZ														d(d	ed
ee dededededeee  dededededededee f fddZ	d)de
ddfddZdefddZdefd d!Zdefd"d#Zdefd$d%Zdee
ef fd&d'Z  ZS )*MockDataModulezO
    Mock data module with data sampling and preprocessing configurations.
    M   N      逖 TF
seq_lengthdecoder_seq_length	tokenizerimage_processormicro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workerstask_encoderc                    s   t    || _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|| _|| _|du s4|du rWtd ddlm} ddlm} |d}|pO|d| _|pU|j| _t| j| j|||d| _dS )a  
        Initializes the mock data module with data sampling and preprocessing configurations.
        task_encoder: This Mock data module uses Energon Task encoder if provided.

        Args:
            seq_length (int): Maximum sequence length for tokens.
            decoder_seq_length (Optional[int]): Sequence length for the decoder. Used by Megatron Sampler.
            tokenizer: Tokenizer for text processing.
            image_processor: Processor for image preprocessing.
            micro_batch_size (int): Batch size for training and validation.
            global_batch_size (int): Total batch size across GPUs.
            rampup_batch_size (Optional[List[int]]): Batch size ramp-up schedule. Used by Megatron Sampler.
            num_train_samples (int): Number of training samples.
            num_val_samples (int): Number of validation samples.
            num_test_samples (int): Number of testing samples.
            num_workers (int): Number of workers for data loading.
            pin_memory (bool): Whether to pin memory for data loading.
            persistent_workers (bool): Whether workers should remain persistent.
            task_encoder: Task encoder for Energon tasks.
        NzVProcessor or tokenizer are not provided! Fall back to `openai/clip-vit-large-patch14`.r   )AutoProcessor)AutoTokenizerzopenai/clip-vit-large-patch14)seq_lendecoder_seq_lenr   r   r   )super__init__r   r   r   r   r   r   r   r   r   r   r   r   r   r   warningtransformersr    =nemo.collections.common.tokenizers.huggingface.auto_tokenizerr!   from_pretrainedr   data_sampler)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	processor	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/clip/data/mock.pyr%   !   s8   
%

zMockDataModule.__init__ stagereturnc                 C   sX   t | j| jd| j| j| jd| _t | j| jd| j| j| _t | j| jd| j	| j| _
d S )Ntrain)r   validtest)_MockClipDatasetr   r   r   r   r   	_train_dsr   _validation_dsr   _test_ds)r+   r2   r/   r/   r0   setupf   s   
zMockDataModule.setupc                 C      t | ds	|   | | jS )Nr8   )hasattrr;   _create_dataloaderr8   r+   r/   r/   r0   train_dataloaderw      
zMockDataModule.train_dataloaderc                 C   r<   )Nr9   )r=   r;   r>   r9   r?   r/   r/   r0   val_dataloader}   rA   zMockDataModule.val_dataloaderc                 C   r<   )Nr:   )r=   r;   r>   r:   r?   r/   r/   r0   test_dataloader   rA   zMockDataModule.test_dataloaderc                 K   s&   t |f| j| j| j| j|jd|S )N)r   r   r   
batch_size
collate_fn)r	   r   r   r   r   rE   )r+   datasetkwargsr/   r/   r0   r>      s   z!MockDataModule._create_dataloaderc                 C   s   t d i S )aT  
        Save the state of the data module.

        This method is called when saving a checkpoint. It generates and saves the state of the data module,
        including the state of the dataloader and the number of consumed samples.

        Returns:
        Dict[str, Any]: A dictionary containing the state of the data module.
        zHtrainer object not connected to data module object returning empty state)r   r&   r?   r/   r/   r0   
state_dict   s   
zMockDataModule.state_dict)r   NNNr   r   Nr   r   r   r   TFN)r1   )__name__
__module____qualname____doc__intr   r   boolr   r%   strr;   r   r@   r   rB   rC   r	   r>   r   rH   __classcell__r/   r/   r-   r0   r      sf    
	
Er   c                       sn   e Zd Z		ddededededdf
 fdd	Zdefd
dZdeeej	f fddZ
dd Zdd Z  ZS )r7   *   Nnamenum_samplesr   seedr3   c           	         sP   t    || _|| _|j| _|j}|d |d | _| _|| _|| _	|| _
d S )Nheightwidth)r$   r%   rR   r   
vocab_size	crop_sizeimage_heightimage_widthlengthrT   r   )	r+   r   r   rR   rS   r   rT   r   rX   r-   r/   r0   r%      s   


z_MockClipDataset.__init__c                 C   s   | j S )N)r[   r?   r/   r/   r0   __len__   s   z_MockClipDataset.__len__c                 C   sx   t jj| j| d}t|j| j| jgt j	d}t|jd| j
| jgt jd}| jd ur7| j|ddS ||dS )N)rT   )sizedtype   zThis is Random Mock Text)imagetxt)imagescaptions)nprandomdefault_rngrT   torch
from_numpyintegersrW   r   int64rY   rZ   float32r   encode_sample)r+   idxnp_gentokensrb   r/   r/   r0   __getitem__   s    
z_MockClipDataset.__getitem__c                 C   s   t j|}d|d< |S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        Nattention_mask)r   
dataloaderdefault_collate)r+   batchcollated_batchr/   r/   r0   _collate_fn   s   z_MockClipDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )rv   )r+   rt   r/   r/   r0   rE      s   
z_MockClipDataset.collate_fn)rQ   N)rI   rJ   rK   rO   rM   r%   r\   r   rg   Tensorrp   rv   rE   rP   r/   r/   r-   r0   r7      s$    		r7   )typingr   r   r   r   lightning.pytorchpytorchplnumpyrd   rg   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar	   r
   nemo.lightning.pytorch.pluginsr   
nemo.utilsr   LightningDataModuler   r7   r/   r/   r/   r0   <module>   s    	