o
    wi$                     @   s   d dl mZmZmZ d dlmZ d dlZd dl	Z	d dl
mZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ G d	d
 d
ejZG dd deZdS )    )DictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)IMAGE_TOKEN_INDEX)MegatronDataSampler)loggingc                       s   e Zd Z														d%ded	ee d
ededededeee  dededededededef fddZd&deddfddZ	de
fddZdefdd Zdefd!d"Zdefd#d$Z  ZS )'MockDataModule   N      逖 TF
seq_lengthdecoder_seq_length	tokenizerimage_processormicro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workerspacked_sequencec                    s   t    || _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|d u s.|d u rStd ddlm} ddlm} |d}|pK|ddd| _|pQ|j| _t| j| j|||d| _d S )	NzQProcessor or tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.r   )AutoProcessor)AutoTokenizerzllava-hf/llava-1.5-7b-hfF)use_fast)seq_lendecoder_seq_lenr   r   r   )super__init__r   r$   r   r   r   r   r   r   r   r   r   r   warningtransformersr    =nemo.collections.common.tokenizers.huggingface.auto_tokenizerr!   from_pretrainedr   r   r   data_sampler)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	processor	__class__ `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/neva/data/mock.pyr&      s4   


zMockDataModule.__init__ stagereturnc                 C   s   | j }| jr| jdkr|| j }td| d t| j| jd| j|| jd| _	t| j| jd| j
|| jd| _t| j| jd| j|| jd| _d S )N   zPacked sequence is used with mock dataset. Sequence length for each sample is update to `seq_length // self.micro_batch_size = z`!train)r   validtest)r   r   r   r   r'   _MockNevaDatasetr   r   r   	_train_dsr   _validation_dsr   _test_ds)r,   r3   r   r0   r0   r1   setupN   s@   
zMockDataModule.setupc                 C      t | ds	|   | | jS )Nr:   )hasattrr=   _create_dataloaderr:   r,   r0   r0   r1   train_dataloadero      
zMockDataModule.train_dataloaderc                 C   r>   )Nr;   )r?   r=   r@   r;   rA   r0   r0   r1   val_dataloadert   rC   zMockDataModule.val_dataloaderc                 C   r>   )Nr<   )r?   r=   r@   r<   rA   r0   r0   r1   test_dataloadery   rC   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )N)r   r   r   
collate_fn)r   r   r   r   rF   )r,   datasetkwargsr0   r0   r1   r@   ~   s   z!MockDataModule._create_dataloader)r   NNNr   r   Nr   r   r   r   TFF)r2   )__name__
__module____qualname__intr   r   boolr&   strr=   r   rB   r   rD   rE   r   r@   __classcell__r0   r0   r.   r1   r      sb    
	
/!r   c                       s   e Zd Z			ddededededed	d
f fddZd	efddZded	ej	fddZ
d	eeejf fddZdd Zdd Z  ZS )r9   *   F@  namenum_samplesr   seedr   r4   Nc	           
         s   t    || _|| _|j| _|j}	|	d |	d | _| _|| _|| _	|| _
|| _tj| jd | tjd| _tj| jtjd| _d S )Nheightwidthr5   )dtype)r%   r&   rR   r   
vocab_size	crop_sizeimage_heightimage_widthlengthrT   r   num_image_embeddings_per_tiletorchonesfloat	loss_maskarangeint64position_ids)
r,   r   r   rR   rS   r   rT   r   r]   rY   r.   r0   r1   r&      s   
z_MockNevaDataset.__init__c                 C   s   | j S )N)r\   rA   r0   r0   r1   __len__   s   z_MockNevaDataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )NrT   sizerW   )nprandomdefault_rngrT   integersrX   r   rc   )r,   rf   np_genr0   r0   r1   	_get_text   s   z_MockNevaDataset._get_textc                 C   s   t jj| j| d}t|j| j| jd | j	 gt j
d}t|d< | }t|jd| j| jgt jd}|d d }|dd  }|||| j| jdS )Nrg      rh      r5   )mediatokenslabelsra   rd   )rj   rk   rl   rT   r^   
from_numpyrm   rX   r   r]   rc   r
   clonerZ   r[   float32ra   rd   )r,   rf   rn   rt   ru   imagesr0   r0   r1   __getitem__   s"    z_MockNevaDataset.__getitem__c              	   C   s   t j|}d|d< | jr_ddlm} |d }|jd }| j}tj	d|d | |tj
|jd}tj	d|d | |tj
|jd}d}	||||||||	d	}
|
|d
< dD ]}|| dd||< qR|S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        Nattention_maskr   )PackedSeqParamsrt   r5   )steprW   devicethd)cu_seqlens_qcu_seqlens_kvcu_seqlens_q_paddedcu_seqlens_kv_paddedmax_seqlen_qmax_seqlen_kv
qkv_formatpacked_seq_params)rt   ru   ra   rd   rr   )r   
dataloaderdefault_collater   megatron.core.packed_seq_paramsr|   shaper   r^   rb   int32r~   reshape)r,   batchcollated_batchr|   rt   
batch_sizevalid_seqlen
cu_seqlenscu_seqlens_paddedr   r   keyr0   r0   r1   _collate_fn   s6   
	z_MockNevaDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   )r,   r   r0   r0   r1   rF      s   
z_MockNevaDataset.collate_fn)rP   FrQ   )rI   rJ   rK   rN   rL   rM   r&   re   rj   ndarrayro   r   r^   Tensorrz   r   rF   rO   r0   r0   r.   r1   r9      s,    
$r9   )typingr   r   r   lightning.pytorchpytorchplnumpyrj   r^   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar   r	   0nemo.collections.vlm.neva.data.multimodal_tokensr
   nemo.lightning.pytorch.pluginsr   
nemo.utilsr   LightningDataModuler   r9   r0   r0   r0   r1   <module>   s   k