o
    wi7!                     @   s   d dl mZmZmZmZ d dlmZ d dlZ	d dl
Z
d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ ed\ZZerJd d	lmZ G d
d dejZG dd deZdS )    )TYPE_CHECKINGDictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)MegatronDataSampler)safe_importtransformer_engine)TokenizerSpecc                       s   e Zd ZdZ															d)d
ededed dededeee  dededededededef fddZd*de	ddfddZ
defdd Zdefd!d"Zdefd#d$Zdefd%d&Zd'd( Z  ZS )+MockDataModulezMock data module for testing      N      '  TF
seq_lengthseq_length_dec	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workerscreate_attention_maskc                    s   ddl m} |   t   || _|| _|| _|| _	|| _
|| _|	| _|
| _|| _|| _|p3t | _ddlm} |pA|dd| _t| j|||d| _|   d S )Nr   )CallbackGroup)get_nmt_tokenizermegatronBertWordPieceCase)seq_lenr   r   r   )nemo.lightning.callback_groupr"   get_instanceon_dataloader_init_startsuper__init__r   r   r   r   r   r   r   r   r   r    HAVE_TEr!   3nemo.collections.nlp.modules.common.tokenizer_utilsr#   r   r   data_sampleron_dataloader_init_end)selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   	__class__ ^/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/t5/data/mock.pyr+   $   s.   
zMockDataModule.__init__ stagereturnc                 C   sR   t | jd| j| j| j| _t | jd| j| j| j| _t | jd| j| j| j| _	dS )zSetup the datasetstrainvalidtestN)
_MockT5Datasetr   r   r   r   	_train_dsr   _validation_dsr   _test_ds)r0   r6   r3   r3   r4   setupQ   s   
zMockDataModule.setupc                 C      t | ds	|   | | jS )zDataloader for training setr<   )hasattrr?   _create_dataloaderr<   r0   r3   r3   r4   train_dataloader]      
zMockDataModule.train_dataloaderc                 C   r@   )zDataloader for validation setr=   )rA   r?   rB   r=   rC   r3   r3   r4   val_dataloaderc   rE   zMockDataModule.val_dataloaderc                 C   r@   )zDataloader for test setr>   )rA   r?   rB   r>   rC   r3   r3   r4   test_dataloaderi   rE   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )N)r   r   r    
collate_fn)r	   r   r   r    rH   )r0   datasetkwargsr3   r3   r4   rB   o   s   z!MockDataModule._create_dataloaderc              	   C   s~   ddl m} || jj| j| j_|| jj| j| j_zddlm} W n t	t
fy3   ddlm} Y nw | j j| 9  _dS )zx
        Reconfigure trainer.limit_train_batches and trainer.limit_val_batches in terms of num of microbatches.
        r   )_reconfigure_limit_batches)get_num_microbatchesN)#nemo.collections.llm.gpt.data.utilsrK   trainerlimit_train_batchesr<   limit_val_batchesr=   )megatron.core.num_microbatches_calculatorrL   ImportErrorModuleNotFoundError(apex.transformer.pipeline_parallel.utilsnum_sanity_val_steps)r0   rK   rL   r3   r3   r4   reconfigure_limit_batchesy   s   
z(MockDataModule.reconfigure_limit_batches)r   r   Nr   r   Nr   r   r   r   TFF)r5   )__name__
__module____qualname____doc__intr   r   boolr+   strr?   r   rD   r   rF   rG   r	   rB   rV   __classcell__r3   r3   r1   r4   r   !   s`    
	
-
r   c                       s   e Zd Z		ddddedededed	ed
eddf fddZdefddZdedej	fddZ
deeejf fddZdd Zdd Z  ZS )r;   *   Fr   r   namenum_samplesr   r   seedr!   r7   Nc                    s   t    || _|| _|| _|j| _|| _|| _|| _t	j
| jdd| _t	j
| jdd| _| jdk  | _| jdk  | _t	j
| jt	jd| _d S )Ncpu)deviceg      ?)dtype)r*   r+   r`   r   r   
vocab_sizelengthrb   r!   torchonesmask_encodermask_decoderfloat	loss_mask)r0   r   r`   ra   r   r   rb   r!   r1   r3   r4   r+      s   

z_MockT5Dataset.__init__c                 C   s   | j S )N)rg   rC   r3   r3   r4   __len__   s   z_MockT5Dataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )Nrb   sizere   )nprandomdefault_rngrb   integersrf   r   int64)r0   ro   np_genr3   r3   r4   	_get_text   s   z_MockT5Dataset._get_textc                 C   s   t jj| j| d}t|j| j| jgt j	d}t|j| j| j
gt j	d}t|j| j| j
gt j	d}|||| jd| j| jd}|S )Nrp   rq   r   )text_enctext_declabelsrm   	truncatedenc_maskdec_mask)rs   rt   ru   rb   rh   
from_numpyrv   rf   r   rw   r   rm   rj   rk   )r0   ro   rx   encoder_inputdecoder_inputr|   batchr3   r3   r4   __getitem__   s   
z_MockT5Dataset.__getitem__c                 C   s   t j|S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        )r   
dataloaderdefault_collater0   r   r3   r3   r4   _collate_fn   s   z_MockT5Dataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   r   r3   r3   r4   rH      s   
z_MockT5Dataset.collate_fn)r_   F)rW   rX   rY   r]   r[   r\   r+   rn   rs   ndarrayry   r   rh   Tensorr   r   rH   r^   r3   r3   r1   r4   r;      s2    	r;   )typingr   r   r   r   lightning.pytorchpytorchplnumpyrs   rh   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar	   r
   nemo.lightning.pytorch.pluginsr   nemo.utils.import_utilsr   _r,   1nemo.collections.common.tokenizers.tokenizer_specr   LightningDataModuler   r;   r3   r3   r3   r4   <module>   s   p