o
    }oiv                      @   s   d dl mZmZmZmZ d dlmZ d dlZ	d dl
Z
d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ ed\ZZerJd d	lmZ G d
d dejZG dd deZdS )    )TYPE_CHECKINGDictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)MegatronDataSampler)safe_importtransformer_engine)TokenizerSpecc                       s   e Zd ZdZ															d)d
ededed dededeee  dededededededef fddZd*de	ddfddZ
defdd Zdefd!d"Zdefd#d$Zdefd%d&Zd'd( Z  ZS )+MockDataModulezMock data module for testing      N      '  TF
seq_lengthseq_length_dec	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workerscreate_attention_maskc                    s   t    || _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|p't | _ddlm} |p5|dd| _t| j|||d| _d S )Nr   )get_nmt_tokenizermegatronBertWordPieceCase)seq_lenr   r   r   )super__init__r   r   r   r   r   r   r   r   r   r    HAVE_TEr!   3nemo.collections.nlp.modules.common.tokenizer_utilsr"   r   r   data_sampler)selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/t5/data/mock.pyr'   $   s(   
zMockDataModule.__init__ stagereturnc                 C   sR   t | jd| j| j| j| _t | jd| j| j| j| _t | jd| j| j| j| _	dS )zSetup the datasetstrainvalidtestN)
_MockT5Datasetr   r   r   r   	_train_dsr   _validation_dsr   _test_ds)r+   r1   r.   r.   r/   setupK   s   
zMockDataModule.setupc                 C      t | ds	|   | | jS )zDataloader for training setr7   )hasattrr:   _create_dataloaderr7   r+   r.   r.   r/   train_dataloaderW      
zMockDataModule.train_dataloaderc                 C   r;   )zDataloader for validation setr8   )r<   r:   r=   r8   r>   r.   r.   r/   val_dataloader]   r@   zMockDataModule.val_dataloaderc                 C   r;   )zDataloader for test setr9   )r<   r:   r=   r9   r>   r.   r.   r/   test_dataloaderc   r@   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )N)r   r   r    
collate_fn)r	   r   r   r    rC   )r+   datasetkwargsr.   r.   r/   r=   i   s   z!MockDataModule._create_dataloaderc              	   C   s~   ddl m} || jj| j| j_|| jj| j| j_zddlm} W n t	t
fy3   ddlm} Y nw | j j| 9  _dS )zx
        Reconfigure trainer.limit_train_batches and trainer.limit_val_batches in terms of num of microbatches.
        r   )_reconfigure_limit_batches)get_num_microbatchesN)#nemo.collections.llm.gpt.data.utilsrF   trainerlimit_train_batchesr7   limit_val_batchesr8   )megatron.core.num_microbatches_calculatorrG   ImportErrorModuleNotFoundError(apex.transformer.pipeline_parallel.utilsnum_sanity_val_steps)r+   rF   rG   r.   r.   r/   reconfigure_limit_batchess   s   
z(MockDataModule.reconfigure_limit_batches)r   r   Nr   r   Nr   r   r   r   TFF)r0   )__name__
__module____qualname____doc__intr   r   boolr'   strr:   r   r?   r   rA   rB   r	   r=   rQ   __classcell__r.   r.   r,   r/   r   !   s`    
	
'
r   c                       s   e Zd Z		ddddedededed	ed
eddf fddZdefddZdedej	fddZ
deeejf fddZdd Zdd Z  ZS )r6   *   Fr   r   namenum_samplesr   r   seedr!   r2   Nc                    s   t    || _|| _|| _|j| _|| _|| _|| _t	j
| jdd| _t	j
| jdd| _| jdk  | _| jdk  | _t	j
| jt	jd| _d S )Ncpu)deviceg      ?)dtype)r&   r'   r[   r   r   
vocab_sizelengthr]   r!   torchonesmask_encodermask_decoderfloat	loss_mask)r+   r   r[   r\   r   r   r]   r!   r,   r.   r/   r'      s   

z_MockT5Dataset.__init__c                 C   s   | j S )N)rb   r>   r.   r.   r/   __len__   s   z_MockT5Dataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )Nr]   sizer`   )nprandomdefault_rngr]   integersra   r   int64)r+   rj   np_genr.   r.   r/   	_get_text   s   z_MockT5Dataset._get_textc                 C   s   t jj| j| d}t|j| j| jgt j	d}t|j| j| j
gt j	d}t|j| j| j
gt j	d}|||| jd| j| jd}|S )Nrk   rl   r   )text_enctext_declabelsrh   	truncatedenc_maskdec_mask)rn   ro   rp   r]   rc   
from_numpyrq   ra   r   rr   r   rh   re   rf   )r+   rj   rs   encoder_inputdecoder_inputrw   batchr.   r.   r/   __getitem__   s   
z_MockT5Dataset.__getitem__c                 C   s   t j|S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        )r   
dataloaderdefault_collater+   r~   r.   r.   r/   _collate_fn   s   z_MockT5Dataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   r   r.   r.   r/   rC      s   
z_MockT5Dataset.collate_fn)rZ   F)rR   rS   rT   rX   rV   rW   r'   ri   rn   ndarrayrt   r   rc   Tensorr   r   rC   rY   r.   r.   r,   r/   r6      s2    	r6   )typingr   r   r   r   lightning.pytorchpytorchplnumpyrn   rc   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar	   r
   nemo.lightning.pytorch.pluginsr   nemo.utils.import_utilsr   _r(   1nemo.collections.common.tokenizers.tokenizer_specr   LightningDataModuler   r6   r.   r.   r.   r/   <module>   s   j