from typing import TYPE_CHECKING, Dict, List, Optional

import lightning.pytorch as pl
import numpy as np
import torch
from lightning.pytorch.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
from torch.utils import data
from torch.utils.data import DataLoader, Dataset

from nemo.lightning.pytorch.plugins import MegatronDataSampler
from nemo.utils.import_utils import safe_import

_, HAVE_TE = safe_import("transformer_engine")

if TYPE_CHECKING:
    from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec


class MockDataModule(pl.LightningDataModule):
    """PyTorch Lightning-compatible data module for testing pre-training and fine-tuning workloads.

    MockDataModule will generate random token indices to simulate a dataset.

    Args:
        seq_length (int): Sequence length.
        tokenizer (Optional["TokenizerSpec"]): An instance of a TokenizerSpec object.
        micro_batch_size (int): Batch size per GPU.
        global_batch_size (int): Global batch size.
        rampup_batch_size (Optional[List[int]]): Ramp-up batch size, in the format
            [start_global_batch_size, batch_size_increment, rampup_samples].
        num_workers (int): See ``torch.utils.data.DataLoader`` documentation.
        pin_memory (bool): See ``torch.utils.data.DataLoader`` documentation.
        persistent_workers (bool): See ``torch.utils.data.DataLoader`` documentation.
        num_train_samples (Optional[int]): The number of samples to use for training, defaults to total
            train steps times global batch size.
        num_val_samples (Optional[int]): The number of samples to use for validation, defaults to total
            validation steps times global batch size.
        num_test_samples (Optional[int]): The number of samples to use for testing, defaults to total
            test steps times global batch size.
    """

    def __init__(
        self,
        seq_length: int = 2048,
        tokenizer: Optional["TokenizerSpec"] = None,
        micro_batch_size: int = 4,
        global_batch_size: int = 8,
        rampup_batch_size: Optional[List[int]] = None,
        num_train_samples: int = 10_000_000,
        num_val_samples: int = 10_000,
        num_test_samples: int = 10_000,
        num_workers: int = 8,
        pin_memory: bool = True,
        persistent_workers: bool = False,
        create_attention_mask: bool = False,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
    ):
        super().__init__()
        self.seq_length = seq_length
        self.num_train_samples = num_train_samples
        self.num_val_samples = num_val_samples
        self.num_test_samples = num_test_samples
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.persistent_workers = persistent_workers
        # Fall back to an explicit attention mask when Transformer Engine is unavailable.
        self.create_attention_mask = create_attention_mask or not HAVE_TE

        if tokenizer is None:
            from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

            self.tokenizer = get_nmt_tokenizer(
                "megatron", "GPT2BPETokenizer", vocab_file=vocab_file, merges_file=merges_file
            )
        else:
            self.tokenizer = tokenizer

        self.data_sampler = MegatronDataSampler(
            seq_len=self.seq_length,
            micro_batch_size=micro_batch_size,
            global_batch_size=global_batch_size,
            rampup_batch_size=rampup_batch_size,
        )

    def setup(self, stage: str = "") -> None:
        """
        Setup the data module.
        """
        self._train_ds = _MockGPTDataset(
            self.tokenizer, "train", self.num_train_samples, self.seq_length, create_attention_mask=self.create_attention_mask
        )
        self._validation_ds = _MockGPTDataset(
            self.tokenizer, "valid", self.num_val_samples, self.seq_length, create_attention_mask=self.create_attention_mask
        )
        self._test_ds = _MockGPTDataset(
            self.tokenizer, "test", self.num_test_samples, self.seq_length, create_attention_mask=self.create_attention_mask
        )

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        """
        Get the train dataloader.
        """
        if not hasattr(self, "_train_ds"):
            self.setup()
        return self._create_dataloader(self._train_ds)

    def val_dataloader(self) -> EVAL_DATALOADERS:
        """
        Get the validation dataloader.
        """
        if not hasattr(self, "_validation_ds"):
            self.setup()
        return self._create_dataloader(self._validation_ds)

    def test_dataloader(self) -> EVAL_DATALOADERS:
        """
        Get the test dataloader.
        """
        if not hasattr(self, "_test_ds"):
            self.setup()
        return self._create_dataloader(self._test_ds)

    def _create_dataloader(self, dataset, **kwargs) -> DataLoader:
        return DataLoader(
            dataset,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            persistent_workers=self.persistent_workers,
            collate_fn=dataset.collate_fn,
            **kwargs,
        )

    def reconfigure_limit_batches(self):
        """
        Reconfigure trainer.limit_train_batches and trainer.limit_val_batches in terms of num of microbatches.
        """
        from nemo.collections.llm.gpt.data.utils import _reconfigure_limit_batches

        self.trainer.limit_train_batches = _reconfigure_limit_batches(self.trainer.limit_train_batches, self._train_ds)
        self.trainer.limit_val_batches = _reconfigure_limit_batches(self.trainer.limit_val_batches, self._validation_ds)

        try:
            from megatron.core.num_microbatches_calculator import get_num_microbatches
        except (ImportError, ModuleNotFoundError):
            from apex.transformer.pipeline_parallel.utils import get_num_microbatches

        # Make the number of sanity-check validation steps a multiple of the number of microbatches.
        self.trainer.num_sanity_val_steps *= get_num_microbatches()


class _MockGPTDataset(Dataset):
    def __init__(
        self,
        tokenizer: "TokenizerSpec",
        name: str,
        num_samples: int,
        seq_length: int,
        seed: int = 42,
        create_attention_mask: bool = False,
    ) -> None:
        super().__init__()
        self.name = name
        self.seq_length = seq_length
        self.vocab_size = tokenizer.vocab_size
        self.length = num_samples
        self.seed = seed
        self.create_attention_mask = create_attention_mask

        if create_attention_mask:
            self.attention_mask = torch.tril(torch.ones((self.seq_length, self.seq_length), device="cpu")).unsqueeze(0)
            # Boolean causal mask: True marks positions that must not be attended to.
            self.attention_mask = self.attention_mask < 0.5

        self.loss_mask = torch.ones(self.seq_length, dtype=torch.float)
        self.position_ids = torch.arange(self.seq_length, dtype=torch.int64)

    def __len__(self) -> int:
        return self.length

    def _get_text(self, idx: int) -> np.ndarray:
        np_gen = np.random.default_rng(seed=(self.seed + idx))
        return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        # Generate seq_length + 1 random tokens so that inputs and labels are shifted by one position.
        np_gen = np.random.default_rng(seed=(self.seed + idx))
        tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64))

        batch = {
            "tokens": tokens[:-1],
            "labels": tokens[1:],
            "loss_mask": self.loss_mask,
            "position_ids": self.position_ids,
        }

        if self.create_attention_mask:
            batch["attention_mask"] = self.attention_mask

        return batch

    def _collate_fn(self, batch):
        """
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        """
        return data.dataloader.default_collate(batch)

    def collate_fn(self, batch):
        """Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        """
        return self._collate_fn(batch)
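

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original NeMo module).
# It shows how MockDataModule can be exercised on its own: build the module,
# materialize the mock datasets, and pull one collated sample. It assumes the
# default "megatron" GPT2BPETokenizer assets can be resolved by
# get_nmt_tokenizer; if they cannot, pass an explicit `tokenizer=` instead.
# Note that outside a Lightning Trainer the MegatronDataSampler is never
# applied, so the plain DataLoader below falls back to batch_size=1.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    dm = MockDataModule(
        seq_length=128,
        micro_batch_size=2,
        global_batch_size=2,
        num_workers=0,
        pin_memory=False,
    )
    dm.setup()
    sample = next(iter(dm.train_dataloader()))
    # Expected keys: tokens, labels, loss_mask, position_ids
    # (plus attention_mask when create_attention_mask=True or Transformer Engine is missing).
    print({key: tuple(value.shape) for key, value in sample.items()})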