o
    wi%                     @   s   d dl mZmZmZmZ d dlmZ d dlZ	d dl
Z
d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ ed\ZZerJd d	lmZ G d
d dejZG dd deZdS )    )TYPE_CHECKINGDictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)MegatronDataSampler)safe_importtransformer_engine)TokenizerSpecc                       s   e Zd ZdZ																d*d
eded dededeee  dedededededededee dee f fddZ	d+deddfddZ
defd d!Zdefd"d#Zdefd$d%Zdefd&d'Zd(d) Z  ZS ),MockDataModulea  PyTorch Lightning-compatible data module for testing pre-training and fine-tuning workloads.
    MockDataModule will generate random token indices to simulate a dataset.
    Args:
        seq_length (int): Sequence length.
        tokenizer (Optional["TokenizerSpec"]): An instance of a TokenizerSpec object.
        micro_batch_size (int): Batch size per GPU.
        global_batch_size (int): Global batch size.
        rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of
            [start_global_batch_size, batch_size_increment, ramup_samples].
        num_workers (int): See ``torch.utils.data.DataLoader`` documentation.
        pin_memory (bool): See ``torch.utils.data.DataLoader`` documentation.
        persistent_workers (bool): See ``torch.utils.data.DataLoader`` documentation.
        num_train_samples (Optional[int]): The number of samples to use for training, defaults to total
            train steps times global batch size.
        num_val_samples (Optional[int]): The number of samples to use for validation, defaults to total
            validation steps times global batch size.
        num_test_samples (Optional[int]): The number of samples to use for testing, defaults to total
            test steps times global batch size.
       N      逖 '  TF
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workerscreate_attention_mask
vocab_filemerges_filec                    s   ddl m} |   t   || _|| _|| _|| _	|| _
|| _|	| _|
| _|| _|p0t | _|d u rFddlm} |dd||d| _n|| _t| j| j| j|d| _|   d S )Nr   )CallbackGroup)get_nmt_tokenizermegatronGPT2BPETokenizer)r!   r"   )seq_lenr   r   r   )nemo.lightning.callback_groupr#   get_instanceon_dataloader_init_startsuper__init__r   r   r   r   r   r   r   r   r   HAVE_TEr    3nemo.collections.nlp.modules.common.tokenizer_utilsr$   r   r   data_sampleron_dataloader_init_end)selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   	__class__ _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/mock.pyr,   6   s4   

zMockDataModule.__init__ stagereturnc                 C   sR   t | jd| j| j| j| _t | jd| j| j| j| _t | jd| j| j| j| _	dS )z(
        Setup the data module.
        trainvalidtestN)
_MockGPTDatasetr   r   r   r    	_train_dsr   _validation_dsr   _test_ds)r1   r7   r4   r4   r5   setupi   s   
zMockDataModule.setupc                 C      t | ds	|   | | jS )z+
        Get the train dataloader.
        r=   )hasattrr@   _create_dataloaderr=   r1   r4   r4   r5   train_dataloaderw      
zMockDataModule.train_dataloaderc                 C   rA   )z0
        Get the validation dataloader.
        r>   )rB   r@   rC   r>   rD   r4   r4   r5   val_dataloader   rF   zMockDataModule.val_dataloaderc                 C   rA   )z*
        Get the test dataloader.
        r?   )rB   r@   rC   r?   rD   r4   r4   r5   test_dataloader   rF   zMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )N)r   r   r   
collate_fn)r	   r   r   r   rI   )r1   datasetkwargsr4   r4   r5   rC      s   z!MockDataModule._create_dataloaderc              	   C   s~   ddl m} || jj| j| j_|| jj| j| j_zddlm} W n t	t
fy3   ddlm} Y nw | j j| 9  _dS )zx
        Reconfigure trainer.limit_train_batches and trainer.limit_val_batches in terms of num of microbatches.
        r   )_reconfigure_limit_batches)get_num_microbatchesN)#nemo.collections.llm.gpt.data.utilsrL   trainerlimit_train_batchesr=   limit_val_batchesr>   )megatron.core.num_microbatches_calculatorrM   ImportErrorModuleNotFoundError(apex.transformer.pipeline_parallel.utilsnum_sanity_val_steps)r1   rL   rM   r4   r4   r5   reconfigure_limit_batches   s   
z(MockDataModule.reconfigure_limit_batches)r   Nr   r   Nr   r   r   r   TFFNN)r6   )__name__
__module____qualname____doc__intr   r   boolstrr,   r@   r   rE   r   rG   rH   r	   rC   rW   __classcell__r4   r4   r2   r5   r   !   sf    
	
3
r   c                       s   e Zd Z		ddddedededed	ed
df fddZd
efddZded
ej	fddZ
d
eeejf fddZdd Zdd Z  ZS )r<   *   Fr   r   namenum_samplesr   seedr    r8   Nc                    s   t    || _|| _|j| _|| _|| _|| _|r2t	tj
| j| jfddd| _| jdk | _tj
| jtjd| _tj| jtjd| _d S )Ncpu)devicer   g      ?)dtype)r+   r,   ra   r   
vocab_sizelengthrc   r    torchtrilones	unsqueezeattention_maskfloat	loss_maskarangeint64position_ids)r1   r   ra   rb   r   rc   r    r2   r4   r5   r,      s   
	$z_MockGPTDataset.__init__c                 C   s   | j S )N)rh   rD   r4   r4   r5   __len__   s   z_MockGPTDataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )Nrc   sizerf   )nprandomdefault_rngrc   integersrg   r   rq   )r1   rt   np_genr4   r4   r5   	_get_text   s   z_MockGPTDataset._get_textc                 C   sl   t jj| j| d}t|j| j| jd gt j	d}|d d |dd  | j
| jd}| jr4| j|d< |S )Nru      rv   )tokenslabelsro   rr   rm   )rx   ry   rz   rc   ri   
from_numpyr{   rg   r   rq   ro   rr   r    rm   )r1   rt   r|   r   batchr4   r4   r5   __getitem__   s   "


z_MockGPTDataset.__getitem__c                 C   s   t j|S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        )r   
dataloaderdefault_collater1   r   r4   r4   r5   _collate_fn   s   z_MockGPTDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   r   r4   r4   r5   rI      s   
z_MockGPTDataset.collate_fn)r`   F)rX   rY   rZ   r^   r\   r]   r,   rs   rx   ndarrayr}   r   ri   Tensorr   r   rI   r_   r4   r4   r2   r5   r<      s.    r<   )typingr   r   r   r   lightning.pytorchpytorchplnumpyrx   ri   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar	   r
   nemo.lightning.pytorch.pluginsr   nemo.utils.import_utilsr   _r-   1nemo.collections.common.tokenizers.tokenizer_specr   LightningDataModuler   r<   r4   r4   r4   r5   <module>   s    