o
    wi!                     @   s   d dl mZmZmZmZ d dlmZ d dlZ	d dl
Z
d dlmZmZ d dlmZ d dlmZmZ d dlmZ er>d dlmZ G dd	 d	ejZG d
d deZdS )    )TYPE_CHECKINGDictListOptionalN)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderDataset)MegatronDataSampler)TokenizerSpecc                       s   e Zd ZdZ											d&d	ed
ed dededeee  dedededededef fddZd'de	ddfddZ
defddZdefddZdefd d!Zdefd"d#Zd$d% Z  ZS )(BERTMockDataModuleab  Mock DataModule for BERT model.
    Args:
        seq_length (int): Sequence length.
        tokenizer (Optional["TokenizerSpec"]): An instance of a TokenizerSpec object.
        micro_batch_size (int): Batch size per GPU.
        global_batch_size (int): Global batch size.
        rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of
            [start_global_batch_size, batch_size_increment, ramup_samples].
        num_train_samples (int): Number of training samples.
        num_val_samples (int): Number of validation samples.
        num_test_samples (int): Number of test samples.
        num_workers (int): See ``torch.utils.data.DataLoader`` documentation.
        pin_memory (bool): See ``torch.utils.data.DataLoader`` documentation.
        persistent_workers (bool): See ``torch.utils.data.DataLoader`` documentation.
       N      '  TF
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workersc                    s   t    || _|| _|| _|| _|	| _|
| _|| _|| _	|| _
|d u r1ddlm} |dd| _n|| _t| j| j
| j	|d| _d S )Nr   )get_nmt_tokenizermegatronBertWordPieceLowerCase)seq_lenr   r   r   )super__init__r   r   r   r   r   r   r   r   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   r   r   data_sampler)selfr   r   r   r   r   r   r   r   r   r   r   r   	__class__ `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/bert/data/mock.pyr"   /   s(   
zBERTMockDataModule.__init__ stagereturnc                 C   sF   t | jd| j| j| _t | jd| j| j| _t | jd| j| j| _dS )zSetup train/val/test datasets.trainvalidtestN)	_MockBERTDatasetr   r   r   	_train_dsr   _validation_dsr   _test_ds)r%   r+   r(   r(   r)   setupU   s$   
zBERTMockDataModule.setupc                 C      t | ds	|   | | jS )zCreate train dataloader.r1   )hasattrr4   _create_dataloaderr1   r%   r(   r(   r)   train_dataloaderj      
z#BERTMockDataModule.train_dataloaderc                 C   r5   )zCreate validation dataloader.r2   )r6   r4   r7   r2   r8   r(   r(   r)   val_dataloaderp   r:   z!BERTMockDataModule.val_dataloaderc                 C   r5   )zCreate test dataloader.r3   )r6   r4   r7   r3   r8   r(   r(   r)   test_dataloaderv   r:   z"BERTMockDataModule.test_dataloaderc                 K   s"   t |f| j| j| j|jd|S )N)r   r   r   
collate_fn)r	   r   r   r   r=   )r%   datasetkwargsr(   r(   r)   r7   |   s   z%BERTMockDataModule._create_dataloaderc              	   C   s~   ddl m} || jj| j| j_|| jj| j| j_zddlm} W n t	t
fy3   ddlm} Y nw | j j| 9  _dS )zx
        Reconfigure trainer.limit_train_batches and trainer.limit_val_batches in terms of num of microbatches.
        r   )_reconfigure_limit_batches)get_num_microbatchesN)#nemo.collections.llm.gpt.data.utilsr@   trainerlimit_train_batchesr1   limit_val_batchesr2   )megatron.core.num_microbatches_calculatorrA   ImportErrorModuleNotFoundError(apex.transformer.pipeline_parallel.utilsnum_sanity_val_steps)r%   r@   rA   r(   r(   r)   reconfigure_limit_batches   s   
z,BERTMockDataModule.reconfigure_limit_batches)r   Nr   r   Nr   r   r   r   TF)r*   )__name__
__module____qualname____doc__intr   r   boolr"   strr4   r   r9   r   r;   r<   r	   r7   rK   __classcell__r(   r(   r&   r)   r      sT    
	
&
r   c                       s   e Zd Z	ddddededededd	f fd
dZdefddZdedejfddZ	de
eejf fddZdd Zdd Z  ZS )r0     r   r   namenum_samplesr   seedr,   Nc                    sV   t    || _|| _|j| _|| _|| _tj| jtj	d| _
tj| jtjd| _d S )Ndtype)r!   r"   rU   r   
vocab_sizelengthrW   torchonesfloat	loss_maskarangeint64position_ids)r%   r   rU   rV   r   rW   r&   r(   r)   r"      s   
z_MockBERTDataset.__init__c                 C   s   | j S )N)r[   r8   r(   r(   r)   __len__   s   z_MockBERTDataset.__len__idxc                 C   s,   t jj| j| d}|j| j| jgt jdS )NrW   sizerY   )nprandomdefault_rngrW   integersrZ   r   ra   )r%   rd   np_genr(   r(   r)   	_get_text   s   z_MockBERTDataset._get_textc           
      C   s   t jj| j| d}t|j| j| jgt j	d}t|j| j| jgt j	d}tj
| jtj	d}| dk }tt j| jgt j	d}| dk }|||t||| jt|d}	|	S )Nre   rf   rX   g      ?)texttypeslabels	is_randompadding_maskr_   	truncated)rh   ri   rj   rW   r\   
from_numpyrk   rZ   r   ra   zerosr]   rP   r_   )
r%   rd   rl   tokensrp   assignmentsis_next_random	mask_padsrs   batchr(   r(   r)   __getitem__   s    
z_MockBERTDataset.__getitem__c                 C   s   t j|S )z
        A default implementation of a collation function.
        Users should override this method to define custom data loaders.
        )r   
dataloaderdefault_collater%   rz   r(   r(   r)   _collate_fn   s   z_MockBERTDataset._collate_fnc                 C   s
   |  |S )a  Method that user pass as functor to DataLoader.

        The method optionally performs neural type checking and add types to the outputs.

        Please note, subclasses of Dataset should not implement `input_types`.

        # Usage:
        dataloader = torch.utils.data.DataLoader(
                ....,
                collate_fn=dataset.collate_fn,
                ....
        )

        Returns
        -------
            Collated batch, with or without types.
        )r   r~   r(   r(   r)   r=      s   
z_MockBERTDataset.collate_fn)rT   )rL   rM   rN   rR   rP   r"   rc   rh   ndarrayrm   r   r\   Tensorr{   r   r=   rS   r(   r(   r&   r)   r0      s(    r0   )typingr   r   r   r   lightning.pytorchpytorchplnumpyrh   r\   !lightning.pytorch.utilities.typesr   r   torch.utilsr   torch.utils.datar	   r
   nemo.lightning.pytorch.pluginsr   1nemo.collections.common.tokenizers.tokenizer_specr   LightningDataModuler   r0   r(   r(   r(   r)   <module>   s    