o
    }oi4                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ er<d dlmZ G d	d
 d
ejZdS )    N)	lru_cache)Path)TYPE_CHECKINGListOptionalUnion)
DataLoader)create_sft_dataset)MegatronDataSampler)TokenizerSpecc                       s  e Zd ZdZ												
d.deeef dededed dededee	e  dededede
de
f fddZdefddZdefddZdefd d!Zdefd"d#Zed$d% Zdefd&d'Zedefd(d)Zedefd*d+Zedefd,d-Z  ZS )/FineTuningDataModulea  Base class for fine-tuning an LLM.

    This class provides a foundation for building custom data modules for fine-tuning Nemo NLP models. It inherits from
    `pl.LightningDataModule` from the PyTorch Lightning library and handles data loading, preprocessing, and batch creation
    for training, validation, and testing.

    Args:
        dataset_root (Union[str, Path]): The root directory containing the training, validation, and test data.
        seq_length (int, optional): The maximum sequence length for the input and output text. Defaults to 2048.
        tokenizer (Optional[TokenizerSpec], optional): The tokenizer to use for preprocessing the text. Defaults to None.
            If not provided, a BertWordPieceCase tokenizer will be used.
        micro_batch_size (int, optional): The micro batch size for training. Defaults to 4.
        global_batch_size (int, optional): The global batch size for training. Defaults to 8.
        rampup_batch_size (Optional[List[int]], optional): A list of batch sizes for ramping up during training. Defaults to None.
        seed (int, optional): The random seed for data shuffling. Defaults to 1234.
        memmap_workers (int, optional): The number of worker processes for loading data using TextMemMapDataset. Defaults to 1.
        num_workers (int, optional): The number of worker processes for data loading. Defaults to 8.
        pin_memory (bool, optional): Whether to pin memory during data loading for faster GPU training. Defaults to True.
        persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs. Defaults to False.
        max_train_steps (int, optional): Maximum number of steps to train. Used to calculate samples mapping for the mmap dataset
          N           TFdataset_root
seq_lengthseq_length_dec	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeseedmemmap_workersnum_workers
pin_memorypersistent_workersc                    s   t    || _|| _|| _t|| _|d u r1ddlm} i }dd t	dD |d< |dd|d	}|| _
|	| _|
| _|| _|| _|| _|| _|| _d | _d | _d S )
Nr   )get_nmt_tokenizerc                 S   s   g | ]}d | dqS )z
<extra_id_> ).0ir!   r!   \/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/t5/data/fine_tuning.py
<listcomp>O   s    z1FineTuningDataModule.__init__.<locals>.<listcomp>d   additional_special_tokensmegatronBertWordPieceCase)special_tokens)super__init__r   r   r   r   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   ranger   r   r   r   r   r   r   r   data_samplermax_train_samples)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r*   	__class__r!   r$   r,   5   s0   


zFineTuningDataModule.__init__stagec                 C   s>   t | j| j| j| jdd| _tt| j| j	j
 d | _d S )Nbatch)seq_lenr   r   r   dataloader_typegGz?)r
   r   r   r   r   r/   intmathceiltrainer	max_stepsr0   )r1   r4   r!   r!   r$   setupa   s   "
zFineTuningDataModule.setupreturnc                 C   s   |  | jt| j| jdS )N)max_num_samples)_create_dataloader_create_datasetstr
train_pathr0   r1   r!   r!   r$   train_dataloadern   s   z%FineTuningDataModule.train_dataloaderc                 C   s   |  | jt| jddS )NT)is_test)r@   rA   rB   validation_pathrD   r!   r!   r$   val_dataloaderv   s   z#FineTuningDataModule.val_dataloaderc                 C   s   |  | jt| jdddS )N    T)tokens_to_generaterF   )r@   rA   rB   	test_pathrD   r!   r!   r$   test_dataloader~   s   z$FineTuningDataModule.test_dataloaderc                 K   s&   t |f| j| j| j| j| jd|S )N)r   r   r   r   r   )r	   r   r   r   r   r   )r1   pathkwargsr!   r!   r$   rA      s   z$FineTuningDataModule._create_datasetc                 K   s"   t |f| j| j| j|jd|S )N)r   r   r   
collate_fn)r   r   r   r   rO   )r1   datasetrN   r!   r!   r$   r@      s   z'FineTuningDataModule._create_dataloaderc                 C   
   | j d S )Nztraining.jsonlr   rD   r!   r!   r$   rC         
zFineTuningDataModule.train_pathc                 C   rQ   )Nzvalidation.jsonlrR   rD   r!   r!   r$   rG      rS   z$FineTuningDataModule.validation_pathc                 C   rQ   )Nz
test.jsonlrR   rD   r!   r!   r$   rK      rS   zFineTuningDataModule.test_path)r   r   Nr   r   Nr   r   r   TF)__name__
__module____qualname____doc__r   rB   r   r8   r   r   boolr,   r=   r   rE   rH   rL   r   rA   r@   propertyrC   rG   rK   __classcell__r!   r!   r2   r$   r      sf    

	
,	

r   )r9   	functoolsr   pathlibr   typingr   r   r   r   lightning.pytorchpytorchpltorch.utils.datar   !nemo.collections.llm.t5.data.corer	   nemo.lightning.pytorch.pluginsr
   "nemo.collections.common.tokenizersr   LightningDataModuler   r!   r!   r!   r$   <module>   s   