o
    wi                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlZ	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ erFd dlmZ d d	lmZ G d
d deeZdS )    N)TYPE_CHECKINGAnyDictListOptional)load_dataset)get_dataset_root)FineTuningDataModule)IOMixin)logging)TokenizerSpec)PackedSequenceSpecsc                        s   e Zd ZdZ															d'd
eded dededeee  dedededededededed deee	e
f  f fddZd( fddZdd  Zd)d#ed$efd%d&Z  ZS )*DollyDataModulea  A data module for fine-tuning on the Dolly dataset.

    This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models on the
    "databricks/databricks-dolly-15k" dataset. It handles data download, preprocessing, splitting, and preparing the data
    in a format suitable for training, validation, and testing.

    Args:
        force_redownload (bool, optional): Whether to force re-download the dataset even if it exists locally. Defaults to False.
        delete_raw (bool, optional): Whether to delete the raw downloaded dataset after preprocessing. Defaults to True.
        See FineTuningDataModule for the other args
       N      FT     
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeforce_redownload
delete_rawseedmemmap_workersnum_workers
pin_memorypersistent_workerspacked_sequence_specsr   dataset_kwargsc                    s:   || _ || _t jtd|||||||	|
||||d d S )Ndolly)dataset_rootr   r   r   r   r   r   r   r   r   r   r    r!   )r   r   super__init__r   )selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__ `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/dolly.pyr%   -   s"   
zDollyDataModule.__init__returnc                    s0   | j  r| jr|  }| | t   d S )N)
train_pathexistsr   _download_data_preprocess_and_split_datar$   prepare_data)r&   dsetr'   r)   r*   r0   Q   s   
zDollyDataModule.prepare_datac                 C   s8   t d| jj d tdt| j| jrddS d dS )NzDownloading z...zdatabricks/databricks-dolly-15kr   )	cache_dirdownload_mode)r   infor(   __name__r   strr#   r   )r&   r)   r)   r*   r.   X   s   zDollyDataModule._download_data皙?333333?train_ratio	val_ratioc                 C   s  t d| jj d d| | }i }|d}|j|| | jd}|d j|||  | jd}|d |d< |d |d< |d |d< | D ]\}	}| j|	 d	 }
|
j	d
ddj}|D ]_}|d 
 }|dkrtjdddk}|r|d 
 }|dksJ | d| }|d }n |d 
 }|dksJ | d| }|d }n|d }|d }|t|||d dd  q\W d    n1 sw   Y  t |	 d|
  qF| jr| j D ]}| rt| qd	t|jvr|  qd S d S )NzPreprocessing z! to jsonl format and splitting...r   train)	test_sizer   testtraining
validationz.jsonlwzutf-8)encodingcontext r      instructionz

responsecategory)inputoutputrG   
z split saved to )r   r4   r(   r5   gettrain_test_splitr   itemsr#   openstripnprandomrandintwritejsondumpsr   iterdiris_dirshutilrmtreer6   nameunlink)r&   r1   r9   r:   
test_ratiosave_splitsdatasetsplit_datasetsplit_dataset2
split_nameoutput_filefexamplerB   context_firstrE   _input_outputpr)   r)   r*   r/   `   sT   


"z*DollyDataModule._preprocess_and_split_data)r   Nr   r   NFTr   r   r   TFNN)r+   N)r7   r8   )r5   
__module____qualname____doc__intr   r   boolr   r6   r   r%   r0   r.   floatr/   __classcell__r)   r)   r'   r*   r       s`    
	
$r   )rT   rX   typingr   r   r   r   r   numpyrP   datasetsr   "nemo.collections.llm.gpt.data.corer   )nemo.collections.llm.gpt.data.fine_tuningr	   nemo.lightning.io.mixinr
   
nemo.utilsr   "nemo.collections.common.tokenizersr   -nemo.collections.llm.gpt.data.packed_sequencer   r   r)   r)   r)   r*   <module>   s   