o
    }oi                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ erBd dlmZ d d	lmZ G d
d deeZdS )    N)TYPE_CHECKINGAnyDictListOptional)load_dataset)get_dataset_root)FineTuningDataModule)IOMixin)logging)TokenizerSpec)PackedSequenceSpecsc                        s   e Zd ZdZ															d'd
eded dededeee  dedededededededed deee	e
f  f fddZd( fddZdd  Zd)d#ed$efd%d&Z  ZS )*AlpacaDataModulea  A data module for fine-tuning on the Alpaca Python dataset.

    This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models
    on the "iamtarun/python_code_instructions_18k_alpaca" dataset. It handles data download, preprocessing, splitting,
    and preparing the data in a format suitable for training, validation, and testing.

    Args:
        force_redownload (bool, optional): Whether to force re-download the dataset even if it exists locally.
                                           Defaults to False.
        delete_raw (bool, optional): Whether to delete the raw downloaded dataset after preprocessing.
                                     Defaults to True.
        See FineTuningDataModule for the other args
       N      FT     
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeforce_redownload
delete_rawseedmemmap_workersnum_workers
pin_memorypersistent_workerspacked_sequence_specsr   dataset_kwargsc                    s:   || _ || _t jtd|||||||	|
||||d d S )Nalpaca)dataset_rootr   r   r   r   r   r   r   r   r   r   r    r!   )r   r   super__init__r   )selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/alpaca.pyr%   .   s"   
zAlpacaDataModule.__init__returnc                    s0   | j  r| jr|  }| | t   d S )N)
train_pathexistsr   _download_data_preprocess_and_split_datar$   prepare_data)r&   dsetr'   r)   r*   r0   R   s   
zAlpacaDataModule.prepare_datac                 C   s8   t d| jj d tdt| j| jrddS d dS )NzDownloading z...z,iamtarun/python_code_instructions_18k_alpacar   )	cache_dirdownload_mode)r   infor(   __name__r   strr#   r   )r&   r)   r)   r*   r.   Y   s   zAlpacaDataModule._download_data皙?333333?train_ratio	val_ratioc              
   C   sv  t d| jj d d| | }i }|d}|j|| | jd}|d j|||  | jd}|d |d< |d |d< |d |d< | D ]P\}	}| j|	 d	 }
|
j	d
dd+}|D ] }|d d |d 
d }|d }|t||dd  q\W d    n1 sw   Y  t |	 d|
  qF| jr| j D ]}| rt| qd	t|jvr|  qd S d S )NzPreprocessing z! to jsonl format and splitting...r   train)	test_sizer   testtraining
validationz.jsonlwzutf-8)encodingpromptz
### Outputoutput)inputrC   
z split saved to )r   r4   r(   r5   gettrain_test_splitr   itemsr#   openfindwritejsondumpsr   iterdiris_dirshutilrmtreer6   nameunlink)r&   r1   r9   r:   
test_ratiosave_splitsdatasetsplit_datasetsplit_dataset2
split_nameoutput_fileforB   
completionpr)   r)   r*   r/   a   s<   
z+AlpacaDataModule._preprocess_and_split_data)r   Nr   r   NFTr   r   r   TFNN)r+   N)r7   r8   )r5   
__module____qualname____doc__intr   r   boolr   r6   r   r%   r0   r.   floatr/   __classcell__r)   r)   r'   r*   r      s`    
	
$r   )rL   rP   typingr   r   r   r   r   datasetsr   "nemo.collections.llm.gpt.data.corer   )nemo.collections.llm.gpt.data.fine_tuningr	   nemo.lightning.io.mixinr
   
nemo.utilsr   "nemo.collections.common.tokenizersr   -nemo.collections.llm.gpt.data.packed_sequencer   r   r)   r)   r)   r*   <module>   s   