o
    }oi-                     @   s   d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ erLd d	lmZ d d
lmZ G dd deeZdS )    N)Path)TYPE_CHECKINGAnyDictListOptionalUnion)DatasetDictload_dataset)get_dataset_root)FineTuningDataModule)IOMixin)logging)TokenizerSpec)PackedSequenceSpecsc                "       s   e Zd ZdZ																d*d
eeeef  deded dededee	e  de
de
dededede
de
ded deeeef  f fddZd+ fddZd d! Z	"d,d#ed$e
d%efd&d'Zd(d) Z  ZS )-SquadDataModulea  A data module for fine-tuning on the Squad dataset.

    This class inherits from the `FineTuningDataModule` class and is specifically designed for
    fine-tuning models on the Stanford Question Answering Dataset (SQuAD). It handles data download,
    preprocessing, splitting, and preparing the data in a format suitable for training,
    validation, and testing.

    Args:
        dataset_root (Optional[Union[str, Path]]): The root directory containing the training,
            validation, and test data. Defaults to None, which by default downloads the data.
        force_redownload (bool, optional): Whether to force re-download the dataset even if it
            exists locally. Defaults to False.
        delete_raw (bool, optional): Whether to delete the raw downloaded dataset after preprocessing.
            Defaults to True.

        See FineTuningDataModule for the other args
    N         FT     dataset_root
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeforce_redownload
delete_rawseedmemmap_workersnum_workers
pin_memorypersistent_workerspacked_sequence_specsr   dataset_kwargsc                    sF   || _ || _t j|d ur|ntd||||||	|
|||||d d S )Nsquad)r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   )r   r   super__init__r   )selfr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/squad.pyr(   2   s"   
zSquadDataModule.__init__returnc                    s0   | j  r| jr|  }| | t   d S )N)
train_pathexistsr   _download_data_preprocess_and_split_datar'   prepare_data)r)   dsetr*   r,   r-   r3   W   s   
zSquadDataModule.prepare_datac                 C   s8   t d| jj d tdt| j| jrddS d dS )NzDownloading z...r&   r   )	cache_dirdownload_mode)r   infor+   __name__r
   strr   r   r)   r,   r,   r-   r1   ^   s   zSquadDataModule._download_data皙?r4   split_val_from_trainval_proportionc              	   C   s  t d| jj d i }|d}|d}|r2|j|| jd}|d |d< |d |d< ||d< n|j|| jd}||d< |d |d< |d |d< | D ]d\}}	| j| d }
|
j	d	d
d?}|	D ]4}i }d|d  d |d  d |d< |d d d |d< |dkr|d d |d< |
t|d  qdW d   n1 sw   Y  t | d|
  qN| jr| j D ]}| rt| qdt|jvr|  qdS dS )a  Preprocesses and splits the downloaded dataset into training, validation, and test sets.

        Args:
            dset (DatasetDict): The downloaded dataset object.
            split_val_from_train (bool, optional): Whether to split the validation set from the training set.
                If False, the validation set is split from the test set. Defaults to True.
            val_proportion (float, optional): The proportion of the training or test set to be used
                for the validation split. Defaults to 0.05.
        zPreprocessing z! to jsonl format and splitting...train
validation)	test_sizer   trainingtestz.jsonlwzutf-8)encodingz	Context: contextz Question: questionz Answer:inputanswerstextr   outputoriginal_answers
Nz split saved to )r   r7   r+   r8   gettrain_test_splitr   itemsr   openwritejsondumpsr   iterdiris_dirshutilrmtreer9   nameunlink)r)   r4   r<   r=   save_splits	train_setval_setsplit_dataset
split_namedatasetoutput_filefexample	json_linepr,   r,   r-   r2   f   sH   


z*SquadDataModule._preprocess_and_split_datac                 C   s   dS )zno opNr,   r:   r,   r,   r-   reconfigure_limit_batches   s   z)SquadDataModule.reconfigure_limit_batches)Nr   Nr   r   NFTr   r   r   TFNN)r.   N)Tr;   )r8   
__module____qualname____doc__r   r   r9   r   intr   boolr   r   r(   r3   r1   r	   floatr2   re   __classcell__r,   r,   r*   r-   r      sv    
	
%	
4r   )rR   rV   pathlibr   typingr   r   r   r   r   r   datasetsr	   r
   "nemo.collections.llm.gpt.data.corer   )nemo.collections.llm.gpt.data.fine_tuningr   nemo.lightning.io.mixinr   
nemo.utilsr   "nemo.collections.common.tokenizersr   -nemo.collections.llm.gpt.data.packed_sequencer   r   r,   r,   r,   r-   <module>   s    