o
    }oi!                     @   s   d dl Z d dlZd dlmZmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ er:d dlmZ G d	d
 d
eeZdS )    N)TYPE_CHECKINGListOptional)DatasetDictload_dataset)get_dataset_root)FineTuningDataModule)IOMixin)logging)TokenizerSpecc                       s   e Zd ZdZ												
			d)dedededed dededeee  dededededededef fddZ	d*ddZ
dd  Z	!d+d"ed#ed$efd%d&Zd'd( Z  ZS ),SquadDataModulea  A data module for fine-tuning on the Squad dataset.

    This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models on the
    Stanford Question Answering Dataset (SQuAD). It handles data download, preprocessing, splitting, and preparing the data
    in a format suitable for training, validation, and testing.

    Args:
        force_redownload (bool, optional): Whether to force re-download the dataset even if it exists locally. Defaults to False.
        delete_raw (bool, optional): Whether to delete the raw downloaded dataset after preprocessing. Defaults to True.
        See FineTuningDataModule for the other args
    N            FT     dataset_root
seq_lengthseq_length_dec	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeforce_redownload
delete_rawseedmemmap_workersnum_workers
pin_memorypersistent_workersc                    sD   || _ |	| _t j|d u rtdn||||||||
||||d d S )Nsquad)r   r   r   r   r   r   r   r   r   r   r   r    )r   r   super__init__r   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r    	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/t5/data/squad.pyr#   +   s    
zSquadDataModule.__init__returnc                 C   s.   | j  r
| js
d S |  }| j|dd d S )NF)split_val_from_train)
train_pathexistsr   _download_data_preprocess_and_split_data)r$   dsetr'   r'   r(   prepare_dataN   s   zSquadDataModule.prepare_datac                 C   s8   t d| jj d tdt| j| jrddS d dS )NzDownloading z...r!   r   )	cache_dirdownload_mode)r
   infor&   __name__r   strr   r   r$   r'   r'   r(   r-   V   s   zSquadDataModule._download_data皙?r/   r*   val_proportionc              	   C   s  t d| jj d i }|d}|d}|r2|j|| jd}|d |d< |d |d< ||d< n|j|| jd}||d< |d |d< |d |d< | D ]l\}}	| j| d }
|
j	d	d
dG}|	D ]<}i }d|d  d d |d  d d |d  |d< |d d d |d< |dkr|d d |d< |
t|d  qdW d   n1 sw   Y  t | d|
  qN| jr| j D ]}| rt| qdt|jvr|  qdS dS )a  Preprocesses and splits the downloaded dataset into training, validation, and test sets.

        Args:
            dset (DatasetDict): The downloaded dataset object.
            split_val_from_train (bool, optional): Whether to split the validation set from the training set.
                If False, the validation set is split from the test set. Defaults to True.
            val_proportion (float, optional): The proportion of the training or test set to be used for the validation split.
                Defaults to 0.05.
        zPreprocessing z! to jsonl format and splitting...train
validation)	test_sizer   trainingtestz.jsonlwzutf-8)encodingzTitle: title zParagraph: contextz Question: questioninputanswerstextr   outputoriginal_answers
Nz split saved to )r
   r3   r&   r4   gettrain_test_splitr   itemsr   openwritejsondumpsr   iterdiris_dirshutilrmtreer5   nameunlink)r$   r/   r*   r8   save_splits	train_setval_setsplit_dataset
split_namedatasetoutput_filefexample	json_linepr'   r'   r(   r.   ^   sd   



z*SquadDataModule._preprocess_and_split_datac                 C   s   d S )Nr'   r6   r'   r'   r(   reconfigure_limit_batches   s   z)SquadDataModule.reconfigure_limit_batches)Nr   r   Nr   r   NFTr   r   r   TF)r)   N)Tr7   )r4   
__module____qualname____doc__r5   intr   r   boolr#   r0   r-   r   floatr.   rb   __classcell__r'   r'   r%   r(   r      sp    
	

#	
<r   )rO   rS   typingr   r   r   datasetsr   r   !nemo.collections.llm.t5.data.corer   (nemo.collections.llm.t5.data.fine_tuningr   nemo.lightning.io.mixinr	   
nemo.utilsr
   "nemo.collections.common.tokenizersr   r   r'   r'   r'   r(   <module>   s   