o
    }oi                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ er>d dlmZ G d	d
 d
eeZdS )    N)TYPE_CHECKINGAnyDictListOptional)DatasetDictload_dataset)get_dataset_root)FineTuningDataModule)IOMixin)logging)TokenizerSpecc                        s   e Zd ZdZ												
			d*dededed dededeee  dedededededededee	ee
f  f fddZd+ fddZdd  Zd,d#ed$ed%efd&d'Zd(d) Z  ZS )-SpecterDataModulea  A data module for fine-tuning on the Specter dataset.

    This class inherits from the `FineTuningDataModule` class and is specifically designed for fine-tuning models
    on the SPECTER Datasets. It handles data download, preprocessing, splitting, and preparing the data
    in a format suitable for training, validation, and testing.

    Args:
        force_redownload (bool, optional): Whether to force re-download the dataset even if it exists locally.
                                           Defaults to False.
        delete_raw (bool, optional): Whether to delete the raw downloaded dataset after preprocessing.
                                     Defaults to True.
        See FineTuningDataModule for the other args
    N         FT     r   dataset_root
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeforce_redownload
delete_rawseedmemmap_workersnum_workers
pin_memorypersistent_workersdataset_kwargsc                    sD   || _ || _t j|d u rtdn|||||||	|
||||d d S )Nspecter)r   r   r   r   r   r   r   r   r   r   r    r!   )r   r   super__init__r	   )selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/bert/data/specter.pyr$   ,   s    
zSpecterDataModule.__init__returnc                    s0   | j  r| jr|  }| | t   dS )z Prepare dataset for fine-tuning.N)
train_pathexistsr   _download_data_preprocess_and_split_datar#   prepare_data)r%   dsetr&   r(   r)   r/   O   s   
zSpecterDataModule.prepare_datac                 C   s:   t d| jj d tddt| j| jrddS d dS )NzDownloading z...zsentence-transformers/spectertripletr   )	cache_dirdownload_mode)r   infor'   __name__r   strr   r   r%   r(   r(   r)   r-   W   s   z SpecterDataModule._download_data皙?333333?r0   train_ratio	val_ratioc                 C   sd  t d| jj d d| | }i }|d}|j|| | jd}|d j|||  | jd}|d |d< |d |d< |d |d< | D ]G\}	}| j|	 d	 }
|
j	d
dd"}|D ]}|
t|d |d |d gdd  q\W d   n1 s~w   Y  t |	 d|
  qF| jr| j D ]}| rt| qd	t|jvr|  qdS dS )a  Preprocesses and splits the downloaded dataset into training, validation, and test sets.

        Args:
            dset (DatasetDict): The downloaded dataset object.
            split_val_from_train (bool, optional): Whether to split the validation set from the training set.
                If False, the validation set is split from the test set. Defaults to True.
            val_proportion (float, optional): The proportion of the training or test set to be used
                for the validation split. Defaults to 0.05.
        zPreprocessing z! to jsonl format and splitting...r   train)	test_sizer   testtraining
validationz.jsonlwzutf-8)encodinganchorpositivenegative)querypos_docneg_doc
Nz split saved to )r   r4   r'   r5   gettrain_test_splitr   itemsr   openwritejsondumpsr   iterdiris_dirshutilrmtreer6   nameunlink)r%   r0   r:   r;   
test_ratiosave_splitsdatasetsplit_datasetsplit_dataset2
split_nameoutput_filefopr(   r(   r)   r.   `   s<   

"z,SpecterDataModule._preprocess_and_split_datac                 C   s   dS )z?No need to reconfigure trainer.limit_val_batches for finetuningNr(   r7   r(   r(   r)   reconfigure_limit_batches   s   z+SpecterDataModule.reconfigure_limit_batches)Nr   Nr   r   NFTr   r   r   TFN)r*   N)r8   r9   )r5   
__module____qualname____doc__r6   intr   r   boolr   r   r$   r/   r-   r   floatr.   ra   __classcell__r(   r(   r&   r)   r      sb    
	
#	(r   )rO   rS   typingr   r   r   r   r   datasetsr   r   #nemo.collections.llm.bert.data.corer	   *nemo.collections.llm.bert.data.fine_tuningr
   nemo.lightning.io.mixinr   
nemo.utilsr   "nemo.collections.common.tokenizersr   r   r(   r(   r(   r)   <module>   s   