o
    }oi%                     @   s   d dl Z d dlZd dlmZmZmZmZmZm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ er@d dlmZ d dlmZ G d	d
 d
eZdS )    N)TYPE_CHECKINGAnyDictListOptionalUnion)Datasetconcatenate_datasets)FineTuningDataModule)get_dataset_root)logging)TokenizerSpec)PackedSequenceSpecsc                2       s   e Zd ZdZ													
				
							d0deeee f dee dee dee dee dede	ded de	de	deee	  de
de
de	de	d e	d!e
d"e
d#ed$ d%ed&ed'ed(eeeef  f. fd)d*Zd1 fd,d-Zd.d/ Z  ZS )2CustomRetrievalDataModulez2Custom Retrieval Data Module loaded with json fileN{Gz?{Gz?custom_retrieval_dataset         FT     questionpos_docneg_doc	data_rootval_root	test_root	val_ratio
test_ratiodataset_identifier
seq_length	tokenizerr   micro_batch_sizeglobal_batch_sizerampup_batch_sizeforce_redownload
delete_rawseedmemmap_workersnum_workers
pin_memorypersistent_workerspacked_sequence_specsr   	query_keypos_doc_keyneg_doc_keydataset_kwargsc                    sz  || _ || _|du sJ dt|ts|g}|D ]}tj|s(J d| dq|dur?tj|s;J d| dd| _n|| _|durXtj|sTJ d| dd| _n|| _d| j | j | _	|| _
|| _|| _|| _|| _|| _d	| d
| j	 d| j d| j d	}| j
dur|d| j
 7 }| jdur|d| j 7 }t| t jt||||	|
|||||||d dS )a$	  Custom DataModule for Finetuning retrieval Dataset for Embedding model.

        Args:
            data_root (Union[str, List[str]]): The JSONL data file(s) used for training/validation/test.
                if val_root/test_root is not present, data_root will be split to training and val/test based on
                val_ratio/test_ratio.
            val_root (Optional[str]): The JSONL data file used for validation. If not provided, validation set
                will be split from data_root.
            test_root (Optional[str]): The JSONL data file used for test. If not provided, test set
                will be split from data_root.
            val_ratio (Optional[float]): The ratio of validation set when splitting from data_root.
            test_ratio (Optional[float]): The ratio of test set when splitting from data_root.
            dataset_identifier (str): Dataset identifier when saving the dataset to NEMO_HOME.
            seq_length (int, optional): The maximum sequence length for the input and output text. Defaults to 2048.
            tokenizer (Optional[TokenizerSpec], optional): The tokenizer to use for preprocessing the text.
                If not provided, a Megatron GPT2 BPE tokenizer will be used.
            micro_batch_size (int, optional): The micro batch size for training. Defaults to 4.
            global_batch_size (int, optional): The global batch size for training. Defaults to 8.
            rampup_batch_size (Optional[List[int]], optional): A list of batch sizes for ramping up during training.
                Defaults to None.
            seed (int, optional): The random seed for data shuffling. Defaults to 1234.
            memmap_workers (int, optional): The number of worker processes for loading data using TextMemMapDataset.
                Defaults to 1.
            num_workers (int, optional): The number of worker processes for data loading. Defaults to 8.
            pin_memory (bool, optional): Whether to pin memory during data loading for faster GPU training.
                Defaults to True.
            persistent_workers (bool, optional): Whether to keep data loading workers persistent across epochs.
                Defaults to False.
            dataset_kwargs (Optional[Dict[str, Any]], optional): Keyword arguments to pass into the GPTSFTDataset class
        Nz6RetrievalDataModule does not support packed sequences.z
Data root z does not exist.zValidation root r   z
Test root r   zdata_root: z will be split to :z used for train/val/testz, separate validation root: z, separate test root: )dataset_rootr!   r"   r#   r$   r%   r(   r)   r*   r+   r,   r1   )r&   r'   
isinstancelistospathexistsr   r   train_ratior   r   r.   r/   r0   unprocessed_rootr   infosuper__init__r   )selfr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   	directorylog_info	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/retrieval.pyr=   !   sb   8




z"CustomRetrievalDataModule.__init__returnc                    s&   | j  r| jr|   t   dS )z"Prepare data if not split already.N)
train_pathr8   r&   _preprocess_and_split_datar<   prepare_data)r>   rA   rC   rD   rH      s   z&CustomRetrievalDataModule.prepare_datac                 C   s  t d| jj d i }g }| jD ]}|tt	t
|d qt|}| j| j dkrB|j| j| j | jd}|d |d< n||d< i }| jdkrd| jdkrd|d j| j| j| j  | jd}n!| jdkru| jdkru|d |d< n| jdkr| jdkr|d |d< | jd urtt	t
| jd|d	< n|d |d	< | jd urtt	t
| jd|d< n|d |d< t d
t|d   t dt|d	   t dt|d   | D ]m\}}| j| d }	|	j
dddG}
|D ]<}t|| j tr	|| j d n|| j }t|| j tr|| j n|| j g}|
t|| j ||dd  qW d    n	1 s?w   Y  t | d|	  qd S )NzPreprocessing z! to jsonl format and splitting...rr   )	test_sizer(   traintrainingtest
validationztraining samples: zvalidation samples: ztest samples: z.jsonlwzutf-8)encoding)queryr   r   
z split saved to )r   r;   rB   __name__r:   appendr   	from_listjsonloadopenr	   r   r   train_test_splitr(   r   r   lenitemsr3   r4   r/   r5   r0   writedumpsr.   )r>   save_splitstrain_datasetsdata_dirtrain_datasetsplit_datasetsplit_dataset2
split_namedatasetoutput_filefor   r   rC   rC   rD   rG      sN   


*($z4CustomRetrievalDataModule._preprocess_and_split_data)NNr   r   r   r   Nr   r   NFTr   r   r   TFNr   r   r   N)rE   N)rS   
__module____qualname____doc__r   strr   r   floatintboolr   r   r=   rH   rG   __classcell__rC   rC   rA   rD   r      s    	

nr   )rV   os.pathr6   typingr   r   r   r   r   r   datasetsr   r	   *nemo.collections.llm.bert.data.fine_tuningr
   "nemo.collections.llm.gpt.data.corer   
nemo.utilsr   "nemo.collections.common.tokenizersr   -nemo.collections.llm.gpt.data.packed_sequencer   r   rC   rC   rC   rD   <module>   s    