o
    }o™iË  ã                   @   s€   d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlmZ d dlmZ er6d dlmZ d dlmZ G dd	„ d	eƒZd
S )é    )Ú	lru_cache)ÚPath)ÚTYPE_CHECKINGÚAnyÚDictÚListÚOptionalÚUnion)Úcreate_sft_dataset)ÚFineTuningDataModule)ÚTokenizerSpec)ÚPackedSequenceSpecsc                       s¨   e Zd ZdZ															dd
eeef deded dededee	e  dededede
de
ded deeeef  de
f‡ fdd„Zeddd„ƒZ‡  ZS ) ÚChatDataModulezÀ
    Base class for fine-tuning an LLM on chat datasets.
    This class calls `GPTSFTChatDataset` for chat template processing

    See base class `FineTuningDataModule` for more details.
    é   Né   é   éÒ  é   TFÚdataset_rootÚ
seq_lengthÚ	tokenizerr   Úmicro_batch_sizeÚglobal_batch_sizeÚrampup_batch_sizeÚseedÚmemmap_workersÚnum_workersÚ
pin_memoryÚpersistent_workersÚpacked_sequence_specsr   Údataset_kwargsÚuse_hf_tokenizer_chat_templatec                    s.   t ƒ  |||||||||	|
|||¡ || _dS )a©  Data module for finetuning on chat datasets.
        See base class `FineTuningDataModule` for more details of the arguments.

        Args:
            use_hf_tokenizer_chat_template: Whether to use the chat template from the HuggingFace tokenizer. If True,
                uses the tokenizer's built-in chat template. If False, uses default chat template from
                GPTSFTChatDataset.  Defaults to False.
        N)ÚsuperÚ__init__r!   )Úselfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   ©Ú	__class__© úV/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/chat.pyr#   #   s    ó
zChatDataModule.__init__c                 K   sB   t |f| j|s| jdkr| jn| j| j| jd|d d| jdœ	|¤ŽS )Nr   TF)	r   r   r   r   ÚchatÚis_testÚpack_metadata_file_pathÚpad_cu_seqlensr!   )r
   r   Úpacked_sequence_sizer   r   r   r!   )r$   ÚpathÚpack_metadata_pathr*   Úkwargsr'   r'   r(   Ú_create_datasetM   s   ÿöõzChatDataModule._create_dataset)r   Nr   r   Nr   r   r   TFNNF)NF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r	   Ústrr   Úintr   r   Úboolr   r   r#   r   r1   Ú__classcell__r'   r'   r%   r(   r      s\    
ñ
þýüûú
ùø	÷
öõôóòñ*r   N)Ú	functoolsr   Úpathlibr   Útypingr   r   r   r   r   r	   Ú"nemo.collections.llm.gpt.data.corer
   Ú)nemo.collections.llm.gpt.data.fine_tuningr   Ú"nemo.collections.common.tokenizersr   Ú-nemo.collections.llm.gpt.data.packed_sequencer   r   r'   r'   r'   r(   Ú<module>   s    