o
    }oi;                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d	ed
e
dededee f
ddZ	 		ddedededed
e
dedee dedefddZeG dd dZdS )    N)	dataclass)Path)Optional)TokenizerSpec)create_sft_dataset)logging)create_histcreate_packing_strategyfill_packing_strategypath	tokenizermax_seq_lengthseeddataset_kwargsc                    s   |si }| d}|rt|tst||d< |dd}|r#||j_td| |||dd| t	
 fddtt D S )	a  
    Tokenizes a dataset from the provided path using the specified tokenizer
    and prepares it for further processing.

    Args:
        path (Path): Path to the dataset file.
        tokenizer (TokenizerSpec): The tokenizer to use for tokenization.
        max_seq_length (int): Maximum sequence length for the tokens.
        seed (int): Random seed for shuffling the dataset (optional).

    Returns:
        np.ndarray: A NumPy array containing the tokenized data.
    tool_schemaschat_templateNT)r   r   
seq_lengthr   is_testc                    s   g | ]} | qS  r   ).0idatasetr   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/packed_sequence.py
<listcomp>E   s    z$tokenize_dataset.<locals>.<listcomp>r   )get
isinstancestrjsondumpspopr   r   r   nparrayrangelen)r   r   r   r   r   tsr   r   r   r   tokenize_dataset   s$   
 r&   first_fit_shuffle
input_pathoutput_pathoutput_metadata_pathpacked_sequence_sizepacking_algorithmc	                 C   s  t d|   t| ||||}	t|	|\}
}t|||\}}t||
||j}t|| |durz%t	|d}t
|}t|tsFJ dW d   n1 sPw   Y  W n tya   g }Y nw || t	|d}t
|| W d   n1 s}w   Y  t d|  dS )a  
    Prepares a packed sequence dataset from a given input file and saves it to an output file.

    Args:
        input_path (Path): Path to the input dataset file.
        output_path (Path): Path to save the packed sequence data.
        packed_sequence_size (int): The maximum size for each packed sequence.
        tokenizer (TokenizerSpec): The tokenizer to use for tokenization.
        max_seq_length (int): Maximum sequence length for the tokens.
        seed (Optional[int]): Random seed for shuffling (optional).
        packing_algorithm (str): The algorithm used for packing sequences
                currently supports "first_fit_shuffle" and "first_fit_decreasing".

    Returns:
        None: Saves the packed sequence data to the specified output path.
    zPreparing packed sequence from Nrzinvalid packing_metadata_file!wz)Packed sequence is prepared and saved to )r   infor&   r   r	   r
   eos_idr!   saveopenr   loadr   listFileNotFoundErrorappenddump)r(   r)   r*   r+   r   r   r   r,   r   r   	sequences	histogramassignmentspacking_metadataoutput_datafpacking_metadata_filer   r   r   prepare_packed_sequence_dataH   s*   
	
r?   c                   @   sn   e Zd ZU dZdZeed< 	 dZeed< 	 dZ	eed< 	 dZ
eed< 	 dZeed< 	 d	Zeed
< 	 dd ZdS )PackedSequenceSpecszPDefines the packed sequence specifications used for generating a packed dataset.r+   Ntokenizer_model_namepacked_train_data_pathpacked_val_data_pathpacked_metadata_pathFpad_cu_seqlensc                 C   s   | j d ur&t| j | _ | j jdksJ d| j  | j  s&J d| j  | jd urLt| j| _| jjdks?J d| j | j sNJ d| j d S d S )Nz.npyz/packed training data file must be a .npy file: z*packed training data file does not exist: z1packed validation data file must be a .npy file: z,packed validation data file does not exist: )rC   r   suffixexistsrD   )selfr   r   r   __post_init__   s(   





z!PackedSequenceSpecs.__post_init__)__name__
__module____qualname____doc__r+   int__annotations__rB   r   rC   rD   rE   rF   boolrJ   r   r   r   r   r@      s   
 r@   )r   r'   N)r   dataclassesr   pathlibr   typingr   numpyr!   "nemo.collections.common.tokenizersr   "nemo.collections.llm.gpt.data.corer   
nemo.utilsr   !nemo.utils.sequence_packing_utilsr   r	   r
   rO   dictr&   r   r?   r@   r   r   r   r   <module>   sX   
4	
<