o
    }oiI                     @   s2  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZ d dlmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ eeZdZ dZ!d	Z"d
d
dZ#e$dZ%dZ&dZ'dd Z(G dd deZ)G dd de)Z*G dd dZ+de(dfde,fddZ-dd Z.dd Z/			dHde,ded e0fd!d"Z1d#d$ Z2d%e3d&ed'e4d(e5d)e3d*e4fd+d,Z6d%e3d-ee3 fd.d/Z7dId%e3d&ed0eee  d-e3fd1d2Z8d3d4 Z9d5d6 Z:d7d8 Z;d9d: Z<d;d< Z=de,fd=d>Z>d?e,de,d-e,fd@dAZ?dBdC Z@dDdE ZAd-ee4eBf fdFdGZCdS )J    N)	lru_cachepartial)AnyCallableListOptionalTypeUnion)TokenizerSpec)Dataset)AppState iSystem )TEXT_TO_VALUEVALUE_TO_TEXTz\{%-?\s+generation\s+-?%\}z0.2idxc                 C   s   t j| t jdd}t ||kd }|j}| }t|dks)|d d t|kr2|t|d g }t|dkrW|d |d  dk rW|d t|dkrW|d |d  dk sBt j||d}|j	
  ~|S )	z
    Build index of delimiter positions between samples in memmap.
    Can be provided externally.

    Returns a 1D array of ints.
    rdtypemoder         r   )npmemmapuint8wherer   tolistlenpopasarray_mmapclose)fnnewline_intmdatamidx
midx_dtype r+   W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/data/utils.pybuild_index_from_memdata6   s     
 
r-   c                       s   e Zd ZdZddddeddfdee dee dee d	ee d
ee	d  dee
eee gef  dee dee f fddZdd Zdd Zdd Zdd Zdd Zddee fddZ  ZS )_TextMemMapDatasetzO
    Allow per-line lazy access to multiple text files using numpy memmap.
    
   r   NTdataset_pathsr'   header_linesworkers	tokenizerr
   build_index_fnsort_dataset_pathsindex_mapping_dirc	                    s  t    g _t|tr|g}t|dk rtd|_ _|_	|_
|_|_|r4tj	_	td tj oBtj }	|	rN|	rXtj dkrXt||j
|d |	rbt sbtj  |	rtt jdkrtt||j
|d |	r~t s~tj  td t }
fddj	D }td	t| d
tjt |
 d  td t fdd|D }|_|_jd _ dS )a  
        Args:
            dataset_paths: list of JSONL file paths.
            newline_int: ASCII code to use to interpret newlines in file.
            header_lines: number of header lines in JSON files.
            workers: number of workers to use for creating index files.
            tokenizer: tokenizer to use to convert text to tokens.
            build_index_fn: a callable build_index_fn(fn, newline_int) -> midx [np.array]
                that returns the index of newlines in a file fn must be pickleable
                (to be used in multiprocessing.Pool.map).
            sort_dataset_paths: whether to sort datasets by paths.
            index_mapping_dir: directory to save the index mapping to.
                If None, will write to the same folder as the dataset.
        r   -files_list must contain at leat one file namezBuilding data filesr   )r2   r4   r6   zLoading data filesc                    s   g | ]} | qS r+   )	load_file).0r&   )r6   selfr+   r,   
<listcomp>   s    z/_TextMemMapDataset.__init__.<locals>.<listcomp>zTime loading  mem-mapped files: secondszComputing global indicesc                    s   g | ]
\}}t |  qS r+   )r!   )r9   _r)   )r1   r+   r,   r;      s    r   N)!super__init__mdata_midx_list
isinstancestrr!   
ValueError_newline_int_header_lines_files_list_workerr3   _sort_dataset_pathssortedloggerinfotorchdistributedis_availableis_initializedget_rankbuild_index_fileslightning_prepare_databarrierr   
local_ranktimedatetime	timedeltar   cumsum	midx_bins_size)r:   r0   r'   r1   r2   r3   r4   r5   r6   is_distributed
start_timerB   r[   	__class__)r1   r6   r:   r,   rA   Y   sb   








z_TextMemMapDataset.__init__c                 C   s(   | j r| j D ]\}}|j  qd S d S N)rB   r$   r%   )r:   r(   r)   r+   r+   r,   __del__   s
   z_TextMemMapDataset.__del__c                 C      | j S ra   )r\   r:   r+   r+   r,   __len__      z_TextMemMapDataset.__len__c                 C   sl  |t | ks
|dk rtd| dt |  dtj|| jdd}|dkr+| j|d  nd}|| | j }| j| \}}|dkrFd}|d }n||d  d }|| }z	| |||}	W n' ty }
 zt	
d|
  t	
d	| d
| d| d|  |
d}
~
ww z| |	}W |S  ty }
 zt	
d|
  t	
d|	 d| d
| d| d| 
 |
d}
~
ww )z4
        Return a string from binary memmap
        r   zIndex z if out of dataset range with z samplesF)rightr   z)Error while fetching sample from memmap: z	file_id: z, file_idx: z, i: z, j: NzQError while building data from text, possible issue with sample expected format: zsample: z, file_id: )r!   
IndexErrorr   digitizer[   rG   rB   _fetch_sample_from_memmap	ExceptionrL   error_build_data_from_text)r:   r   file_idbase_idxfile_idxr(   r)   ijsampleedatar+   r+   r,   __getitem__   s8   
"(z_TextMemMapDataset.__getitem__c                 C   s   |||   d}|S )z
        Fetchs the text sample.
        Can be overriden by child-classes to support loading of partial samples and alternative decode methods.
        zutf-8)tobytesdecode)r:   r(   rq   rr   textr+   r+   r,   rj      s   z,_TextMemMapDataset._fetch_sample_from_memmapc                 C   s"   | j dur| j |}|S |}|S )zMAllows child-classes to modify the parsing of raw text, prior to tokenizationN)r3   text_to_ids)r:   ry   ru   r+   r+   r,   rm      s
   
z(_TextMemMapDataset._build_data_from_textc           
   	   C   s$  t d|  t||}tj|tjdd}t|rtj|d ddd}t|| j	k r4t
d| j	 dt|d	 d
}t|}W d   n1 sKw   Y  d|v ri|d }| j|krit d| j d|  |dd}	t|	krt
dt dt d|	 d| ||fS td| d| d)z
        Loads a text file as np.int8.

        Returns:
            mdata - memorymap of np.int8
            midx - indices pointing to the end-of-line (or end of file) position
            size - number of lines in file
        zLoading r   r   .npyTallow_pickle	mmap_modezMissing header, expected z header lines.inforbNr'   z$Mismatch in newline_int, expected = z but loaded versionz0.0z+Version mismatch: Please delete existing '.z' files. Expected version = z, but file version = z. File path = zMemory Map for z- is not found, missing one or more of files: z.{.npy,.info})rL   rM   	_index_fnr   r   r   _index_file_existsloadr!   rG   RuntimeErroropenpicklerF   warningget__idx_version____idx_suffix__rE   )
r:   r&   r6   idx_fnr(   r)   fpidx_info_dictr'   idx_versionr+   r+   r,   r8     s:   	

	z_TextMemMapDataset.load_filera   )__name__
__module____qualname____doc__r-   r   rD   r   intr   r   boolrA   rb   re   rv   rj   rm   r8   __classcell__r+   r+   r_   r,   r.   T   s@    
	i'
r.   c                       st   e Zd ZdZ						ddee dee dee d	ee d
eed  dee	 dee f fddZ
dd Z  ZS )_JSONLMemMapDatasetz4
    Memory-mapped iteration over a JSONL file.
    r/   r   NTr0   r'   r1   r2   r3   r
   r5   r6   c              	      s   t  j|||||||d dS )a2  
        Args:
            dataset_paths: list of JSONL file paths.
            newline_int: ASCII code to use to interpret newlines in file.
            header_lines: number of header lines in JSON files.
            workers: number of workers to use for creating index files.
            tokenizer: tokenizer to use to convert text to tokens.
            sort_dataset_paths: whether to sort datasets by paths.
            index_mapping_dir: directory to save the index mapping to.
                If None, will write to the same folder as the dataset.
        )r0   r'   r1   r2   r3   r5   r6   N)r@   rA   )r:   r0   r'   r1   r2   r3   r5   r6   r_   r+   r,   rA   9  s   
z_JSONLMemMapDataset.__init__c              
   C   sN   zt |}W |S  ty& } ztd|  td|  |d}~ww )z8Return a dictionary of data based on a single JSON line.zException: zdatapoint: N)jsonloadsrk   rL   rl   )r:   ry   recordrt   r+   r+   r,   rm   X  s   z)_JSONLMemMapDataset._build_data_from_text)r/   r   NNTN)r   r   r   r   r   rD   r   r   r   r   rA   rm   r   r+   r+   r_   r,   r   4  s0    
r   c                   @   s   e Zd ZdZ					d deded	ed
edededefddZdd ZdedefddZdefddZ	dd Z
dd ZdedejfddZdS )!_OnlineSampleMappingax  
    This class replaces NeMo's get_samples_mapping function which pre-computes.
    It is used to create a sample mapping for certain number of samples, including
    pseudo-random shuffling.
    The sampler allows to down, or upsample a given dataset.
    Shuffling leads to pseudo-random shuffling, where blocks are shuffled,
    and each block is internally shuffled.
    @B r   r   TFdataset_sizenum_samples
block_sizecache_maxsizeseedshuffletruncate_to_block_boundaryc                 C   s\  || _ || _|dur|n| j | _|| _|| _|| _|| _t| j| j | _t	| j| j | _| j| j }t
t| j| j | _| jr`|r`| j| jkrP|  j|8  _|  jd8  _|  j|8  _d}t| j}	t| j| j}
|rx||
d< d| _nd| _|rtjj| jd}|t| j}|	| }	|
| }
|	| _|
| _t|
| _t|dd| j| _dS )	a  
        Args:
            dataset_size (int): Size of the dataset.
            num_samples (int): Number of samples the dataset should contain.
            block_size (int): Size of each sample block. This is used to shuffle the samples.
                              None will be replaced with dataset size.
            cache_maxsize (int): Maximum size of the blocks cache for the get_sample_block function.
            seed (int): Seed for the random number generator used for shuffling.
            shuffle (bool): Whether to shuffle the samples.
            truncate_to_block_boundary (bool): Whether to truncate the last block to the block boundary.
        Nr   r   r   TFr   )maxsizetyped)r   r   r   r   r   r   r   maxrequired_samplesminr   r   ceil
num_blocksarangefulluse_digitizerandomRandomStatepermutationblock_idx_listblock_size_listrZ   
block_binsr   get_sample_block)r:   r   r   r   r   r   r   r   last_block_sizer   r   	local_rngr   r+   r+   r,   rA   m  s@   
z_OnlineSampleMapping.__init__c                 C   s>   d| j  d| j d| j d| j d| j d| j d| j dS )	Nz!OnlineSampleMapping(dataset_size=z, num_samples=z, block_size=z, cache_maxsize=z, seed=z
, shuffle=z, truncate_to_block_boundary=))r   r   r   r   r   r   r   rd   r+   r+   r,   __str__  s   z_OnlineSampleMapping.__str__r   returnc                    s  t |trP|}|j|j|j}}}t |d ur|nd}| jkr% j}t |d ur-|n j}| jkr9 j}|d ur?|nd} fddt|||D }|S | jkrYtd|dk rj| j7 }|dk rjtd j	rut
| j}n| j } |}| j|  }	||	 }
|
d d fS )Nr   r   c                    s   g | ]} | qS r+   r+   )r9   r   rd   r+   r,   r;         z4_OnlineSampleMapping.__getitem__.<locals>.<listcomp>zIndex out of range)rC   slicestartstopstephandle_indexr   rangerh   r   r   ri   r   r   r   )r:   r   slcr   r   r   sample_slice	block_idxsample_block	local_idx
sample_idxr+   rd   r,   rv     s2   







z _OnlineSampleMapping.__getitem__c                 C   rc   ra   )r   rd   r+   r+   r,   re     rf   z_OnlineSampleMapping.__len__c                 C   s&   | j | j| j| j| j| j| j| jffS )z<Add support for pickling. Needed due to functools.lru_cache.)r`   r   r   r   r   r   r   r   rd   r+   r+   r,   
__reduce__  s   z_OnlineSampleMapping.__reduce__c                 C   s   |   S ra   )r   )r:   protocolr+   r+   r,   __reduce_ex__  s   z"_OnlineSampleMapping.__reduce_ex__r   c                 C   s   || j krtd| d| j d  | j| | j }|| j|  }t||}| jr9tjj	| j
| d}||}|| j }|S )z
        Returns a block of samples of size self.block_size, shuffled if needed.
        NOTE: This method will be cached using functools.lru_cache for efficiency during construction.
        z
block_idx z' is out of range. Maximum block_idx is r   r   )r   rh   r   r   r   r   r   r   r   r   r   r   r   )r:   r   	start_idxend_idxr   r   r+   r+   r,   r     s   


z%_OnlineSampleMapping.get_sample_blockN)r   r   r   TF)r   r   r   r   r   r   rA   r   rv   re   r   r   r   ndarrayr   r+   r+   r+   r,   r   c  s8    
H*r   r6   c           	   
   C   s   t | dk r
td|du rtdt d }tdt |  d| d t }t	d}|
|}|tt|||d	| }W d   n1 sJw   Y  td
t| dt | dtjt | d  dS )z.Auxiliary method to build multiple index filesr   r7   Nr   zProcessing z data files using z workersfork)r6   zTime building z / r<   r=   )r!   rE   r   os	cpu_countrL   rM   rW   mpget_contextPoolmapr   _build_memmap_index_filessumrX   rY   )	r0   r'   r2   r4   r6   r^   ctxpbuild_statusr+   r+   r,   rS     s.   
rS   c                 C   sD   |dk r|t |  d krt | | }|S |dk r td| |S )a  
    Remaps negative indices and handles numpy int indices.

    Arguments:
        dataset (Dataset): dataset to index into
        idx (int): Index. Can include negative indices.
    Returns:
        int: Remapped and fully qualified index.

    Raises:
        IndexError: If a negative index is out of range.

    Examples:
        >>> import numpy as np
        >>> import torch
        >>> from torch.utils.data import TensorDataset
        >>> from nemo_chem.data.fasta_dataset import handle_index
        >>> dataset = TensorDataset(torch.tensor(-np.arange(5)))
        >>> handle_index(dataset, 1)
        1
        >>> handle_index(dataset, -2)
        3

    r   r   zIndex out of range: )r!   rh   )datasetr   r+   r+   r,   r   8  s   r   c                  C   s   ddl } tdd |  D S )a"  
    This function checks whether it is invoked in lightning's hook "prepare_data", which is run only on rank 0.
    TextMemMapDataset contains a torch.distributed.barrier operation, so when run inside the single-process hook
    prepare_data, the barrier operation would hang forever.
    r   Nc                 S   s$   g | ]}|j d kod|jd v qS )prepare_dataprepare_packed_sequence_datar   )functioncode_context)r9   framer+   r+   r,   r;   b  s    z*lightning_prepare_data.<locals>.<listcomp>)inspectanystack)r   r+   r+   r,   rT   Y  s   rT   Tsamples_mappingsanity_check_dist_workersc                 C   s$  ddl m} |s|stdttjjd }|s"ttjjd }|	dur2tj	
|	tj	|}n|}|d|7 }|ttjjd krM|d|7 }|ttjjd kr_|d|7 }|d	|7 }|d
|7 }|d|7 }|d7 }|
du rtj dkrtj	|st| dddu rt| dddu rt|  td| | jjtjksJ | jjtjksJ tj dk}t }td| zddlm} W n ty   tdw || j| j|||||||rdnd	}
td tj||
dd td| tdt |  |rPtj  tj !dg}tjj"||j#ddd tjj"||$ d |d % tj& tjj&|' d ksPJ |
du rtd| t }tj(|ddd}
tdt |  td|
j)d  t*| drt*| drt+|  |
S ) z`Get a list that maps a sample index to a starting sentence index, end sentence index, and lengthr   )parallel_statez4Need to specify either max_num_samples or num_epochsr   Nz_{}_indexmapz_{}epz_{}mnsz_{}mslz_{:0.2f}sspz_{}sr{   doc_idxsizeszP > WARNING: could not find index map file {}, building the indices on rank 0 ...z, > building samples index mapping for {} ...)helpers_cppzhCould not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.r   z% > done building samples index mapingTr}   z  > saved the index mapping in {}zB > elasped time to build and save samples mapping (seconds): {:4f})with_context_parallel)groupz" > loading indexed mapping from {}r   r|   z*    loaded indexed file in {:3.3f} secondsz    total number of samples: {}),megatron.corer   rE   r   iinfoint32r   int64r   pathjoinbasenameformatrN   rO   rR   isfilegetattr#_make_indexed_dataset_compatibilityprintr   r   r   rW   rL   rM   megatron.core.datasetsr   ImportErrorbuild_mappingsaverU   cuda
LongTensor
all_reduceget_data_parallel_group!get_pipeline_model_parallel_groupitemget_world_sizeget_tensor_model_parallel_groupr   shapehasattr"_deallocate_indexed_dataset_memory)indexed_datasetdata_prefix
num_epochsmax_num_samplesmax_seq_lengthshort_seq_probr   namebinary_headr6   r   r   r   indexmap_filenameverboser^   r   countsr+   r+   r,   _get_samples_mappingi  s   ( 





r  c                 C   s\   t | dddust | dddurtdtjt| d tjd| _tjt| tjd| _	| S )zMMake any dataset compatible with IndexedDataset for Megatron samples mapping.r   Nr   z0Dataset already has doc_idx or sizes attributes.r   r   )
r   AttributeErrorr   r   r!   r   r   onesr   r   )r   r+   r+   r,   r     s
    r   sourcer3   name_end_token_idslabel_start_idsspecial_tokensnum_turn_start_tokensc                 C   s  t | |\}}}}	||}
t|
}||}t|}g }g }tt|d| t|s?t	d|d| d| | d D ]'}|t
|d  }|t
}|t|d }|t| |t| qCdd | d D }|	d}|D ]
}||v sJ d	q{t|}t|d|< t|
}
t|||||||	||||| |tk }|  d
ksJ dt|tkd  d }|
d| }|
|d }t|
|||dS )a&  
    Given a conversation list. This transform:
    1. Add signal '### ' at the beginning each sentence, with end signal '
';
    2. Concatenate conversations together;
    3. Tokenize the concatenated conversation;
    4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
    Nz`First few tokens of the conversation are not the same as the header tokens. target[:header_len]=z
 header_tokens=conversationsvaluec                 S   s   g | ]}|d  qS )fromr+   )r9   sentencer+   r+   r,   r;     r   z_preprocess.<locals>.<listcomp>,z!mask role not in the conversationr   zmask is emptyr   r   	input_ids	loss_maskcontext_ids
answer_ids)'_get_header_conversation_type_mask_rolerz   copydeepcopyr!   rN   equaltensorrL   r   
PREFIX_STRappendsplitr   IGNORE_INDEX_mask_targetsr   r   r  nonzerodict)r  r3   r  r  r  r  headerconversation	data_type	mask_roler   targetheader_tokens
header_lenidstokenized_lenssid1id2tokenized_sentencespeakers
split_maskmasklast_ignore_index_posr"  r#  r+   r+   r,   _preprocess  s`   


 




rA  r   c                 C   sj   t | tr1| dr%dd | d D }| dr#|dd| d d |S | dr/| d}|S | }|S )aL  
    Input
        source - HuggingFace AutoTokenizer messages format
            {"messages": [
                {"role": "system","content":"<text>"},
                {"role": "user","content":"<text>"},
                {"role": "assistant","content":"<text>"}
            ]}
        source - can also be conversation format, these are converted to HF messages format
            Mask and type are ignored. Mask will apply to all non-assistant output tokens.
            {"conversations": [
                {"from": "User","value":"<text>"},
                {"from": "Assistant","value":"<text>", "mask": "User", "system": "<text>", "type": "TEXT_TO_VALUE"}
            ]}

    Output

    [
        {"role": "system","content":"<text>"},
        {"role": "user","content":"<text>"},
        {"role": "assistant","content":"<text>"}
    ]
    r  c                 S   s"   g | ]}|d    |d dqS )r  r  rolecontent)lower)r9   convor+   r+   r,   r;   D  s   " z/_convert_to_openai_messages.<locals>.<listcomp>systemr   rB  messages)rC   r/  r   insert)r  chatr+   r+   r,   _convert_to_openai_messages(  s   




rK  tool_schemasc                 C   s$  t |jds
tdt| }d}t| tr| dp|}n|}t|jj	du}|jj
||dd|d}|d}|r?|d }nd	gt| }|jr[|d
 |jkr[||jg7 }|d	g7 }d|v rnt||ddd
 d }	nt|}	|d|	 }
||	d }tt|t|t|
t|dS )a  
    Preprocess messages to apply chat template and tokenize. Returns a dictionary of tokens
    Input:
        source - HuggingFace AutoTokenizer messages format
            {"messages": [
                {"role": "system","content":"<text>"},
                {"role": "user","content":"<text>"},
                {"role": "assistant","content":"<text>"}
            ]}
        source - can also be conversation format, these are converted to HF messages format
            Mask and type are ignored. Mask will apply to all non-assistant output tokens.
            {"conversations": [
                {"from": "User","value":"<text>"},
                {"from": "Assistant","value":"<text>", "mask": "User", "system": "<text>", "type": "TEXT_TO_VALUE"}
            ]}
        tokenizer - tokenizer to apply chat templates to
        tool_schemas - Optional tool_schemas to supply to apply_chat_template, these will be superseeded
           by tools supplied with the message

    Output
    {
        "input_ids": torch.LongTensor(),
        "mask": torch.BoolTensor(),
        "context_ids": torch.LongTensor(),
        "answer_ids": torch.LongTensor(),
    }

    * input_ids contain tokenized messages with chat template applied
    * mask corresponds to tokens of input_ids where 1 represents output tokens for the role `assistant` in both
    context and answer for multi-turn, and 0 to mask all other tokens, e.g. system, user, and tool calling.
    * context_ids contain tokenized messages with chat template applied for all messages except assistant's last
    * answer_ids contain tokenized messages with chat template applied for only the assistant's last generated
    output
    apply_chat_templatezQCannot apply chat template with tokenizer that is not a HuggingFace AutoTokenizerNtoolsT)rN  tokenizereturn_dictreturn_assistant_tokens_maskr   assistant_masksr   r   r   r  )r  r3   rE   rK  rC   r/  r   GENERATION_REGEXsearchchat_templaterM  r!   eos_idindexrN   r   
BoolTensor)r  r3   rL  rJ  rN  template_has_generation_kwdtokenized_chatr   r?  context_end_idxr"  r#  r+   r+   r,   _chat_preprocessQ  sB   #




r\  c                 C   s  |	d }|	d }t |
}
t |}|}| jd }tt|||D ]\}\}}}|t}|t| | | }t|t| }t|
|}|dkr~|dkrt||ksSJ t|||d }|dk rit	| ||| < q!|t| }||7 }n
|dkr~|t|
 }||kr dS || |k rt 
| |d ||  |dd std |dkr|dks|du rt	| ||| < nC||v r|dkr|dkrt	| || || < n,||v r|dkrt	| || || < n||v r|dkrt	| ||| < nt	| ||| < ||7 }q!dS )	a  This function masks the tokens so the loss is computed only on the non-masked role's responses.
    For 'TEXT_TO_VALUE' type, the loss is computed on the value attributes.

    Args:
        target (Tensor): input ids
        tokenized_lens (List[int]): array of lengths of each turns
        speakers (List[str]): array of speakers of each turns
        header_len (int): the system prompt length
        s_ids (List[Tensor]): array of tokenized ids of each turns
        tokenizer (TokenizerSpec): tokenizer object
        mask_role (str): the speaker id to be masked from loss computation.
        gtype (str): either 'TEXT_TO_VALUE' or 'VALUE_TO_TEXT'
        name_end_token_ids (int): end of name token ids
        special_tokens (dict): special tokens used for the chat prompt.
        label_start_ids (list): list of label start token ids,
        num_turn_start_tokens (int): number of tokens of the turn_start str
    
turn_startend_of_namer   r   Nr   r   zAa sentence mismatches the corresponding piece in the conversation)rN   r(  r  	enumerateziprz   r)  r!   $_identify_start_index_of_subsequencer,  r'  rL   r   )r4  r8  r=  r6  s_idsr3   r3  gtyper  r  r  r  
TURN_TOKENEND_NAME_SIGNALcur_idxtgt_lenrq   tokenized_lenspeakers_idr:  r;  skip_name_lenlocationnewline_locmore_skip_lenr+   r+   r,   r-    sP   





$

r-  c                 C   s   |d }|d }d }d| v r"| d }|d ur"|t v s"J d| d| d }|d ur8t | dkr8|d t |  }| d	d
}|d  t | | | }t|| d |||}||||fS )Nend_of_turnr^  typesource type z not supportedrG  r   
r?  Usersystem_turn_startr  )TYPE_INSTRUCTIONr   SYSTEM_TOKEN_add_speaker_and_signal)r  r  
END_SIGNALre  r2  r1  r3  r0  r+   r+   r,   r$    s   r$  c                 C   s>  |d }|d }|d }|d }	 d}	| }
t |D ]\}}|d }|}|d u r8|	| | | |d  | |d< nN|dkr[|	| | | d	|v rOt|d	 ||nd |d  | |d< n+|d
kr~|	| | | |d  | d	|v rxt|d	 ||nd |d< ntd| d|
|d 7 }
||vr|t|d kr|
|7 }
q|
S )Nr]  ro  label_startr^  r   r  r  r   labelr   rq  zF not supported, only 'VALUE_TO_TEXT' and 'TEXT_TO_VALUE' are supportedr   )r_  _response_value_formaterrE   r!   )r0  r  r3  rc  r  rd  rx  LABEL_STARTre  BEGIN_SIGNALr1  rq   r  sentence_from
role_tokenr+   r+   r,   rw    sn   	

rw  c                 C   s6   t | tr||  | S | d u rdS tdt|  d)Nr   zUnknown label type z, only str type is supported)rC   rD   rE   rp  )rz  ry  
end_signalr+   r+   r,   r{  T  s
   
r{  c              	   C   sL   t |d| d d D ]}t|||| d  | r#|  S qdS )a  find the location of the small tensor in the large tensor.
        e.g.  small = [1,3], large = [2,3,1,3], returns 2
              small = [3,2], large = [2,3,1,3], returns -1
    Args:
        small (tensor): small tensor
        large (tensor): large tensor
    r   r   r   )r   sizerN   r'  )subsequencesequencerq   r+   r+   r,   ra  ]  s
    ra  c                 C   s   t ||}t|rdS td|  ||| }t|}t|jtjs-t	d|j t
| td}td| d tj|d |dd td	| d
 t|t|d
 d dS )z&Helper function to build an index fileFzBuilding indexing for fn = z.midx must be an integer array, but got type = )r'   r   zSaving idx file = r{   Tr   zSaving metadata file = r   wb)r   r   rL   rM   r   r#   
issubdtyper   integer	TypeErrorr/  r   r   r   dumpr   )r'   r4   r&   r6   r   r)   ru   r+   r+   r,   r   k  s   


r   r&   c                 C   s   |r9|  dr |  dr| d} |  dr| d} |  dstj||  dt }tjtj|dd |S |  dt }|S )a  Return base file name of index files.

    This returns the base file name associated with specified index
    files. This base name is the base on top of which suffixes
    like .npy or .info are added.

    The parent directory is created if it does not already exist.

    fn may be specified in multiple ways:
    1. file name: data.jsonl,
    2. relative path to a file: relative/path/to/data.jsonl,
    3. absolute path to a file: /absolute/path/to/data.jsonl.

    This function returns paths in the pattern of:
    1. /path/to/input_mapping_dir/data.jsonl.idx
    2. /path/to/input_mapping_dir/relative/path/to/data.jsonl.idx
    3. /path/to/input_mapping_dir/absolute/path/to/data.jsonl.idx

    Args:
        fn: filename to get base name for.
        index_mapping_dir: directory to save the index mapping to.
                If None, will write to the same folder as the dataset.
    )/..r  r  .T)exist_ok)
startswithlstripr   r   r   r   makedirsdirname)r&   r6   r   r+   r+   r,   r     s   





r   c                 C   s(   t j| d rt j| d rdS dS )z,Helper function to test if index file existsr{   r   TF)r   r   exists)r   r+   r+   r,   r     s    r   c                 C   s   d| _ d| _dS )z'Deallocate memory of an IndexedDataset.N)r   r   )r  r+   r+   r,   r    s   
r  c              
   C   s  zddl m} W n ttfy   td ddlm} Y nw t| tr+| | 9 } | S t| t	s2J | dks:|du r<| S t
|}t
|t	dkr| dkrP|} | S t||  }|dkrw| dkrwdt
| }td|  d	|  d
t
| d| d	|| k r| } | S | | |   } | S )z?
    Reconfigure trainer.limit_val_batches for pretraining
    r   )get_num_microbatcheszCMegatron num_microbatches_calculator not found, using Apex version.g        Ninfg      ?zYou requested to check z of the val_dataloader but z * zX < 1. Please increase the `limit_val_batches` argument. Try at least `limit_val_batches=`))megatron.core.num_microbatches_calculatorr  r   ModuleNotFoundErrorloggingr   (apex.transformer.pipeline_parallel.utilsrC   r   floatr!   rE   )limit_batches
dataloaderr  dl_len_in_micro_batcheslimit_micro_batchesmin_percentager+   r+   r,   _reconfigure_limit_batches  sD   



r  )NNTra   )Dr%  rX   r   r  multiprocessingr   r   r   rerW   	functoolsr   r   typingr   r   r   r   r   r	   numpyr   rN   1nemo.collections.common.tokenizers.tokenizer_specr
   nemo.core.classesr   
nemo.utilsr   	getLoggerr   rL   r)  r,  rv  ru  compilerS  r   r   r-   r.   r   r   rD   rS   r   rT   r   r  r   r/  r   listrA  rK  r\  r-  r$  rw  r{  ra  r   r   r   r  r  r  r+   r+   r+   r,   <module>   s    

 a/ 6
#!

n
F$)U`9	'