o
    wi1k                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	m
Z
 d dlmZ d dlZd dlZd dlm  mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z!m"Z" d dl#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z* d dl+m,Z,m-Z- d dl.m/Z/ zd dl0Z0W n e1y   e2d Y nw G dd dZ3G dd dZ4d ddZ5d!ddZ6d"ddZ7G dd deZ8G dd de8Z9G dd dej:Z;dS )#    N)AnyDictListOptionalSequence)EVAL_DATALOADERSTRAIN_DATALOADERS)Image)data)
DataLoaderDatasetdefault_collate)CLIPImageProcessorSiglipImageProcessor)get_ltor_masks_and_position_ids)
DataConfigImageDataConfig)conv_templates)IGNORE_INDEXSPECIAL_TOKEN_MAP)MegatronDataSamplerz;The package `decord` was not installed in this environment.c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	TarOrFolderImageLoadera  
    A class for loading images from a tar archive or a regular folder.

    This class provides functionality to open and read images from either a tar archive
    (.tar file) or a standard directory with image files. It builds an index of images
    if the source is a tar archive for efficient access.

    Attributes:
        image_folder (str): The path to the tar archive or image folder.
        tar_index (dict): A dictionary that maps file names to their tarfile member
                          objects if the image source is a tar archive.

    Methods:
        __init__(self, image_folder): Initializes the loader with the specified image folder.
        build_index(self): Builds an index of image file names and their corresponding
                           tarfile member objects for a tar archive.
        open_image(self, file_name): Opens and returns an image by its file name. The image
                                     is returned as an RGB PIL Image object.
    c                 C   s(   || _ i | _| j dr|   d S d S N.tar)image_folder	tar_indexendswithbuild_index)selfr    r   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/neva/data/preloaded.py__init__C   s
   zTarOrFolderImageLoader.__init__c                 C   N   t | jd}| D ]}|| j|j< qW d    d S 1 s w   Y  d S Nr)tarfileopenr   
getmembersr   namer   tarmemberr   r   r    r   I   
   "z"TarOrFolderImageLoader.build_indexc                 C   s   | j dr<t| j d%}| j|}|r*||}t|dW  d    S W d    d S 1 s5w   Y  d S tt	j
| j |dS )Nr   r$   RGB)r   r   r%   r&   r   getextractfiler	   convertospathjoin)r   	file_namer*   r+   fr   r   r    
open_imageN   s   

z!TarOrFolderImageLoader.open_imageN)__name__
__module____qualname____doc__r!   r   r6   r   r   r   r    r   .   s
    r   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )TarOrFolderVideoLoadera  
    A class for loading videos from a tar archive or a regular folder.

    This class provides functionality to open and read videos from either a tar archive
    (.tar file) or a standard directory with video files. It builds an index of videos
    if the source is a tar archive for efficient access.

    Attributes:
        video_folder (str): The path to the tar archive or video folder.
        data_config (dict): A dictionary of configuration options for video decoding to frames
        tar_index (dict): A dictionary that maps file names to their tarfile member
                          objects if the video source is a tar archive.

    Methods:
        __init__(self, video_folder): Initializes the loader with the specified video folder.
        build_index(self): Builds an index of image file names and their corresponding
                           tarfile member objects for a tar archive.
        open_video(self, file_name): Opens and returns an video by its file name. The video
                                     is returned as a list of RGB PIL Image objects.
        flatten_frames(self, cap): Converts decord VideoReader video object to list of frame
                                   images based on data config information.
    c                 C   s.   || _ || _i | _| j dr|   d S d S r   )video_folderdata_configr   r   r   )r   r<   r=   r   r   r    r!   r   s   zTarOrFolderVideoLoader.__init__c                 C   r"   r#   )r%   r&   r<   r'   r   r(   r)   r   r   r    r   y   r,   z"TarOrFolderVideoLoader.build_indexc                 C   s   | j dr>t| j d'}| j|}|r,||}t|}| 	|W  d    S W d    d S 1 s7w   Y  d S tt
j| j |}| 	|S )Nr   r$   )r<   r   r%   r&   r   r.   r/   decordVideoReaderflatten_framesr1   r2   r3   )r   r4   r*   r+   r5   capr   r   r    
open_video~   s   





z!TarOrFolderVideoLoader.open_videoc                    s0  | j jdkr d  }t|dS | j jdkr, t d   }t|dS | j jdkr@ d  }t|dS | j jdkr`g } D ]}| }t|d}|| qJ|S t	t | j j}t
jdt d |td	} fd
d|D }t|| j jk r||d  t|| j jk s|S )Nfirstr   r-   middle   last   dtypec                    s$   g | ]}t  |  d qS )r-   )r	   	fromarrayasnumpyr0   ).0irA   r   r    
<listcomp>   s   $ z9TarOrFolderVideoLoader.flatten_frames.<locals>.<listcomp>)r=   splice_single_framerL   r	   rK   r0   len
num_framesappendminnplinspaceint)r   rA   frameframes	rgb_frameimgrS   indicesr   rO   r    r@      s.   z%TarOrFolderVideoLoader.flatten_framesN)r7   r8   r9   r:   r!   r   rB   r@   r   r   r   r    r;   Z   s    r;   squarec           
      C   s   t | ts
t | trj|dkr;t|jt|j}}|| }d\}}tt|| |}| j|ddd|idd d }|S |d	kr]d
d }	|	|tdd | j	D }| j|ddd d }|S | j|ddd d }|S |dksrJ d| |}|S )Nkeep)i     ptFshortest_edge)return_tensorsdo_center_cropsizepixel_valuesr   padc                 S   s~   | j \}}||kr| S ||kr't| j||f|}|| d|| d f |S t| j||f|}|| || d df |S )Nr   rE   )re   r	   newmodepaste)pil_imgbackground_colorwidthheightresultr   r   r    expand2square   s   
z$process_image.<locals>.expand2squarec                 s   s    | ]	}t |d  V  qdS )   N)rX   )rM   xr   r   r    	<genexpr>   s    z process_image.<locals>.<genexpr>)rc   r^   zCNeMo image transform with setting `image_process_mode` to `square`.)

isinstancer   r   maxre   rU   rX   
preprocesstuple
image_mean)
	processorimageimage_process_modemax_hwmin_hwaspect_ratiomax_lenmin_lenrb   rp   r   r   r    process_image   s0   r   c                 C   s   |du rt }dd |D }dddd | D  d }t|| }g }|D ]}||v r5|||  q't|d	krE|||d
dj q't	j
|t	jdS )a  
    Tokenizes a given prompt with special handling for multiple special tokens.

    This function splits the prompt at special tokens, tokenizes each chunk separately,
    and then reassembles the chunks with the corresponding special token inserted in place of the placeholders.

    Parameters:
    prompt (str): The input prompt containing text and special token placeholders.
    tokenizer: The tokenizer object used to tokenize the prompt chunks.
    special_token_map (list, optional): A list containing tuples of special token strings
                                        and their corresponding token indices. Defaults to SPECIAL_TOKEN_MAP.

    Returns:
    torch.Tensor: A tensor of token IDs representing the tokenized prompt with special tokens.
    Nc                 S   s   i | ]\}}||qS r   r   )rM   tokenindexr   r   r    
<dictcomp>       z*tokenize_special_token.<locals>.<dictcomp>(|c                 s   s    | ]}t |V  qd S N)reescape)rM   r   r   r   r    rs      s    z)tokenize_special_token.<locals>.<genexpr>)r   F)add_special_tokensrI   )r   r3   keysr   splitrT   rR   extend	input_idstorchtensorlong)prompt	tokenizerspecial_token_mapspecial_token_dictregex_patternchunkstokenized_chunkschunkr   r   r    tokenize_special_token   s    r   Fc                 C   sp   t | }t |}t||| d D ]$}| |||  |k}t|s-|r5t|dd  r5||| f  S qdS )NrH   )rG   rG   )rR   ranger   all)templatepatternsearch_start_indexallow_first_token_mismatchtemplate_lenpattern_lenrN   matchr   r   r    find_pattern_indices   s    r   c                       s^   e Zd Z fddZdd Zdeeejf fddZ	dd	 Z
dddZdd Zdd Z  ZS )LazySupervisedDatasetc           
         s   t    |d ur$t|d}t|}W d    n1 sw   Y  ng }td || _|| _ddl	m
} t| j|rB| jj| _|| _|j| _t| j | _|j| _|| _t|dd }t|dd }	|rht|nd | _|	rut|	|| _d S d | _d S )Nr$   z%Formatting inputs...Skip in lazy moder   AutoTokenizerr   r<   )superr!   r&   jsonloadloggingwarningr=   r   =nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   rt   image_processorconv_templatesupported_conv_templatesconvr{   list_data_dictgetattrr   image_loaderr;   video_loader)
r   	data_pathr=   r   r   filer   r   r   r<   	__class__r   r    r!      s,   


zLazySupervisedDataset.__init__c                 C   s
   t | jS r   )rR   r   r   r   r   r    __len__   s   
zLazySupervisedDataset.__len__returnc                 C   sH   | j | }| j|| jdkd}| |\}}| |}t|||d}|S )Nplain)	use_plain)rz   tokenslabels)r   _apply_prompt_templatesr   _tokenize_and_label_process_imagesdict)r   rN   sourceconversationsr   r   media_tensors	data_dictr   r   r    __getitem__#  s   

z!LazySupervisedDataset.__getitem__c                 C   s   t g }d|v rGt|d ts|d g|d< g }|d D ]"}| j|}|d u r2td| d t| j	|| j
}|| q|rGt |}|S )Nrz   zImage z could not be found!)r   r   rt   listr   r6   r   r   r   r   r{   rT   stack)r   r   r   images
image_filerz   r   r   r    r   0  s   

z%LazySupervisedDataset._process_imagesFc                    s   | j   jd  jd d}d  fdd}||}g  _tD ] \}}||d  }| j|d  ks<J |  ||d	  q$|rdt jdksRJ d
d jd d v s]J d jd d<   S )Nr   rH   )humangptr   c                    s8   t dk r| S d d  jd d d  jd iS )NrE   r   fromrH   )rR   roles)r   r   r   r   r    
_fix_rolesI  s   (zALazySupervisedDataset._apply_prompt_templates.<locals>._fix_rolesr   rE   valuez,Plain template requires image-caption pairs.<image>)r   r   messages	enumerateappend_messagerR   
get_prompt)r   r   r   r   r   jsentenceroler   r   r    r   B  s   z-LazySupervisedDataset._apply_prompt_templatesc           
      C   s   t || j}t|t }d}tdt| jjdD ]O}t	| jdd }|d us*J d| jj
| jj| d |d u r:dn| ddd	d }t|||\}}	|dk r\td
| jj|||  n|||	 |||	< |	}q|d d }|dd  }||fS )Nr   rH   rE   stop_strzPIf `stop_str` is not provided, issues might occur in labeling the answer tokens. Fra   )r   rc   zUnable to find a valid answer in the conversation. Details: 
- Messages: %s
- Tokens: %s
- Answer Tokens: %s
- Search Start Index: %drG   )r   r   r   	ones_liker   r   rR   r   r   r   encoder   r   r   )
r   r   r   r   r   rN   r   answer_tokensanswer_start
answer_endr   r   r    r   ]  s<   
z)LazySupervisedDataset._tokenize_and_labelc                 C   s(   t | jtr| jjd | jjd gS t)Nrn   rm   )rt   r   r   	crop_sizeNotImplementedErrorr   r   r   r    _get_crop_size  s   z$LazySupervisedDataset._get_crop_size)F)r7   r8   r9   r!   r   r   strr   Tensorr   r   r   r   r   __classcell__r   r   r   r    r      s    "
#r   c                       sF   e Zd ZdZ		d
 fdd	Zdee deeej	f fdd	Z
  ZS )NevaDatasetz#Dataset for supervised fine-tuning.F@  c                    s0  | drt |||| n| drt d ||| td |jdkr|j}t|dD ]W}t	|}	g |	d< |	d D ]A}
t
d|
d }|D ])}|d	d
d }tj||}tj|sltd|  qJ|	d | qJt
dd|
d |
d< q>| j|	 q/ntd| d|| _|| _d S )Nz.jsonz.jsonlz,Loading image inputs from SteerLM Dataset...rz   r$   r   z<img src="([^"]+)"r   rH   /rG   zImage not found: z<img src="([^"]+)">r   zFormatting of z is not supported in Neva.)r   r   r!   r   r   
media_typer   r&   r   loadsr   finditergroupr   r1   r2   r3   isfilerT   subr   
ValueErrorpacked_sequencenum_image_embeddings_per_tile)r   r   r=   r   r   r   r   r   linerecordturnmatchesr   
image_name
image_pathr   r   r    r!     s2   






zNevaDataset.__init__	instancesr   c                 C   s  | j }| j}|j}|dkrdd |D }tj|dd}n|dkr(dd |D }ntd| |rXdd	lm} | j jj	}|d
d |D dd |D | j
|td\}}	}
}}d }nXtdd |D }|D ]'}||d jd  }t|d d|fdd|d< t|d d|fdt|d< qct|}| j}|d }|d }	t||j|j|j|jd\}}}
d||	dk < ||	|||
|d}|r||d< |S )Nrz   c                 S   s   g | ]}| d qS )rz   poprM   instancer   r   r    rP     r   z*NevaDataset.collate_fn.<locals>.<listcomp>r   )dimvideoc                 S   s   g | ]}| d dqS )r  Nr   r  r   r   r    rP     s    zUnsupported media type )convert_to_packedc                 S      g | ]}|d  qS )r   r   r  r   r   r    rP         c                 S   r  )r   r   r  r   r   r    rP     r  )r   r   r   media_token_indexignore_indexc                 s   s    | ]
}|d  j d V  qdS )r   r   N)shaper  r   r   r    rs     s    z)NevaDataset.collate_fn.<locals>.<genexpr>r   constantr   )r
   	eod_tokeneod_mask_lossreset_attention_maskreset_position_idsg        )r   r   attention_mask	loss_maskposition_idsmediapacked_seq_params)r=   r   r   r   catr   /nemo.collections.vlm.neva.data.sequence_packingr  media_tokentoken_indexr   r   ru   r
  Frg   r   r   r   eos_token_idr  r  r  )r   r   r=   r   r   r  r  media_token_idr   r   r  r  r  r  r   r  pad_lenbatchr   r   r   r    
collate_fn  s^   
zNevaDataset.collate_fn)Fr   )r7   r8   r9   r:   r!   r   r   r   r   r   r  r   r   r   r   r    r     s    (,r   c                '       s"  e Zd Zdeddddddddddddddd	fd
eee B deee  dee de	dee	 dedede	de	de	de	de	de	de
de
de
de	de	ddf& fddZd0d eddfd!d"Zdefd#d$Zdefd%d&Zdefd'd(Zdefd)d*Zdeeef fd+d,Zd-eeef ddfd.d/Z  ZS )1NevaPreloadedDataModuleNi         i'  TFr   i  pathsweightsr=   
seq_lengthdecoder_seq_lengthr   r   micro_batch_sizeglobal_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workersr   r   seedr   c                    sv  ddl m} |   t   t|ttfs|g}|d ur1t	|t	|ks)J t	|dkr1d }|| _
|| _|| _|| _|| _|| _|	| _|| _|| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _d| _|d u sr|d u rtd ddlm } ddl!m"} |#d}|p|ddd	| _|p|j| _| jrdd l$  fd
d}|t%_&t%| j| j||	dd| _'| (  d S )Nr   )CallbackGrouprH   zRProcessor and tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.)AutoProcessorr   zllava-hf/llava-1.5-7b-hfF)use_fastc                    s    j || jd| j| jdS )NrH   )r$  r&  num_microbatchesr%  )replaceseq_lenr2  decoder_seq_len)r   stepdataclassesr   r    custom_on_megatron_step_start4  s   zGNevaPreloadedDataModule.__init__.<locals>.custom_on_megatron_step_startcyclic)r4  r5  r&  r'  dataloader_type))nemo.lightning.callback_groupr/  get_instanceon_dataloader_init_startr   r!   rt   r   rw   rR   r"  r#  r=   r$  r%  r&  r'  r   r   r(  r)  r*  r+  r,  r-  r.  r   r   init_global_stepr   r   transformersr0  r   r   from_pretrainedr8  r   on_megatron_step_startdata_sampleron_dataloader_init_end)r   r"  r#  r=   r$  r%  r   r   r&  r'  r(  r)  r*  r+  r,  r-  r   r   r.  r/  r0  r   ry   r9  r   r7  r    r!     s^   


	z NevaPreloadedDataModule.__init__r   stagec                 C   sf   t | jdksJ dt| jd | j| j| j| j| jd| _t| jd | j| j| j| j| jd| _	d S )NrH   z*not yet support blend dataset in Neva 2.0!r   )r   r   )
rR   r"  r   r=   r   r   r   r   	_train_ds_validation_ds)r   rE  r   r   r    setupI  s"   zNevaPreloadedDataModule.setupc                 C      |  | jS r   )_create_dataloaderrF  r   r   r   r    train_dataloader\     z(NevaPreloadedDataModule.train_dataloaderc                 C   rI  r   )rJ  rG  r   r   r   r    val_dataloader_  rL  z&NevaPreloadedDataModule.val_dataloaderc                 C   rI  r   )rJ  _test_dsr   r   r   r    test_dataloaderb  rL  z'NevaPreloadedDataModule.test_dataloaderc              	   K   s@   | j j| _| j| j_t|f| j| j| jt|dt	j
jd|S )Nr  )r+  r,  r-  r  )trainerglobal_stepr?  rC  r   r+  r,  r-  r   r
   
dataloaderr   )r   datasetkwargsr   r   r    rJ  e  s   

z*NevaPreloadedDataModule._create_dataloaderc                 C   s   | j | jj| j }d|iS )zCalled when saving a checkpoint, implement to generate and save datamodule state.

        Returns:
            A dictionary containing datamodule state.

        consumed_samples)rC  compute_consumed_samplesrP  rQ  r?  )r   rU  r   r   r    
state_dictq  s   z"NevaPreloadedDataModule.state_dictrW  c                 C   sp   zddl m} W n ty   ddlm} Y nw |d }|| j_|| j_d| _|dur6|}|j|dd dS dS )zCalled when loading a checkpoint, implement to reload datamodule state given datamodule stat

        Args:
            state_dict: the datamodule state returned by ``state_dict``.

        r   )#_GLOBAL_NUM_MICROBATCHES_CALCULATORrU  rH   NF)rU  consistency_check)	(apex.transformer.pipeline_parallel.utilsrX  ModuleNotFoundErrornemo.lightning.apex_utilsrC  init_consumed_samplesprev_consumed_samplesif_first_stepupdate)r   rW  rX  rU  num_microbatch_calculatorr   r   r    load_state_dict{  s    
z'NevaPreloadedDataModule.load_state_dict)r   )r7   r8   r9   r   r   r   r   floatr   rX   boolr!   rH  r   rK  r   rM  rO  r   rJ  r   r   rW  rb  r   r   r   r   r    r    s    

	
W"
r  )r^   r   )r   F)<r   r   r1   r   r%   typingr   r   r   r   r   lightning.pytorchpytorchplnumpyrV   r   torch.nn.functionalnn
functionalr  !lightning.pytorch.utilities.typesr   r   PILr	   torch.utilsr
   torch.utils.datar   r   r   r@  r   r   2nemo.collections.nlp.modules.common.megatron.utilsr   %nemo.collections.vlm.neva.data.configr   r   +nemo.collections.vlm.neva.data.conversationr   r   0nemo.collections.vlm.neva.data.multimodal_tokensr   r   nemo.lightning.pytorch.pluginsr   r>   	Exceptionr   r   r;   r   r   r   r   r   LightningDataModuler  r   r   r   r    <module>   sD   ,
M
$
'
 j