o
    wi?3                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZ d dlZd dlm  mZ d dlmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZm Z  d d	l!m"Z"m#Z# d d
l$m%Z% G dd de#Z&G dd dej'Z(dS )    N)AnyDictListOptionalSequence)EVAL_DATALOADERSTRAIN_DATALOADERS)data)
DataLoaderdefault_collate)get_ltor_masks_and_position_idscreate_vision_mask_tensor)
DataConfigImageDataConfig)IGNORE_INDEXLazySupervisedDataset)MegatronDataSamplerc                       s`   e Zd ZdZ fddZdeeejf fddZ	dd Z
d	ee deeejf fd
dZ  ZS )MLlamaDatasetz#Dataset for supervised fine-tuning.c                    s*  | drt |||| n| drt d ||| td |jdkr|j}t|dD ]W}t	|}g |d< |d D ]A}	t
d|	d }
|
D ])}|d	d
d }tj||}tj|sltd|  qJ|d | qJt
dd|	d |	d< q>| j| q/ntd| d|| _d S )Nz.jsonz.jsonlz,Loading image inputs from SteerLM Dataset...imagerconversationsz<img src=["\']([^"\']+)["\']value   /zImage not found: z<img src=["']([^"']+)["']<image>zFormatting of z is not supported in MLlama.)endswithsuper__init__loggingwarning
media_typeimage_folderopenjsonloadsrefinditergroupsplitospathjoinisfileappendsublist_data_dict
ValueErrorsequence_length)self	data_pathdata_config	tokenizerimage_processorr3   r#   linerecordturnmatchesmatch
image_name
image_path	__class__ g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/mllama/data/preloaded.pyr   '   s0   
	




zMLlamaDataset.__init__returnc                 C   s^   | j | }| j|| jdkd}|dd}| |\}}| |}tdi |||d}|S )Nplain)	use_plainr   z	<|image|>)tokenslabelsrB   )r1   _apply_prompt_templatesconv_templatereplace_tokenize_and_label_process_imagesdict)r4   isourcer   rG   rH   
image_dict	data_dictrB   rB   rC   __getitem__Q   s   


zMLlamaDataset.__getitem__c              	   C   s   g }d|v r3t |d ts|d g|d< |d D ]}| j|}|d u r-td| d || qt|dkrL| jj	|dd}dd |
 D }|S ttd	d
d| jjd | jjd tjdgtjddgd}|S )Nr   zImage z could not be found!r   pt)return_tensorsc                 S   s"   i | ]\}}|d v r||d qS )pixel_valuesaspect_ratio_ids	num_tilesr   rB   ).0kvrB   rB   rC   
<dictcomp>l   s    z1MLlamaDataset._process_images.<locals>.<dictcomp>r         heightwidth)dtyperV   )
isinstancelistimage_loader
open_imager    r!   r/   lenr8   
preprocessitemsrN   torchzerossizetensorlong)r4   rP   images
image_filer   rQ   rB   rB   rC   rM   _   s.   zMLlamaDataset._process_images	instancesc                 C   s  | j }tdd |D d d d d }|| jkr)td| d| j d | j}tdd |D }|D ]o}||d	 jd
  }t|d	 d
|fdd
|d	< t|d d
|fdt	|d< ||d jd
  }t|d d
d
d
d
d
d
d
d
d
|f
dd
|d< t|d d
t|d d
fdd
|d< tt
|d d
t|d d
fdd
|d< q4dd |D }t|}	| j}
|	d	 }|	d }t||
j|j|j|jd\}}}d||d
k < |||	d ||	d ||	d ||d	}	|	S )Nc                 s       | ]
}|d  j d V  qdS )rG   r   NshaperZ   instancerB   rB   rC   	<genexpr>|       z+MLlamaDataset.collate_fn.<locals>.<genexpr>r   @   zTruncating sequence length z to .c                 s   rr   )rW   r   Nrs   ru   rB   rB   rC   rw      rx   rG   r   constantrH   rW   rX   rY   c                 S   s   g | ]	}t |d  dqS )rG   i  r   ru   rB   rB   rC   
<listcomp>   s    z,MLlamaDataset.collate_fn.<locals>.<listcomp>)r	   	eod_tokeneod_mask_lossreset_attention_maskreset_position_idsg        )	rG   rH   batch_imagesbatch_masks
num_chunksattention_maskrX   	loss_maskposition_ids)r6   maxr3   r    r!   
seq_lengthrt   Fpadr   rj   rm   r   r7   r   eos_token_idr~   r   r   )r4   rq   r6   max_lenmax_num_concurrent_mediarv   pad_lenpad_num_imagesr   batchr7   rG   rH   r   r   r   rB   rB   rC   
collate_fnz   sX   "
  
zMLlamaDataset.collate_fn)__name__
__module____qualname____doc__r   r   strrj   TensorrS   rM   r   r   __classcell__rB   rB   r@   rC   r   $   s    *(r   c                %       s  e Zd Zdeddddddddddddddfd	eee B d
eee  dee de	dee	 dedede	de	de	de	de	de	de
de
de
de	ddf$ fddZd.deddfdd Zdefd!d"Zdefd#d$Zdefd%d&Zdefd'd(Zdeeef fd)d*Zd+eeef ddfd,d-Z  ZS )/MLlamaPreloadedDataModuleNi   r^      i'  TFi  pathsweightsr6   r   decoder_seq_lengthr7   r8   micro_batch_sizeglobal_batch_sizenum_train_samplesnum_val_samplesnum_test_samplesnum_workers
pin_memorypersistent_workersuse_packed_sequenceseedrD   c                    s&  t    t|ttfs|g}|d ur%t|t|ksJ t|dkr%d }|| _|| _|| _|| _	|| _
|| _|	| _|| _|| _|
| _|| _|| _|| _|| _|| _|| _|| _d| _|| _|| _|d u si|d u rtd ddlm} |d}|p}|j| _|p|j| _t| j	| j
||	dd| _d S )Nr   r   zbProcessor and tokenizer are not provided! Fall back to `meta-llama/Llama-3.2-11B-Vision-Instruct`.)AutoProcessorz(meta-llama/Llama-3.2-11B-Vision-Instructcyclic)seq_lendecoder_seq_lenr   r   dataloader_type)r   r   rc   rd   tuplerg   r   r   r6   r   r   r   r   r7   r8   r   r   r   r   r   r   r   r   init_global_stepr    r!   transformersr   from_pretrainedr   data_sampler)r4   r   r   r6   r   r   r7   r8   r   r   r   r   r   r   r   r   r   r   r   	processorr@   rB   rC   r      sT   

z"MLlamaPreloadedDataModule.__init__ stagec                 C   sd   t | jdksJ d| jrd S t| jd | j| j| j| j| _t| jd | j| j| j| j| _	d S )Nr   z,not yet support blend dataset in MLlama 2.0!r   )
rg   r   r   r   r6   r7   r8   r   	_train_ds_validation_ds)r4   r   rB   rB   rC   setup   s   
zMLlamaPreloadedDataModule.setupc                 C      |  | jS N)_create_dataloaderr   r4   rB   rB   rC   train_dataloader     z*MLlamaPreloadedDataModule.train_dataloaderc                 C   r   r   )r   r   r   rB   rB   rC   val_dataloader  r   z(MLlamaPreloadedDataModule.val_dataloaderc                 C   r   r   )r   _test_dsr   rB   rB   rC   test_dataloader
  r   z)MLlamaPreloadedDataModule.test_dataloaderc              	   K   s@   | j j| _| j| j_t|f| j| j| jt|dt	j
jd|S )Nr   )r   r   r   r   )trainerglobal_stepr   r   r
   r   r   r   getattrr	   
dataloaderr   )r4   datasetkwargsrB   rB   rC   r     s   

z,MLlamaPreloadedDataModule._create_dataloaderc                 C   s   | j | jj| j }d|iS )zCalled when saving a checkpoint, implement to generate and save datamodule state.

        Returns:
            A dictionary containing datamodule state.

        consumed_samples)r   compute_consumed_samplesr   r   r   )r4   r   rB   rB   rC   
state_dict  s   z$MLlamaPreloadedDataModule.state_dictr   c                 C   sp   zddl m} W n ty   ddlm} Y nw |d }|| j_|| j_d| _|dur6|}|j|dd dS dS )zCalled when loading a checkpoint, implement to reload datamodule state given datamodule stat

        Args:
            state_dict: the datamodule state returned by ``state_dict``.

        r   )#_GLOBAL_NUM_MICROBATCHES_CALCULATORr   r   NF)r   consistency_check)	(apex.transformer.pipeline_parallel.utilsr   ModuleNotFoundErrornemo.lightning.apex_utilsr   init_consumed_samplesprev_consumed_samplesif_first_stepupdate)r4   r   r   r   num_microbatch_calculatorrB   rB   rC   load_state_dict#  s    
z)MLlamaPreloadedDataModule.load_state_dict)r   )r   r   r   r   r   r   r   floatr   intboolr   r   r   r   r   r   r   r
   r   r   r   r   r   r   rB   rB   r@   rC   r      sz    

	
D"
r   ))r%   r    r+   r'   typingr   r   r   r   r   lightning.pytorchpytorchplrj   torch.nn.functionalnn
functionalr   !lightning.pytorch.utilities.typesr   r   torch.utilsr	   torch.utils.datar
   r   2nemo.collections.nlp.modules.common.megatron.utilsr   'nemo.collections.vlm.mllama.model.utilsr   %nemo.collections.vlm.neva.data.configr   r   (nemo.collections.vlm.neva.data.preloadedr   r   nemo.lightning.pytorch.pluginsr   r   LightningDataModuler   rB   rB   rB   rC   <module>   s&    