o
    }oi8                  	   @   s   d dl mZmZmZ d dlZd dlmZmZmZm	Z	m
Z
mZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZ d dlmZ G dd	 d	eee
eee	f eeef ZdS )
    )DictListUnionN)CaptioningSampleDefaultTaskEncoderInterleavedSampleSimilarityInterleavedSample	VQASample
batch_listbatch_pad_stack)	stateless)ImageTextRawBatchImageTextSamplePackedImageTextRawBatchPackedImageTextSample)InterleavedSampleEncoderSampleEncoderSimilarityInterleavedEncoderVQASampleEncoder)loggingc                   @   s   e Zd ZdZ				dddZded	ed
dfddZede	e
eeef d
efddZdee	eef  d
e	eef fddZded
efddZdd Zedd ZdS )MultiModalTaskEncodera+  
    A task encoder that handles multiple modalities including VQA, captioning, interleaved samples,
    and similarity interleaved samples.

    This class extends the DefaultTaskEncoder and provides a flexible mechanism to handle and encode
    different types of multimodal data. Support for VQA, captioning and interleaved samples is provided by default.
    It supports registering custom encoders for each sample type
    and provides methods for encoding individual samples, batching them, and further processing the batch
    for model input.
    F@  Nc                 C   sd   || _ || _|| _|| _|| _|| _tjt||||dt	jt
||||dtjt||||di| _dS )a  
        Initialize the MultiModalTaskEncoder with specific encoders for different sample types.

        Parameters:
        tokenizer (Tokenizer): The tokenizer used for processing textual components across sample types.
        image_processor (ImageProcessor): The image processor responsible for preprocessing image data.
        multimodal_sample_config (MultiModalSampleConfig): Configuration object defining properties and
            requirements for multimodal samples.
        packed_sequence (bool, optional): Flag indicating whether packed sequences are used. Default is False.
        packed_sequence_size (int, optional): The size of packed sequences, used when `packed_sequence` is True.
            Default is -1.
        num_image_embeddings_per_tile (int, optional): Number of image embeddings per image tile. Determines
            the granularity of image features. Default is 576.
        )	tokenizerimage_processormultimodal_sample_configimage_tag_typeN)r   sample_configpacked_sequencenum_image_embeddings_per_tiler   packed_sequence_sizer	   __name__r   r   r   r   r   encoders)selfr   r   r   r   r    r   r    r$   i/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/multimodal/data/energon/task_encoder.py__init__?   s2   
zMultiModalTaskEncoder.__init__sample_typeencoderreturnc                 C   s   || j |< dS )a}  
        Registers a custom encoder for a specific sample type.

        This method allows adding or overriding encoders for specific sample types.

        Parameters:
        sample_type (str): The name of the sample type for which the encoder is being registered.
        encoder (SampleEncoder): The custom encoder instance that will handle the specified sample type.
        N)r"   )r#   r'   r(   r$   r$   r%   register_encoderr   s   
z&MultiModalTaskEncoder.register_encodersamplec                 C   s<   t |j}| j|}|std| |j|t d}|S )aX  
        Encode an individual sample based on its type.

        This method selects the appropriate encoder based on the sample type and encodes the sample
        into a format suitable for further processing.

        Parameters:
        sample (Union[VQASample, InterleavedSample, SimilarityInterleavedSample, CaptioningSample]):
            The sample to be encoded. The sample type is used to determine the appropriate encoder.

        Returns:
        ImageTextSample: The encoded sample.

        Raises:
        NotImplementedError: If no encoder is registered for the sample type.
        z'No encoder implemented for sample type )input_sampleoutput_sample)typer!   r"   getNotImplementedErrorencoder   )r#   r+   r'   r(   encoded_sampler$   r$   r%   encode_sample~   s   
z#MultiModalTaskEncoder.encode_samplesamplesc                 C   sf  | j r>t|dkr&tdt| dt| dt| dt| d| j d|d }t|j|j|j|j|j	|j
|j|jd	S g g g g g f\}}}}}g }|D ]+}||j ||j ||j ||j ||j	 |jd
ury||j qNt|dkrd
}t|}	t|}
|
jdkr|
jdg|
jdd
 R  }
t|}t|}t|}t|	|
||||dS )a  
        Batch a list of encoded samples into a single raw batch.

        This method collates individual encoded samples into a batch, preparing them for model input.

        Parameters:
        samples (List[ImageTextSample]): A list of encoded samples to be batched.

        Returns:
        ImageTextRawBatch: The batched data, including images, tokens, labels, and loss masks.
           z^Micro batch size should be 1 when training with packed sequence, but your micro batch size is z. 
The following config is equivalent to your current setting for a packed dataset. Please update your config to the following: 
Set micro batch size to 1 (currently z1)
Set global batch size to `global_batch_size // z;` Set packed sequence length to `original_sample_seq_len * z` (currently zr) 
For details please visit https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.htmlr   )__keys__imagestokenslabels	loss_maskposition_idspacked_seq_paramsnum_image_tilesN   r      )r6   r7   r8   r9   r:   r=   )r   len
ValueErrorr    r   __key__r7   r8   r9   r:   r;   r<   r=   appendextendr
   r   ndimreshapeshaper   )r#   r4   r+   keysr7   r8   r9   r:   batch_num_image_tiles
batch_keysbatch_imagesbatch_prompt_tokensbatch_labelsbatch_loss_maskr$   r$   r%   batch   sl   

zMultiModalTaskEncoder.batch
batch_datac                 C   s   |j }d|v r|d |d< |d= d|v o|d duo|d dk}d|v o/|d duo/|d dk}|r8|r8J d|rC|d |d< |d= |d  \}}tj|tjd	}|d|d
}||d< d|vrhd|d< |S )a  
        Encode a batched set of samples for model input.

        This method transforms the raw batched data into a format ready for model input, including
        generating position IDs and other necessary fields.

        Parameters:
        batch_data (ImageTextRawBatch): The raw batch of data to be encoded.

        Returns:
        dict: A dictionary containing the encoded batch data, ready for model input.
        r7   mediar=   Nr   num_media_tileszJnum_image_tiles and num_media_tiles should not be present at the same timer8   )dtyper5   r;   attention_mask)__dict__sizetorcharangelong	unsqueezerepeat)r#   rP   
batch_dictis_num_image_tiles_presentis_num_media_tiles_presentmicro_batch_size
seq_lengthr;   r$   r$   r%   encode_batch   s<   



z"MultiModalTaskEncoder.encode_batchc              
      s   ddl m}m jjj  fdd|D }|||j}tt|t| }t	
dj dt| dt| d|  |S )	zSelects which samples will be packed together.

        NOTE: Energon dataloader calls this method internally if packing is used.
        Please see https://nvidia.github.io/Megatron-Energon/packing.html
        r   )greedy_knapsackpredict_seq_lenc                    s   g | ]}|j  jd qS ))media_token_indexr   )r8   r   .0r+   media_token_idrc   r#   r$   r%   
<listcomp>  s    z@MultiModalTaskEncoder.select_samples_to_pack.<locals>.<listcomp>z&[Seq Packing Info] - Packing seq len: z, Buffered samples: z, Total number of bins: z, Average samples per bin: )/nemo.collections.vlm.neva.data.sequence_packingrb   rc   r   image_tokentoken_idr    roundr@   r   info)r#   r4   rb   lengthspacked_samplesavg_samples_per_binr$   rg   r%   select_samples_to_pack  s"   

z,MultiModalTaskEncoder.select_samples_to_packc                 C   s   ddl m} tjdd |D dd}| jjj}|dd |D dd |D | j|| jjd\}}}}}	g }
|D ]}|j	d	urC|

|j	 q6t|
dkrLd	}
td
dd |D d||||||	|
d	S )a  
        Function to pack a list of ImageTaskSample into a single ImageTaskSamplePacked.

        NOTE: Energon dataloader calls this method internally if packing is used.
        Please see https://nvidia.github.io/Megatron-Energon/packing.html

        Args:
            samples: List of ImageTaskSample instances to pack into one sample.

        Returns:
            ImageTaskSamplePacked instance.
        r   )convert_to_packedc                 S      g | ]}|j qS r$   )r7   re   r$   r$   r%   ri   5      z?MultiModalTaskEncoder.pack_selected_samples.<locals>.<listcomp>)dimc                 S   rt   r$   )r8   re   r$   r$   r%   ri   8  ru   c                 S   rt   r$   )r9   re   r$   r$   r%   ri   9  ru   )r8   r9   r   rd   ignore_indexN,c                 S   rt   r$   )rB   )rf   sr$   r$   r%   ri   F  ru   r$   )	rB   __restore_key__r8   r9   r7   r;   r:   r<   r=   )rj   rs   rW   catr   rk   rl   r   ignore_place_holderr=   rD   r@   r   join)r#   r4   rs   packed_imagesrh   packed_tokenspacked_labelspacked_position_idspacked_loss_maskr<   rI   r+   r$   r$   r%   pack_selected_samples%  s8   

z+MultiModalTaskEncoder.pack_selected_samples)Fr   r   N)r!   
__module____qualname____doc__r&   strr   r*   r   r   r	   r   r   r   r   r3   r   r   r   r   rO   dictra   rr   r   r$   r$   r$   r%   r   ,   s.    
3

F,r   )typingr   r   r   rW   megatron.energonr   r   r   r   r	   r
   r   "megatron.energon.task_encoder.baser   /nemo.collections.multimodal.data.energon.configr   r   r   r   7nemo.collections.multimodal.data.energon.sample_encoderr   r   r   r   
nemo.utilsr   r   r   r$   r$   r$   r%   <module>   s    $	
