o
    }oiN                  	   @   s,  d dl Z d dlmZmZ d dlmZmZmZ d dlZd dl	m
  mZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZ d dlmZmZ d dlmZ d d	lm Z m!Z!m"Z"m#Z# d d
l$m%Z% eG dd deZ&eG dd deZ'eG dd dZ(G dd deeeeeef e&e'e)f Z*dS )    N)	dataclassfield)ListOptionalUnion)PackedSeqParams)BatchCaptioningSampleDefaultTaskEncoderInterleavedSampleSampleSimilarityInterleavedSample	VQASamplegeneric_batch)	stateless)AutoImageProcessorAutoProcessor)AutoTokenizer)_find_pattern_indicesconvert_to_packedgreedy_knapsackpredict_seq_len)loggingc                   @   s   e Zd ZU dZedd dZejed< edd dZ	ejed< edd dZ
ejed	< ed
d dZejed< edd dZejed< dZee ed< edd dZeed< dS )
DataSamplea>  DataSample for multimodal data.

    This class represents a single data sample in a multimodal dataset, containing
    both image and text data along with their associated labels and masks.

    Attributes:
        images (torch.Tensor): Input images with shape (N, C, H, W), where N is typically 1
            for a single sample, C is channels, H is height, and W is width.
        tokens (torch.Tensor): Input token IDs for text data.
        labels (torch.Tensor): Target labels for the tokens.
        loss_mask (torch.Tensor): Mask indicating which tokens should contribute to the loss.
        position_ids (torch.Tensor): Position embeddings for the tokens.
        packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence processing.
        seqlen (int): Length of the sequence before padding.
    c                   C   
   t dS Nr   torchempty r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/data/task_encoder.py<lambda>:      
 zDataSample.<lambda>default_factoryimagesc                   C      t jdt jdS Nr   dtyper   r   longr   r   r   r    r!   ;       tokensc                   C   r&   r'   r*   r   r   r   r    r!   <   r,   labelsc                   C   r&   r'   r   r   floatr   r   r   r    r!   =   r,   	loss_maskc                   C   r&   r'   r/   r   r   r   r    r!   >   r,   position_idsNpacked_seq_paramsc                   C   s   dS r   r   r   r   r   r    r!   @   s    seqlen)__name__
__module____qualname____doc__r   r%   r   Tensor__annotations__r-   r.   r1   r2   r3   r   r   r4   intr   r   r   r    r   (   s   
 r   c                   @   s   e Zd ZU dZedd dZejed< edd dZ	ejed< edd dZ
ejed	< ed
d dZejed< edd dZejed< dZee ed< dS )	DataBatcha7  DataBatch for multimodal data.

    This class represents a batch of data samples in a multimodal dataset. It maintains
    the same structure as DataSample but with an additional batch dimension.

    Attributes:
        images (torch.Tensor): Batched input images with shape (B, N, C, H, W), where B is
            batch size, N is typically 1, C is channels, H is height, and W is width.
        tokens (torch.Tensor): Batched input token IDs with shape (B, L), where L is sequence length.
        labels (torch.Tensor): Batched target labels with shape (B, L).
        loss_mask (torch.Tensor): Batched loss masks with shape (B, L).
        position_ids (torch.Tensor): Batched position embeddings with shape (B, L).
        packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence processing.
    c                   C   r   r   r   r   r   r   r    r!   T   r"   zDataBatch.<lambda>r#   r%   c                   C   r&   r'   r*   r   r   r   r    r!   U   r,   r-   c                   C   r&   r'   r*   r   r   r   r    r!   V   r,   r.   c                   C   r&   r'   r/   r   r   r   r    r!   W   r,   r1   c                   C   r&   r'   r/   r   r   r   r    r!   X   r,   r2   Nr3   )r5   r6   r7   r8   r   r%   r   r9   r:   r-   r.   r1   r2   r3   r   r   r   r   r   r    r<   C   s   
 r<   c                   @   s   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee ed< dZeed< dZeed	< d
Zeed< dZee ed< edd dZee ed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZee ed< dd ZdS )TaskEncoderConfigaV  Configuration for multimodal processing.

    This class consolidates all configuration needed for multimodal processing,
    including model paths, tokenization, image processing, and sequence packing parameters.

    Args:
        hf_path (str, optional): HuggingFace model path used to load tokenizer and image processor.
        tokenizer (AutoTokenizer, optional): Pre-initialized tokenizer instance.
        image_processor (AutoImageProcessor, optional): Pre-initialized image processor instance.

    Note:
        Either hf_path or both tokenizer and image_processor must be provided.
    Nhf_path	tokenizerimage_processor<image>image_token_stri8image_token_idiignore_place_holderz</s>stop_stringc                   C   s   ddgS )Nuser	assistantr   r   r   r   r    r!   x   s    zTaskEncoderConfig.<lambda>r#   roleschat_templateTimage_following_textFpacked_sequencei@  num_image_embeddings_per_tilepacked_sequence_size@   pad_to_multiple_ofc                 C   s   | j s| jr	| jstd| j r`t| j | _| js)t| j | _t	d| j   | js<t
| j | _t	d| j   t| jD ]}tt| j|s_|ds_t| |s_t| |t| j| qA| jdu rhd| _| jrt| jdu rvtddS dS )zInitialize tokenizer and image processor if not provided.

        Raises:
            ValueError: If neither hf_path nor both tokenizer and image_processor are provided.
        zEEither hf_path or both tokenizer and image_processor must be providedzLoaded tokenizer from zLoaded image processor from _N zPpad_to_multiple_of must be provided when using packed sequence. We recommend 64.)r>   r?   r@   
ValueErrorr   from_pretrainedhf_processorr   r   infor   dircallablegetattr
startswithhasattrsetattrrE   rK   rP   )selfattrr   r   r    __post_init__   s0   
zTaskEncoderConfig.__post_init__)r5   r6   r7   r8   r>   r   strr:   r?   r   r@   r   rB   rC   r;   rD   rE   r   rH   r   rI   rJ   boolrK   rL   rN   rP   r_   r   r   r   r    r=   \   s"   
 r=   c                   @   s   e Zd ZdZdefddZedeee	f de
fddZd	ee
 defd
dZdedefddZdd Zedd Zdede
fddZdS )TaskEncodera  TaskEncoder for multimodal data processing.

    This class handles the processing of different types of multimodal samples,
    including Visual Question Answering (VQA), Captioning, and Interleaved samples.
    It provides functionality for encoding individual samples, batching them together,
    and handling packed sequences for efficient processing.

    The encoder supports:
    - VQA samples: Processing image-question pairs with corresponding answers
    - [In progress] Interleaved samples: Processing alternating image and text content
    - [In progress] Similarity interleaved samples: Processing image-text pairs for similarity tasks
    - [In progress] Packed sequences: Efficient processing of multiple samples in a single sequence

    Args:
        config (TaskEncoderConfig): Configuration object containing processing parameters

    Note:
        When using packed sequences, the micro batch size must be 1, and the global batch
        size and sequence length must be adjusted accordingly.
    configc                 C   s*   || _ | j j| _| j j| _d| ji| _dS )zInitialize the multimodal processor.

        Args:
            config (TaskEncoderConfig): Configuration for processing
        r   N)rc   rU   r?   encode_vqa_sampleencoders)r]   rc   r   r   r    __init__   s
   


zTaskEncoder.__init__samplereturnc                 C   s6   t |j}| j|}|std| ||d}|S )zProcess a sample based on its type.

        Args:
            sample: Input sample to process. Can be a dictionary or an object.

        Returns:
            dict: Processed sample in a standardized format
        z'No encoder implemented for sample type )input_sample)typer5   re   getrS   )r]   rg   sample_typeencoderencoded_sampler   r   r    encode_sample   s   

zTaskEncoder.encode_samplesamplesc                 C   s   | j jr-t|dkr(tdt| dt| dt| dt| d| j j d|d }nt|}t }|jD ]}t||t	|| q7|S )	z*
        Batch a list of samples.
           z^Micro batch size should be 1 when training with packed sequence, but your micro batch size is z. 
The following config is equivalent to your current setting for a packed dataset. Please update your config to the following: 
Set micro batch size to 1 (currently z1)
Set global batch size to `global_batch_size // z;` Set packed sequence length to `original_sample_seq_len * z` (currently zr) 
For details please visit https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.htmlr   )
rc   rK   lenrS   rN   r   r<   __dict__r\   rY   )r]   rp   batch_samplebatchr^   r   r   r    ru      s*   

zTaskEncoder.batch
batch_datac                 C   s   |j }d|v r|d |d< |d= |d  \}}tj|tjd}|d|d}||d< d|vr5d	|d< td
d |d D rDd	|d< |S )a  
        Encode a batched set of samples for model input.

        This method transforms the raw batched data into a format ready for model input, including
        generating position IDs and other necessary fields.

        Parameters:
        batch_data (DataBatch): The raw batch of data to be encoded.

        Returns:
        dict: A dictionary containing the encoded batch data, ready for model input.
        r%   mediar-   r(   r   rq   r2   attention_maskNc                 s   s    | ]}|d u V  qd S )Nr   ).0paramr   r   r    	<genexpr>%  s    z+TaskEncoder.encode_batch.<locals>.<genexpr>r3   )rs   sizer   aranger+   	unsqueezerepeatall)r]   rv   
batch_dictmicro_batch_size
seq_lengthr2   r   r   r    encode_batch  s   zTaskEncoder.encode_batchc              
      sp   j j  fdd|D }t||j}tt|t| }tdj j dt| dt| d|  |S )zSelects which samples will be packed together.

        NOTE: Energon dataloader calls this method internally if packing is used.
        Please see https://nvidia.github.io/Megatron-Energon/packing.html
        c                    s    g | ]}t |j jjd qS ))media_token_indexrL   )r   r-   rc   rL   ry   rg   media_token_idr]   r   r    
<listcomp>1  s    z6TaskEncoder.select_samples_to_pack.<locals>.<listcomp>z&[Seq Packing Info] - Packing seq len: z, Buffered samples: z, Total number of bins: z, Average samples per bin: )rc   rC   r   rN   roundrr   r   rV   )r]   rp   lengthspacked_samplesavg_samples_per_binr   r   r    select_samples_to_pack)  s    z"TaskEncoder.select_samples_to_packc                 C   s|   t dd |D }tdd |D dd |D dd |D d\}}}}}tddd |D d	d
|d j||||||d
S )ay  
        Function to pack a list of DataSample into a single DataBatch.

        NOTE: Energon dataloader calls this method internally if packing is used.
        Please see https://nvidia.github.io/Megatron-Energon/packing.html

        Args:
            samples: List of DataSample instances to pack into one sample.

        Returns:
            DataBatch instance.
        c                 S      g | ]}|j qS r   )r%   r   r   r   r    r   Q      z5TaskEncoder.pack_selected_samples.<locals>.<listcomp>c                 S   r   r   )r-   r   r   r   r    r   S  r   c                 S   r   r   )r.   r   r   r   r    r   T  r   c                 S   r   r   )r4   r   r   r   r    r   U  r   )r-   r.   seqlens,c                 S   r   r   )__key__)ry   sr   r   r    r   Y  r   r   Nr   )
r   __restore_key____subflavor____subflavors__r-   r.   r%   r2   r1   r3   )r   stackr   r<   joinr   )r]   rp   packed_imagespacked_tokenspacked_labelspacked_position_idspacked_loss_maskr3   r   r   r    pack_selected_samplesB  s$   z!TaskEncoder.pack_selected_samplesri   c                 C   s  g }| j jr|d| j jd t|jtr|jn|jg}t|jtr&|jn|jg}tt|t|}t	|D ]&}|| 
d| j j}|| j jd |d || j jd || d q7| j jj|ddd}dt| j j d	}	t|	|}
g }|
D ]"}|| j jkr|| j j q}t|dkr|| j j|dd
j q}tj|tjd}t|| j j }t|jtr|jn|jg}d}|D ]>}|| j jpd }| jj|dd
d }tj||jd}t|||\}}|dkr||| |||< |}qtd||||  |dd   }|dd   }t!|| j j"| j j}| j j#rO|| j j# d | j j# | j j# }|| }|dkrOt$%|d|fdd}t$%|d|fd| j j}tj|tj&d}d||| j jk< | j j'j(|j)dddd d }|*d*d}t+|j,|j-|j.|j/|||||d	S )a  Encode a VQA sample into a DataSample format.

        Args:
            input_sample (VQASample): Input VQA sample containing image, context and answers

        Returns:
            DataSample: Encoded sample with processed image, tokens, labels and loss mask
        system)rolecontentrA   r   rq   F)tokenizeadd_generation_prompt())add_special_tokensr(   rR   	input_ids)devicezUnable to find answer segment in the tokenized conversation. Skipping labeling for this and subsequent answers. Details: 
- Processed Text: %s
- Tokens: %s
- Target Answer Tokens: %s
- Search Start Index: %dNrM   constantg        pt)return_tensors
do_rescalepixel_values)	r   r   r   r   r%   r-   r.   r1   r4   )0rc   system_promptappend
isinstancecontextlistanswersminrr   rangereplaceimage_tokenrH   r?   apply_chat_templatereescaperB   splitrC   extendr   r   tensorr+   	ones_likerD   rE   r   r   r   warning
contiguousr   rL   rP   Fpadr0   r@   
preprocessimager~   r   r   r   r   r   )r]   ri   messagescontextsr   
min_lengthicontext_with_placeholderconversation_promptregex_patternchunkstokenized_chunkschunkr-   r.   search_start_indexansweranswer_with_stopanswer_tokensanswer_tokens_tensoranswer_start
answer_endr4   seqlen_paddedpad_lenr1   r   processed_imager   r   r    rd   e  s   


zTaskEncoder.encode_vqa_sampleN)r5   r6   r7   r8   r=   rf   r   r   dictobjectr   ro   r   r<   ru   r   r   r   r   rd   r   r   r   r    rb      s    
"rb   )+r   dataclassesr   r   typingr   r   r   r   torch.nn.functionalnn
functionalr   megatron.core.packed_seq_paramsr   megatron.energonr   r	   r
   r   r   r   r   r   "megatron.energon.task_encoder.baser   transformersr   r   "nemo.collections.common.tokenizersr   nemo.collections.vlm.data.utilsr   r   r   r   
nemo.utilsr   r   r<   r=   r   rb   r   r   r   r    <module>   s6   (

M