o
    }oi(                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlm  mZ	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ eG d	d
 d
eZG dd deZedkrd dlZd dl
mZmZmZ e Zejdeddd e ZdZ eee ddZ!e"d Z#eeej$ddde!e#de#dZ%e&de'e%  e(e%D ]2\Z)Z*e&d e*D ]Z+e&de) de+ de,e*e+ ej-re*e+ j.ne*e+   qe)dkr dS qdS dS )     N)	dataclass)Optional)	VQASample)	DataBatch
DataSample)TaskEncoder)TaskEncoderConfig)_find_pattern_indicesc                   @   s2   e Zd ZU dZdZee ed< dZee ed< dS )r   zConfiguration for llama4 processing.

    This class consolidates all configuration needed for llama4 processing,
    including model paths, tokenization, image processing, and sequence packing parameters.

     stop_stringNsystem_prompt)	__name__
__module____qualname____doc__r   r   str__annotations__r    r   r   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llama4/data/task_encoder.pyr      s   
 r   c                       sJ   e Zd ZdZdefddZdedef fddZd	e	de
fd
dZ  ZS )r   a  TaskEncoder for llama4 data processing.

    This class handles the processing of different types of llama4 samples,
    including Visual Question Answering (VQA), Captioning, and Interleaved samples.
    It provides functionality for encoding individual samples, batching them together,
    and handling packed sequences for efficient processing.

    The encoder supports:
    - VQA samples: Processing image-question pairs with corresponding answers
    - [In progress] Interleaved samples: Processing alternating image and text content
    - [In progress] Similarity interleaved samples: Processing image-text pairs for similarity tasks
    - [In progress] Packed sequences: Efficient processing of multiple samples in a single sequence

    Args:
        config (TaskEncoderConfig): Configuration object containing processing parameters

    Note:
        When using packed sequences, the micro batch size must be 1, and the global batch
        size and sequence length must be adjusted accordingly.
    configc                 C   s*   || _ | j j| _| j j| _d| ji| _dS )z}Initialize the llama4 processor.

        Args:
            config (TaskEncoderConfig): Configuration for processing
        r   N)r   hf_processor	tokenizerencode_vqa_sampleencoders)selfr   r   r   r   __init__@   s
   


zTaskEncoder.__init__
batch_datareturnc                    s8   t  |}|d jdg|d jdd R  |d< |S )a  Encode a batched set of samples for model input.

        This method transforms the raw batched data into a format ready for model input, including
        generating position IDs and other necessary fields.

        Parameters:
            batch_data (DataBatch): The raw batch of data to be encoded.

        Returns:
            dict: A dictionary containing the encoded batch data, ready for model input.
        media   N)superencode_batchreshapeshape)r   r   	__class__r   r   r"   Q   s   (zTaskEncoder.encode_batchinput_samplec                 C   s  g }| j jr|d| j jd t|jtr|jn|jg}t|jtr&|jn|jg}tt|t|}t	|D ]&}|| 
d| j j}|| j jd |d || j jd || d q7| jj|dd}| j|j|dd	}	|	d
 d}
|	d}t|
| j j}d}|D ]>}|| j jpd }| jj|ddd
 }tj||
jd}t|
||\}}|dkr|
|| |||< |}qtd||
||  |
dd  }
|dd  }t|
}| j jr|| j j d | j j | j j }|| }|dkrt|
d|fdd}
t|d|fd| j j}tj |tj!d}d||| j jk< |dur6|" dkr6|j#|
jtj$d}|}n
tj%dtj$|
jd}t&|j'|j(|j)|j*||
|||d	S )a  Encode a VQA sample into a DataSample format.

        Args:
            input_sample (VQASample): Input VQA sample containing image, context and answers

        Returns:
            DataSample: Encoded sample with processed image, tokens, labels and loss mask
        system)rolecontentz<image>r      F)tokenizept)imagestextreturn_tensors	input_idspixel_valuesr
   )add_special_tokens)devicezUnable to find answer segment in the tokenized conversation. Skipping labeling for this and subsequent answers. Details: 
- Processed Text: %s
- Tokens: %s
- Target Answer Tokens: %s
- Search Start Index: %dNr   constant)dtypeg        )r4   r6   )r      P  r8   )r6   r4   )	__key____restore_key____subflavor____subflavors__r.   tokenslabels	loss_maskseqlen)+r   r   append
isinstancecontextlistanswersminlenrangereplaceimage_tokenrolesr   apply_chat_templateimagesqueezegettorch	full_likeignore_place_holderr   r   tensorr4   r	   loggingwarning
contiguouspad_to_multiple_ofFpad	ones_likefloatnumeltobfloat16emptyr   r9   r:   r;   r<   )r   r'   messagescontextsrE   
min_lengthicontext_with_placeholderconverted_messagesoutputsr=   r.   r>   search_start_indexansweranswer_with_stopanswer_tokensanswer_tokens_tensoranswer_start
answer_endr@   seqlen_paddedpad_lenr?   processed_imager   r   r   r   a   s~   	


zTaskEncoder.encode_vqa_sample)r   r   r   r   r   r   r   dictr"   r   r   r   __classcell__r   r   r%   r   r   *   s
    r   __main__)WorkerConfig
get_loaderget_train_datasetz--data_pathTzpath to the dataset directory)typerequiredhelpz)meta-llama/Llama-4-Scout-17B-16E-Instruct)hf_path)r      d   )
batch_sizeshuffle_buffer_sizemax_samples_per_sequencetask_encoderworker_config)r   zData loader length: z2==================================================zbatch index z 'z' shape r    )/rT   dataclassesr   typingr   rP   torch.nn.functionalnn
functionalrX   megatron.energonr   &nemo.collections.vlm.data.task_encoderr   r   r   BaseTaskEncoderr   BaseTaskEncoderConfignemo.collections.vlm.data.utilsr	   r   argparsert   ru   rv   ArgumentParserparseradd_argumentr   
parse_argsargsmodel_idr   default_worker_configr   	data_pathtrain_loaderprintrG   	enumerateindex
each_batchkeyrB   Tensorr$   r   r   r   r   <module>   sh    %
 "