o
    }oi#                     @   s   d dl Z d dlmZmZ d dlmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ eG d	d
 d
eZeG dd deZG dd deZdS )    N)	dataclassfield)Optional)	VQASample)	DataBatch
DataSample)TaskEncoder)TaskEncoderConfig)_find_pattern_indicesc                   @   s2   e Zd ZU dZdZee ed< dZee ed< dS )r	   zConfiguration for Gemma 3 processing.

    This class consolidates all configuration needed for Gemma 3 processing,
    including model paths, tokenization, image processing, and sequence packing parameters.

     stop_stringNsystem_prompt)	__name__
__module____qualname____doc__r   r   str__annotations__r    r   r   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/gemma3vl/data/task_encoder.pyr	      s   
 r	   c                   @   sZ   e Zd ZU dZedd dZejed< edd dZ	ejed< edd dZ
ejed	< d
S )Gemma3DataSamplezData Sample for Gemma3c                   C      t jdt jdS Nr   dtypetorchemptylongr   r   r   r   <lambda>.       zGemma3DataSample.<lambda>)default_factory	input_idsc                   C   r   r   r   r   r   r   r   r   /   r    position_idsc                   C   s
   t dS )Nr   )r   r   r   r   r   r   r   0   s   
 pixel_valuesN)r   r   r   r   r   r"   r   Tensorr   r#   r$   r   r   r   r   r   *   s
   
 r   c                       sJ   e Zd ZdZdefddZdedef fddZd	e	de
fd
dZ  ZS )r   a  TaskEncoder for Gemma 3 data processing.

    This class handles the processing of different types of Gemma 3 samples,
    including Visual Question Answering (VQA), Captioning, and Interleaved samples.
    It provides functionality for encoding individual samples, batching them together,
    and handling packed sequences for efficient processing.

    The encoder supports:
    - VQA samples: Processing image-question pairs with corresponding answers
    - [In progress] Interleaved samples: Processing alternating image and text content
    - [In progress] Similarity interleaved samples: Processing image-text pairs for similarity tasks
    - [In progress] Packed sequences: Efficient processing of multiple samples in a single sequence

    Args:
        config (TaskEncoderConfig): Configuration object containing processing parameters

    Note:
        When using packed sequences, the micro batch size must be 1, and the global batch
        size and sequence length must be adjusted accordingly.
    configc                 C   s*   || _ | j j| _| j j| _d| ji| _dS )z~Initialize the Gemma 3 processor.

        Args:
            config (TaskEncoderConfig): Configuration for processing
        r   N)r&   hf_processor	tokenizerencode_vqa_sampleencoders)selfr&   r   r   r   __init__I   s
   


zTaskEncoder.__init__
batch_datareturnc                    s8   t  |}|d jdg|d jdd R  |d< |S )a  Encode a batched set of samples for model input.

        This method transforms the raw batched data into a format ready for model input, including
        generating position IDs and other necessary fields.

        Parameters:
            batch_data (DataBatch): The raw batch of data to be encoded.

        Returns:
            dict: A dictionary containing the encoded batch data, ready for model input.
        media   N)superencode_batchreshapeshape)r+   r-   	__class__r   r   r3   X   s   (zTaskEncoder.encode_batchinput_samplec              
   C   s  g }| j jr|d| j jd t|jtr|jn|jg}t|jtr&|jn|jg}tt|t|}t	|D ]&}|| 
d| j j}|| j jd |d || j jd || d q7| jj|dd}| j|j|dd	did
}	|	d d}
|	d}t|
| j j}d}|D ]>}|| j jpd }| jj|ddd }tj||
jd}t|
||\}}|dkr|
|| |||< |}qtd||
||  |
dd  }
|dd  }t|
}| j jr|| j j d | j j | j j }|| }|dkrt|
d|fdd}
t|d|fd| j j}tj |tj!d}d||| j jk< |dur9|" dkr9|j#|
jtj$d}|}nd}t%|j&|j'|j(|j)||
||dS )a  Encode a VQA sample into a DataSample format.

        Args:
            input_sample (VQASample): Input VQA sample containing image, context and answers

        Returns:
            DataSample: Encoded sample with processed image, tokens, labels and loss mask
        system)rolecontentz<image>r      F)tokenizept
do_rescale)imagestextreturn_tensorsimages_kwargsr"   r$   r   )add_special_tokens)devicezUnable to find answer segment in the tokenized conversation. Skipping labeling for this and subsequent answers. Details: 
- Processed Text: %s
- Tokens: %s
- Target Answer Tokens: %s
- Search Start Index: %dNr0   constantr   g        )rE   r   )__key____restore_key____subflavor____subflavors__r$   r"   labels	loss_mask)*r&   r   append
isinstancecontextlistanswersminlenrangereplaceimage_tokenrolesr'   apply_chat_templateimagesqueezegetr   	full_likeignore_place_holderr   r(   tensorrE   r
   loggingwarning
contiguouspad_to_multiple_ofFpad	ones_likefloatnumeltobfloat16r   rG   rH   rI   rJ   )r+   r8   messagescontextsrQ   
min_lengthicontext_with_placeholderconverted_messagesoutputstokensr@   rK   search_start_indexansweranswer_with_stopanswer_tokensanswer_tokens_tensoranswer_start
answer_endseqlenseqlen_paddedpad_lenrL   processed_imager   r   r   r)   h   s   		


zTaskEncoder.encode_vqa_sample)r   r   r   r   r	   r,   r   dictr3   r   r   r)   __classcell__r   r   r6   r   r   3   s
    r   )r_   dataclassesr   r   typingr   r   torch.nn.functionalnn
functionalrc   megatron.energonr   &nemo.collections.vlm.data.task_encoderr   r   r   BaseTaskEncoderr	   BaseTaskEncoderConfignemo.collections.vlm.data.utilsr
   r   r   r   r   r   <module>   s   