o
    }oi                     @   s   d dl mZmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZmZ eG dd deZeG dd	 d	eZeG d
d deZeG dd deZdS )    )	dataclassfield)ListOptionalUnionNPackedSeqParams)ImageTextRawBatchImageTextSamplec                   @   sH   e Zd ZU dZdZeed< edd dZe	j
ed< dZee	j
 ed	< dS )
LlavaNextTextSampleaz  
    Sample type for LLaVA-Next, extending ImageTextSample to support tiled image data.

    This class adds additional attributes for handling high-resolution images processed as tiles,
    along with metadata about the tiled images.

    Attributes:
        num_media_tiles (int): The number of tiles used to represent the high-resolution image.
        image_sizes (torch.Tensor): A tensor representing the sizes of the tiled images.
        attention_mask (Optional[torch.Tensor]): An optional attention mask for the sample,
        used to determine which tokens or tiles are attended to during processing. Defaults to None.
    r   num_media_tilesc                   C   
   t g S Ntorchtensor r   r   _/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llava_next/data/sample.py<lambda>'      
 zLlavaNextTextSample.<lambda>default_factoryimage_sizesNattention_mask)__name__
__module____qualname____doc__r   int__annotations__r   r   r   r   r   r   r   r   r   r   r      s
   
 r   c                   @   s^   e Zd ZU dZdZeeeeef df e	d< e
dd dZeje	d< e
d	d dZee	d
< dS )PackedLlavaNextTextSamplez(Sample type for packed image text sampler   .__restore_key__c                   C      t jdt jdS Nr   )dtyper   emptyfloatr   r   r   r   r   0       z"PackedLlavaNextTextSample.<lambda>r   position_idsc                   C      t  S r   r   r   r   r   r   r   1       packed_seq_paramsN)r   r   r   r   r!   tupler   strr   r   r   r)   r   Tensorr,   r   r   r   r   r   r    +   s
   
 r    c                   @   sR   e Zd ZU dZeedZee e	d< edd dZ
eje	d< dZeej e	d< dS )	LlavaNextTextRawBatchaI  
    Batch type for raw LLaVA-Next samples, supporting tiled image data.

    This class aggregates multiple `LlavaNextTextSample` instances into a batch for processing.
    It includes attributes for managing tiled images and associated metadata for each sample in the batch.

    Attributes:
        num_media_tiles (List[int]): A list containing the number of tiles for each image in the batch.
        image_sizes (torch.Tensor): A tensor containing the sizes of all tiled images in the batch.
        attention_mask (Optional[torch.Tensor]): Attention mask. Defaults to None.
    r   r   c                   C   r   r   r   r   r   r   r   r   C   r   zLlavaNextTextRawBatch.<lambda>r   Nr   )r   r   r   r   r   listr   r   r   r   r   r   r   r   r   r   r   r   r   r0   4   s
   
 r0   c                   @   s@   e Zd ZU dZedd dZejed< edd dZ	e
ed< dS )	PackedLlavaNextTextRawBatchz$Sample type for image text raw batchc                   C   r"   r#   r%   r   r   r   r   r   K   r(   z$PackedLlavaNextTextRawBatch.<lambda>r   r)   c                   C   r*   r   r   r   r   r   r   r   L   r+   r,   N)r   r   r   r   r   r)   r   r/   r   r,   r   r   r   r   r   r2   G   s   
 r2   )dataclassesr   r   typingr   r   r   r   megatron.core.packed_seq_paramsr   /nemo.collections.multimodal.data.energon.configr	   r
   r   r    r0   r2   r   r   r   r   <module>   s   