o
    }oi                     @   s   d dl mZmZ d dlmZmZ d dlZd dlm  m	Z
 d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d	ed
efddZeG dd dZG dd deZdS )    )	dataclassfield)DictListN)	VQASample
batch_listbatch_pad_stack)pad_sequence)SampleEncoder)MultiModalTaskEncoder)Llama3SampleEncoderLlamaImageTextSample
seq_lengthpadding_valuec                 C   sL   |  d|k r||  d }tj| d|f|d} | S | d d d |f } | S )N   r   )value)sizeFpad)sequence_batchr   r   pad_size r   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/mllama/data/task_encoder.pypad_or_truncate   s   r   c                   @   s   e Zd ZU eedZee ed< edd dZ	e
jed< edd dZe
jed< edd dZe
jed	< ed
d dZe
jed< edd dZe
jed< edd dZe
jed< edd dZe
jed< edd dZe
jed< dS )LlamaImageTextRawBatch)default_factory__keys__c                   C      t jdt jdS Nr   )dtypetorchemptylongr   r   r   r   <lambda>-       zLlamaImageTextRawBatch.<lambda>tokensc                   C   r   r   r    r   r   r   r   r$   .   r%   labelsc                   C   r   r   r!   r"   floatr   r   r   r   r$   /   r%   	loss_maskc                   C   
   t dS Nr   r!   r"   r   r   r   r   r$   1      
 batch_imagesc                   C   r+   r,   r-   r   r   r   r   r$   2   r.   batch_masksc                   C   r   r   r(   r   r   r   r   r$   4   r%   aspect_ratio_idsc                   C   r   r   r(   r   r   r   r   r$   5   r%   aspect_ratio_maskc                   C   r   r   r(   r   r   r   r   r$   6   r%   
num_chunksN)__name__
__module____qualname__r   listr   r   str__annotations__r&   r!   Tensorr'   r*   r/   r0   r1   r2   r3   r   r   r   r   r   )   s   
 r   c                       s4   e Zd Zd fdd	Zdee defddZ  ZS )	LlamaTaskEncoderNc                    s6   t  ||| tjt|||i| _|| _|j| _d S N)	super__init__r   r4   r   encodersr   ignore_place_holderignore_index)self	tokenizerimage_processormultimodal_sample_configr   	__class__r   r   r>   :   s
   zLlamaTaskEncoder.__init__samplesreturnc                 C   sx  g g g g g g f\}}}}}}g g g }}	}
|D ]8}| |j | |j | |j | |j | |j | |j | |j |	 |j |
 |j	 qt
|}t|}t|dd}t|d| jd}t|}| jd urt| j}n|dd d d d }t||d}t||| j}t||d}| dksJ dt|}t|}t|	}t|
}t|||||||||d	S )	NT)batch_first)rJ   r   r   @   r   z;This batch has nothing to predict! Will trigger a nan loss.)	r   r/   r0   r&   r'   r*   r1   r2   r3   )append__key__imagesr&   r'   r*   vision_maskr1   r2   	num_tilesr   r   r	   rA   r   r   r   sumr!   tensorr   )rB   rH   keysrN   r&   r'   r*   rO   r1   r2   rP   sample
batch_keysr/   batch_tokensbatch_labelsbatch_loss_maskr   batch_vision_maskbatch_aspect_ratio_idsbatch_aspect_ratio_maskbatch_num_tilesr   r   r   batchB   sN   

zLlamaTaskEncoder.batchr<   )	r4   r5   r6   r>   r   r   r   r]   __classcell__r   r   rF   r   r;   9   s    r;   )dataclassesr   r   typingr   r   r!   torch.nn.functionalnn
functionalr   megatron.energonr   r   r   torch.nn.utils.rnnr	   7nemo.collections.multimodal.data.energon.sample_encoderr
   5nemo.collections.multimodal.data.energon.task_encoderr   /nemo.collections.vlm.mllama.data.sample_encoderr   r   intr   r   r;   r   r   r   r   <module>   s   