o
    }oi                     @   s   d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ G d	d
 d
e	ZG dd deZdS )    N)field)Dict)	VQASample)ImageTextSampleMultiModalSampleConfig)VQASampleEncoder)create_vision_mask_tensor)loggingc                   @   sr   e Zd ZU dZedd dZejed< edd dZ	ejed< edd dZ
ejed	< ed
d dZejed< dS )LlamaImageTextSamplezLlama Image Text Samplec                   C      t jdt jdS Nr   dtypetorchemptyfloat r   r   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/mllama/data/sample_encoder.py<lambda>       zLlamaImageTextSample.<lambda>)default_factoryvision_maskc                   C   r   r   r   r   r   r   r   r       r   aspect_ratio_idsc                   C   r   r   r   r   r   r   r   r   !   r   aspect_ratio_maskc                   C   r   r   r   r   r   r   r   r   "   r   	num_tilesN)__name__
__module____qualname____doc__r   r   r   Tensor__annotations__r   r   r   r   r   r   r   r
      s   
 r
   c                       s|   e Zd ZdZe f fdd	Zdeeej	f fddZ
ddefd	d
Zdedej	fddZdedefddZdd Z  ZS )Llama3SampleEncoderzMLlama Sample Encoderc                    s   t  ||| |j| _dS )a  
        Initialize the VQASampleEncoder.

        Parameters:
        tokenizer (Tokenizer): The HF tokenizer used for processing text.
        image_processor (ImageProcessor): The HF image processor used for preprocessing images.
        multimodal_sample_config (MultiModalSampleConfig, optional): Configuration object for multimodal samples.
            Defaults to MultiModalSampleConfig().
        N)super__init__conversation_template_config)self	tokenizerimage_processormultimodal_sample_config	__class__r   r   r$   (   s   
zLlama3SampleEncoder.__init__returnc                 C   s   | j j|dd}|S )Npt)return_tensors)r(   
preprocess)r&   image
image_dictr   r   r   process_image5   s   z!Llama3SampleEncoder.process_imageF
input_textc                 C   s  | j jr| j j| j_n
| jjd u rtdtd|j d|j  g }| j jr6|	dd| j jdgd t
|jtr{t
|jtr{tt|jt|j}t|D ](}|	| j jd d|j| dgd |	| j jd	 d|j| dgd qQn3t
|jtrt
|jtr|	| j jd d|jdgd |	| j jd	 d|jdgd ntd
| jj|ddd}td|  |S )NzBoth tokenizer and conversation template does not have chat template defined. Refer to https://huggingface.co/docs/transformers/main/en/chat_templating z$apply_conversation_template context z answer systemtext)typer5   )rolecontentr      zYVQA Sample context/answers should either be a List[str] or str. Other types not supportedF)tokenizeadd_generation_promptz'apply prompt template templated_prompt )r%   chat_templater'   
ValueErrorr	   debugcontextanswersr4   append
isinstancelistminlenrangerolesstrapply_chat_template)r&   r3   	use_plainmessages
min_lengthitemplated_promptr   r   r   apply_prompt_template9   sV   



z)Llama3SampleEncoder.apply_prompt_templatepromptc                 C   s   dd dd | jjfD  d }t||}g }|D ]!}|| jjkr+|| jj qt|dkr<|| j	|ddj
 qtj|tjd	S )
N(|c                 s   s    | ]}t |V  qd S )N)reescape).0tokenr   r   r   	<genexpr>q   s    z/Llama3SampleEncoder.tokenize.<locals>.<genexpr>)r   F)add_special_tokensr   )joinimage_token	token_strrS   splitrA   token_idrE   extendr'   	input_idsr   tensorlong)r&   rP   regex_patternchunkstokenized_chunkschunkr   r   r   r:   p   s   "zLlama3SampleEncoder.tokenizeinput_sampleoutput_samplec           	      C   s   |  |}td|  | |}| ||}|d d  }|dd   }td|  td|  | |}t|| jj	d}| 
|j}|j|_|d d |_|d	 d |_|d
 d |_|d d |_||_||_||_||_|S )Nz9[Energon] task encoder encode_sample conversation_prompt r9   zB[Energon] task encoder encode_sample after tokenize prompt tokens z,[Energon] task encoder encode_sample labels )tokensvision_token_idpixel_valuesr   r   r   r   )rO   r	   r>   r:   compute_labels
contiguouscompute_loss_maskr   r[   r^   r2   r0   __key__imagesr   r   r   rj   labels	loss_maskr   )	r&   rg   rh   conversation_promptrj   rr   rs   r   processed_image_dictr   r   r   encode}   s*   


zLlama3SampleEncoder.encodec                 C   s   ||d u rd S | S )N r   )r&   answerstop_strr   r   r   process_answer_str   s   z&Llama3SampleEncoder.process_answer_str)F)r   r   r   r   r   r$   r   rH   r   r    r2   r   rO   r:   r
   rv   rz   __classcell__r   r   r*   r   r"   %   s    7r"   )rS   dataclassesr   typingr   r   megatron.energonr   /nemo.collections.multimodal.data.energon.configr   r   7nemo.collections.multimodal.data.energon.sample_encoderr   'nemo.collections.vlm.mllama.model.utilsr   
nemo.utilsr	   r
   r"   r   r   r   r   <module>   s   	