o
    }oiI                     @   s\  d dl Z d dlZd dlZd dlZd dlmZ d dl mZ d dlmZm	Z	 d dl
Zd dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZ d dl m!Z!m"Z" d dl#m$Z$ eG dd deZ%eG dd dZ&eG dd deZ'd de(de(de(fddZ)de*de%fddZ+G dd dee%e&e'e*f Z,dS )!    N)defaultdict)	dataclass)DictList)BatchDefaultTaskEncoder)Sample)Cookerbasic_sample_keys)Image)get_ltor_masks_and_position_ids)HF_IMAGE_TOKEN_INDEXHF_VIDEO_TOKEN_INDEXIGNORE_INDEXIMAGE_TOKEN_INDEXPAD_TOKEN_INDEXVIDEO_TOKEN_INDEX)find_pattern_indicesprocess_vision)loggingc                   @   s@   e Zd ZU dZeej ed< eeje	ej B  ed< e
ed< dS )ChatMLSamplezIntermediate Sample FormatimgsvideosconversationN)__name__
__module____qualname____doc__r   r   __annotations__torchTensorliststr r#   r#   b/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/data/task_encoder.pyr   +   s
   
 r   c                   @   s   e Zd ZU dZeed< eed< eej	 ed< eej	 ed< eej	 ed< eej	 ed< ej	ed< ej	ed	< ej	ed
< ej	ed< dS )Qwen2VLTaskSamplez!Encoded Sample Format For Qwen2VL__key____subflavors__r   r   image_thw_gridsvideo_thw_gridsimage_input_maskvideo_input_masktexttargetN)
r   r   r   r   r"   r   r   r   r   r    r#   r#   r#   r$   r%   6   s   
 


r%   c                   @   s   e Zd ZU dZee ed< ee ed< ej	ed< ej	ed< ej	ed< ej	ed< ej	ed< ej	ed	< ej	ed
< ej	ed< ej	ed< dS )Qwen2VLTaskBatchz Encoded Batch Format For Qwen2VL__keys__r'   pixel_valuespixel_values_videosimage_grid_thwvideo_grid_thwr*   r+   	input_idslabels	loss_maskN)
r   r   r   r   r   r"   r   r   r   r    r#   r#   r#   r$   r.   H   s   
 







r.   <image><video>
user_inputimage_patternvideo_patternc           
      C   s
  dj ||d}g }d}tt}t|| D ]V}| \}}	||kr0|d| || dd |d|j||	 dd	 |j||	 dd	 t	||j||	 dd	  i |	}||j||	 dd	   d7  < q|t
| k r|d| |t
|  dd |S )
z7Split user input into format Qwen2VL tokenizer accepts.z({image}|{video}))imagevideor   r,    )typer,   r?      )formatr   intrefinditerspanappendstripstringr"   len)
r9   r:   r;   patterncontentscurmm_idxmatchedstartendr#   r#   r$   convert_to_qwen2vl_content\   s$   0$"rR   samplereturnc                 C   s   |  dd}|r"t|}t|tr t|dkr dd |D }nd}|  dd}|rDt|}t|trBt|dkrBdd |D }nd}d| d	 v rS|du rStd
 d| d	 v rb|du rbtd tdi t	| ||| d	 d}|S )z
    Convert crude sampel to ChatMLSample.

    Args:
        sample: Crude sample in pickle serialized format

    Returns:
        sample in ChatMLSample format
    jpgsNr   c                 S   s   g | ]}t |qS r#   )r   	fromarray.0dr#   r#   r$   
<listcomp>   s    z&cook_chatml_sample.<locals>.<listcomp>r   c                 S   s   g | ]	}d d |D qS )c                 S      g | ]}|qS r#   r#   rW   r#   r#   r$   rZ          z1cook_chatml_sample.<locals>.<listcomp>.<listcomp>r#   rX   r=   r#   r#   r$   rZ      s    r7   jsonz.<image> in conversation text but no image datar8   z.<video> in conversation text but no video data)r   r   r   r#   )
getpickleloads
isinstancer!   rJ   r   warningr   r
   )rS   r   r   chat_sampler#   r#   r$   cook_chatml_samplex   s0   





re   c                	       s|   e Zd ZdZeegZ				ddedededef fd	d
Zde	fddZ
dee defddZdedefddZ  ZS )Qwen2VLTaskEncoderz%A simple task encoder for captioning.         temporal_patch_sizespatial_merge_size
patch_sizemax_padding_lengthc                    sH   t    |j| _|| _|| _|| _|| _|| _|| _	t
t| _| _d S N)super__init__	tokenizerhf_tokenizerimage_processor
seq_lengthrj   
merge_sizerl   seq_lenr   r   image_token_idvideo_token_id)selfrq   rs   rj   rk   rl   rm   	__class__r#   r$   rp      s   
	zQwen2VLTaskEncoder.__init__rS   c           *         sn  t | j|j|j}|d }|d }|d }|d }t|jttfr't	|jn|j}d|d v }d|d v r8dnd}	d|d v rBdnd	}
g }t
|d
 dkrW|ddd n|d|d |
 d |dd }|rddg}t|D ]:\}}||	 }|||t
|  krtd| d|  ||
 }|dkrd}t|}n|dkrd}|||d qsn:ddg}t|D ]1\}}||	 }|||t
|  krtd| d|  ||
 }|dkrt|}|||d q|}| jj|dddd }| jj  fddtt
|D }d}t|dd D ]9\}}|d dkrI|d	 d d }| jj|dd}t|||\}}|dks=J d ||| |||< |}q| jd
 }| j| j}}t||kd }|dur|durt
|t
|ksJ d!t
| d"t
| d#t||kd }|dur|durt
|t
|ksJ d!t
| d$t
| d%|dur|durtj|tjd&tj|tjd&}}|jd |jd  |jd'd( |  |jd  |jd'd( |  }nI|durtj|tjd&}|jd |jd  |jd'd( |  }n'|dur0tj|tjd&}|jd |jd  |jd'd( |  }n|jd }|| j krDtd)| d* tj!||j"d&}|# }d+\} }!t$t%||g}"d+\}#}$|"D ]d}%||% }&|&|kr|||   | }'| d7 } n|&|kr||!  | }'|!d7 }!||#|% ||$|$|% |# < ||#|% ||$|$|% |# < |$|%|# 7 }$|&||$|$|' <  ||$|$|' < |$|'7 }$|%d }#qd|#t
|k r||#d ||$d< ||#d ||$d< tj&|d'd,} |d'< | k' rtd- t()||k}(t()||k})t*|j+|j,|r|d. ng |r|d/ ng |r#|ng |r)|ng |(|)t()|t()|d0
S )1z
        Encode sample to meet training requirement.

        Args:
            sample.imgs: list[PIL.Image.Image]
            sample.videos: list[Tensor]

        Returns:
            sample with necessary fields
        r2   r3   image_inputsvideo_inputsfromr   rolevaluecontentrg   systemzYou are a helpful assistant.)r   r   r@   NhumangptzSExpect conversation organized in order: [sys] human gpt human gpt...,but got role 'z
' in turn user	assistantz^Expect conversation organized in order: [sys] user assistant user assistant..., but got role 'Tnp)tokenizereturn_tensorsc                    s   g | ]} qS r#   r#   )rX   _pad_token_idr#   r$   rZ     r\   z4Qwen2VLTaskEncoder.encode_sample.<locals>.<listcomp>z
<|im_end|>
F)add_special_tokensz'Not found valid answer in conversation.zWith z images in the sample, but z image placeholders!z videos in the sample, but z video placeholders!dtyperA   )axiszLong sequence with length z found, dropped...)r   r   )shiftz&Sample with all masked label, dropped.r0   r1   )
r&   r'   r   r   r(   r)   r*   r+   r,   r-   )-r   rs   r   r   rb   r   r"   bytesr^   ra   rJ   rG   	enumerater   rc   rR   rr   apply_chat_templater   rangeencoder   ru   rw   rx   r   wherearrayint64shapeprodsumrv   zerosr   copysortconcatenaterollallr   
from_numpyr%   r&   r'   )*ry   rS   processed_visionr(   r)   flattened_imgsflattened_videosr   _from_system_role_keycontent_keyconverted_conversationEXPECTED_ROLEturn_idxturnr   r   r4   r-   search_start_indexansweranswer_tokensanswer_start
answer_endmerge_lengthrw   rx   image_token_indicesvideo_token_indicestarget_lengthfinal_input_idsfinal_input_masks	image_idx	video_idxindicescur_xcur_yidxtoken_idsizer*   r+   r#   r   r$   encode_sample   s$   





&
&




z Qwen2VLTaskEncoder.encode_samplesamplesrT   c                 C   s`  g g }}|D ]7}t |jdkr*dd |jdD }tdd |D }|| t |jdkr>dd |jD }|| qg g }}	|D ]7}t |jdkridd |jdD }
tdd |
D }|| t |j	dkr}dd |j	D }|	| qFt
d	d
 |D }|| jkrtd tjt ||f| jjtjd}tjt ||f| jjtjd}tj|td}tj|td}t|D ]_\}}t|t |j}t|t |j}t|jd| ||d|f< |jdurt|jd| ||d|f< |jdurt|jd| ||d|f< t|jd| ||d|f< qt|}t||| jk< t||| j k< d||t!k< t|}t"||t!k< t#|| jj$dddd\}}}d||dk < t%dd |D dd |D t |dkrrt&|ndt |dkrt&|ndt |dkrtt|ndt |	dkrtt|	ndt|t||||d}|S )z
        Put encoded sample into Batch, do padding, add labels and visual input masks

        Args:
            samples: List of encoded samples

        Returns:
            Batch with necessary fields
        r   c                 S   r[   r#   r#   rX   imgr#   r#   r$   rZ   ~  r\   z,Qwen2VLTaskEncoder.batch.<locals>.<listcomp>c                 S   r[   r#   r#   r   r#   r#   r$   rZ     r\   c                 S   r[   r#   r#   rX   	thw_gridsr#   r#   r$   rZ     r\   c                 S   r[   r#   r#   r]   r#   r#   r$   rZ     r\   c                 S   r[   r#   r#   r]   r#   r#   r$   rZ     r\   c                 S   r[   r#   r#   r   r#   r#   r$   rZ     r\   c                 s   s    | ]}t |jV  qd S rn   )rJ   r,   rX   sr#   r#   r$   	<genexpr>  s    z+Qwen2VLTaskEncoder.batch.<locals>.<genexpr>z0max sequence length larger than passed parameterr   NF)data	eod_tokeneod_mask_lossreset_attention_maskreset_position_idsg        c                 S      g | ]}|j qS r#   )r&   r   r#   r#   r$   rZ         c                 S   r   r#   )r'   r   r#   r#   r$   rZ     r   )r/   r'   r0   r1   r2   r3   r*   r+   r4   r5   r6   )'rJ   r   	unsqueezer   catrG   r(   extendr   r)   maxrv   r   rc   r   fullrr   r   r   
zeros_likeboolr   minr,   r-   r   r*   r+   r   r   rw   r   rx   r   r   r   eos_token_idr.   vstack)ry   r   r   r(   r   s_imgscat_imgss_image_thw_gridsr   r)   s_videos
cat_videoss_video_thw_gridsmax_seq_lentext_mat
target_matimage_input_masksvideo_input_masksitext_len
target_lentokensr5   attention_maskr6   position_idsbatchr#   r#   r$   r   q  s   








 
  "

  zQwen2VLTaskEncoder.batchr   c                 C   s   t |}|d= |S )zEncode batch in dictr'   )dataclassesasdict)ry   r   rawr#   r#   r$   encode_batch  s   
zQwen2VLTaskEncoder.encode_batch)rg   rg   rh   ri   )r   r   r   r   r	   re   cookersrC   rp   r   r   r   r%   r.   r   dictr   __classcell__r#   r#   rz   r$   rf      s*     7Yrf   )r7   r8   )-r   r^   r`   rD   collectionsr   r   typingr   r   numpyr   r   megatron.energonr   r   %megatron.energon.flavors.base_datasetr   %megatron.energon.task_encoder.cookingr	   r
   PILr   2nemo.collections.nlp.modules.common.megatron.utilsr   3nemo.collections.vlm.qwen2vl.data.multimodal_tokensr   r   r   r   r   r   +nemo.collections.vlm.qwen2vl.data.preloadedr   r   
nemo.utilsr   r   r%   r.   r"   rR   r   re   rf   r#   r#   r#   r$   <module>   s4    
 &