o
    پiV                     @   s   d dl Z d dlmZ d dlmZ d dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ e eZG d
d deZdS )    N)	lru_cache)List)VideoReadercpugpu)Image)ModalityMultimodalDataItem) InternS1ForConditionalGeneration)InternVLChatModel)BaseMultimodalProcessorMultimodalSpecialTokensc                
       s
  e Zd ZeegZg dZg dZdZdZ	dZ
dZdZdZd	Zd
ZdZdZdZeedddejfddZ fddZededfddZededefddZdedededefddZd edefd!d"Zd#ed$ed%ed&edef
d'd(Z d)d* Z!d+d, Z"d-d. Z#  Z$S )/InternVLProcessor)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?          Fi      z<image>z<video>z<img>z</img>z<IMG_CONTEXT>)maxsizecudac                 C   s@   t jtj| |dddd}t jtj| |dddd}||fS )N)devicedtyper   )torchtensorr   IMAGENET_MEANviewIMAGENET_STD)r   r   meanstd r   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/internvl.py_get_normalize_tensors-   s   

z(InternVLProcessor._get_normalize_tensorsc                    s  t  j|||g|R i | t|dd p|jj}|jj}t|tr&|d }t|tr/|d }t| j	dr:| j	j
}n| j	}|| _
t|dd pMt|dd pM|}	t|	dg pVd gd }
|
| _ddddd	}||
d | _| jrs|| jnd | _| jr|| jnd | _t|| d
 |jd
  | _|| j| _|| j| _t| j| j| j| jd|| _|| j| _t| j| jd|| _t|dd pt|dd pt|dd pt|	dd p| j | _!d S )Nforce_image_sizer   	tokenizer
llm_configtext_configarchitecturesz<|video_pad|>z<|reserved_200000|>)Qwen2ForCausalLMQwen3ForCausalLMQwen3MoeForCausalLMGptOssForCausalLM   )image_tokenimage_token_idvideo_tokenvideo_token_id)r,   r-   context_lengthmax_context_lenmax_position_embeddings)"super__init__getattrvision_config
image_size
patch_size
isinstancelisthasattr
_processorr#   llm_archgetVIDEO_CONTEXT_TOKENconvert_tokens_to_idsr/   IMG_CONTEXTr-   intdownsample_rationum_image_token	IMG_STARTimg_start_token_idIMG_ENDimg_end_token_idr   IMAGE_PLACEHOLDER_TOKENVIDEO_PLACEHOLDER_TOKENbuild	mm_tokensimg_context_token_idmm_tokens_internlm2CONTEXT_FALLBACKr1   )self	hf_configserver_args_image_processorargskwargsr7   r8   r#   text_cfgr=   video_token_map	__class__r   r    r4   8   s~   






zInternVLProcessor.__init__  c                    s  | j \}}}|| }t fddtd d D }t|dd d}td}	d}
|D ]1\}}|| }t|| }|| }|
d	 |
d  }||	k rO|}	||f}
q*||	kr[||kr[||f}
q*||
d	  ||
d  }}|
d	 |
d  }tjjj	| 
d	||fd
ddd	}g }t|D ](}||
d	  | }||
d	  | }|d d ||| ||| f }|| q|rt|dkrtjjj	| 
d	||fd
ddd	}|| t|tjS )Nc                 3   sL    | ]!}t d |d  D ]}t d |d  D ]}||  kr||fV  qqqdS )r   N)range).0nijmax_numr   r    	<genexpr>   s    
z7InternVLProcessor.dynamic_preprocess.<locals>.<genexpr>r   c                 S   s   | d | d  S Nr   r   r   )xr   r   r    <lambda>   s    z6InternVLProcessor.dynamic_preprocess.<locals>.<lambda>)keyinf)r   r   r   bicubicF)sizemodealign_corners)shapesetr[   sortedfloatabsr   nn
functionalinterpolate	unsqueezesqueezeappendlenstacktobfloat16)r   r7   ra   use_thumbnailCHWaspect_ratiotarget_ratiosbest_ratio_diff
best_ratiord   y	target_ardiffblocksbest_blockstarget_wtarget_hresizedtilesr^   tilethumbr   r`   r    dynamic_preprocess   s\   
"
z$InternVLProcessor.dynamic_preprocesspathreturnc              
   C   s\   z
t | tdddW S  ttfy- } ztd| t | tdddW  Y d }~S d }~ww )Nr   r   )ctxnum_threadsz;[internvl] VideoReader gpu decode failed (%s), fallback CPU)r   r   RuntimeErrorOSErrorloggerwarningr   )r   er   r   r    _open_video_reader   s   z$InternVLProcessor._open_video_readerpromptplaceholderwantc           	      C   s   |dkr|S |p	d |}|| }|dkr|S dd|g|  d }d}|p(d|}|dkrB|p3dd | | |p<d|d   S |pEd| S )Nr    
z<|im_start|>assistantr   )countjoinrfind)	rP   r   r   r   havemissinginsertmarkeridxr   r   r    %_ensure_placeholders_before_assistant   s   $z7InternVLProcessor._ensure_placeholders_before_assistanttextc                 C   s:   z| j |ddd  }t| W S  ty   Y dS w )Nptreturn_tensors	input_idsr   )r#   flattenrB   numel	Exception)rP   r   idsr   r   r    
_token_len   s   zInternVLProcessor._token_len	requested
num_videostext_lenimage_tile_cntc          	      C   s   |dkrdS | j r| jsdS || j }t| jt| t| t| j }|dkr+dS td|| j }td|t|d }tdtt|t|S rc   )r?   r/   rD   rB   r1   CONTEXT_RESERVEDmaxmin)	rP   r   r   r   r   image_tokensbudgetmax_total_framesframes_per_videor   r   r    _resolve_video_num_frames   s$   
z+InternVLProcessor._resolve_video_num_framesc                    sL   | j dk}|r| jd|||d|I d H S | jd|||d|I d H S )NInternLM2ForCausalLM)
image_data
input_textrequest_objr   )r=   process_internlm2_mm_data_asyncprocess_qwen_mm_data_async)rP   r   r   r   rU   is_internlm2r   r   r    process_mm_data_async  s"   
z'InternVLProcessor.process_mm_data_asyncc           4         s  t |dd pt |dd p|dp|dp| j}tdt|}t |dd p9t |dd p9|dp9|dp9| j}tdt|}|pDd}t |dd pLg }|rY| || jt|}|re| || j	t|}t
d|| j|| j	 | j|||| jdd	}	t
d
t|	jt|	j | jdd\}
}g }g }|	jD ]C}t|tjrt|d}t|ddd  d }n| }||
 | }| j|d|dd}|| |t|jd  q|r|std|rtj |ddnd }d }g }g }t|d| j!}| j"|t|	j| #|	j$p||rtt%|ndd}|	jr|dkr| j&d ur|	jD ]}t|t'r9|n| (t)|}t|d }|dkrNdgn
tj*d||td+ }g }g }|D ]G}|t| }t,|drq|- nt|}t|ddd  d } | |
 | } | j| d|| j.d}|| |t|jd  q_tj |dd}!||! || q/|rtj |ddnd }d}"d}#|	j$p|}$|$/| j|"}$|$/| j0|"}$| j1r| j&d ur|$/| j	|#}$n|$/| j	d}$|$}%|D ]}&| j2| j0| j3t|&   | j4 }'|%/|"|'d}%q|r^| j1r^|D ]>}(g })t5|(D ]&\}*}+t| j3t|+ },| j2| j1|,  | j4 }-|)d|*d  d|-  q'd6|)d }.|%/|#|.d}%q| j7|%ddd  8 }/|/+ }0g }1|d ur| j9|/:d| j;d!}1g }2|d ur| j&d ur| j9|/:d| j&d!}2g }3|d ur|3t<|t=j>|1d" |d ur|3t<|t=j?|2d" |0|3| j@| jA| j;| j&d#S )$Nimage_max_dynamic_patchmax_dynamic_patchr   video_max_dynamic_patchr   
video_dataz/[internvl][qwen] placeholders image=%d video=%dT)r   r   r   multimodal_tokensdiscard_alpha_channelz+[internvl][qwen] loaded images=%d videos=%dr   r   RGBr+   r        o@rZ   r7   ra   r{   zR[internvl][qwen] image_data provided but no images parsed from prompt placeholdersdimvideo_num_frames)r   r   r   r   )numr   asnumpyz<<<__IMG_PLACEHOLDER__>>>z<<<__VID_PLACEHOLDER__>>>zFrame z: r   r   r   r   r   mm_token_idfeaturemodalityoffsetsr   mm_itemsim_start_id	im_end_idim_token_idr/   )Br5   r>   IMAGE_MAX_NUMr   rB   VIDEO_MAX_NUMr   rI   rw   rJ   r   infor   load_mm_datarL   imagesvideosr!   r9   r   nparrayconvertr   
from_numpypermuter   ro   r   rv   rl   
ValueErrorcatDEFAULT_VIDEO_NUM_FRAMESr   r   r   sumr/   r   r   strlinspacetolistr;   r   VIDEO_USE_THUMBNAILreplacerA   r?   rE   rD   rG   	enumerater   r#   r   get_mm_items_offsetry   rM   r	   r   IMAGEVIDEOrF   rH   )4rP   r   r   r   rU   img_max_numvid_max_numr   r   base_outputr   r   num_patches_listpixel_values_listimageimg_npr   r   image_tensorvideo_tensorvideo_patch_listsvideo_pixel_valuesrequested_frames
num_framesvideovr	max_frameframe_indicesper_video_tilesper_video_patch_cntfiframeframe_tpvimg_phvid_phinput_text_midinput_text_updatednum_patchesr   frame_patch_listframe_linesr^   	patch_cntctx_cntframe_tokensvideo_tokensinput_ids_tensorr   image_offsetsvideo_offsetsitemsr   r   r    r     sv  












z,InternVLProcessor.process_qwen_mm_data_asyncc                    s  |pd}t |dd pg }|rtd || j| j}|r(| || jt|}td|	| j | j
||| jdd}| jdd\}}	g }
g }|jD ]C}t|tjrlt|d	}t|d
dd  d }n| }|| |	 }| j|dddd}|| |
t|jd  qK|r|std|rtj|ddnd }d}|jp|| j|}|}|
D ]}| j| j| j t|   | j! }|||d}q| j"|ddd # }|$ }g }|d ur| j%|&d| j'd}g }|d ur|t(|t)j*|d ||| j+| j,| j'| j-dS )Nr   r   z>[internvl][internlm2] video input ignored for InternLM2 branchz1[internvl][internlm2] placeholders img_context=%dT)r   r   r   r   r   r   r   r+   r   r   r   rZ   r   r   zW[internvl][internlm2] image_data provided but no images parsed from prompt placeholdersr   z!<<<__IMG_CONTEXT_PLACEHOLDER__>>>r   r   r   r   r   r   ).r5   r   r   r   rI   rA   r   rw   r   r   r   rN   r!   r   r9   r   r   r   r   r   r   r   r   ro   r   rv   rB   rl   r   r   r   rE   rD   rG   r#   r   r   r   ry   rM   r	   r   r   rF   rH   r/   )rP   r   r   r   rU   r   r   r   r   r   r   r   r   r   r   r   pixel_valuesphinput_text_baser  r  r   r  r   r  r  r   r   r    r     s   



z1InternVLProcessor.process_internlm2_mm_data_async)%__name__
__module____qualname__r   r
   modelsr   r   r   r   r   r   rO   r   rI   rJ   rE   rG   rA   staticmethodr   r   float32r!   r4   r   r   r   r   rB   r   r   r   r   r   r   __classcell__r   r   rX   r    r      sb    	R@	

 hr   )logging	functoolsr   typingr   numpyr   r   decordr   r   r   PILr   "sglang.srt.managers.schedule_batchr   r	   sglang.srt.models.interns1r
   sglang.srt.models.internvlr   /sglang.srt.multimodal.processors.base_processorr   r   	getLoggerr   r   r   r   r   r   r    <module>   s   
