o
    پilH                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ eejeej ee dB f Z G dd de
j!j"Z#G dd dZ$G dd dZ%G dd dZ&G dd deZ'dS )    N)product)ListOptionalUnion)Image)
transforms)InterpolationMode)BatchFeatureProcessorMixin
TensorType)Step3VLForConditionalGeneration)StepVLForConditionalGeneration)BaseMultimodalProcessor)MultimodalSpecialTokensc                   @   s,   e Zd Zdeejejf dejfddZ	dS )GPUToTensor	raw_imagereturnc                 C   s   t |tjrt |S |jdkr |d d d d d f dd}tj r+t	d}nt	d}t
||}t|d }|jtjkrO|tjd}|S )N      cudacpu)r   r         )
isinstancer   r   ToTensorndimrepeattorchr   is_availabledevice
from_numpytopermute
contiguousdtypeuint8float32div)selfr   r    image_tensor r+   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/processors/step3_vl.pyforward   s   


zGPUToTensor.forwardN)
__name__
__module____qualname__r   npndarrayr   r   Tensorr-   r+   r+   r+   r,   r      s    $r   c                   @   s    e Zd ZdddZd	ddZdS )
Step3VisionProcessorbicubicNc              	   C   s   g d}g d}|d ur|n|}t t t ||t j||f|dkr%tjntjddg| _|d urRt t t ||t j||f|dkrGtjntjddg| _	d S d | _	d S )Ng3<4'?gwgM?gy{ ?gB91?gwt.?g	U?r5   T)interpolation	antialias)
r   Composer   	NormalizeResizer   BICUBICBILINEAR	transformpatch_transform)r)   sizeinterpolation_mode
patch_sizemeanstdr+   r+   r,   __init__,   s>   
 
zStep3VisionProcessor.__init__Fc                 C   s,   |rd|  |diS d| |diS Npixel_valuesr   )r@   	unsqueezer?   )r)   imageis_patchr+   r+   r,   __call__U   s   zStep3VisionProcessor.__call__)r5   NF)r.   r/   r0   rF   rL   r+   r+   r+   r,   r4   +   s    
)r4   c                   @   sT  e Zd ZdededefddZ	d%deded	eeeef  d
eeeef  dedeeeeeeef  eeef f fddZde	j	de	j	fddZ
dededeeef fddZdededeeef fddZdededefddZde	j	dedededef
ddZdededeeef fd d!Zde	j	dee	j	ee	j	 ee d"B f fd#d$Zd"S )&ImagePatcherlongshortr   c                 C   s6   |dkr|| dkr|S dS || dkrt |dS dS )N  g      ?r        )min)r)   rO   rP   r+   r+   r,   determine_window_size^   s   z"ImagePatcher.determine_window_size333333?widthheightsizesstepsimg_rate_thrc                    s  d|  krdksJ d J dg }t ||D ]\}}|\}	}
|\ ||	kr+dn
t||	  d }fddt|D }t|dkrU|d |	 |krU||	 |d< ||
kr[dn
t||
   d } fddt|D }t|dkr|d |
 |kr||
 |d< tjtt||t	d}|d d ddgf |d d ddgf< |
tj||| gdd	 qtj|dd	}d
d |D ||ffS )Nr   r   z$The `img_rate_thr` should lie in 0~1c                       g | ]} | qS r+   r+   .0i)step_wr+   r,   
<listcomp>s       z-ImagePatcher.slide_window.<locals>.<listcomp>r   c                    r\   r+   r+   r]   )step_hr+   r,   ra   x   rb   r%   )axisc              	   S   sH   g | ] }t |d  t |d t |d |d   t |d |d  fqS )r   r   r   r   int)r^   boxr+   r+   r,   ra      s    :)zipmathceilrangelenr1   arraylistr   rg   appendconcatenate)r)   rW   rX   rY   rZ   r[   windowsrA   stepsize_wsize_hx_numx_starty_numy_startstartr+   )rc   r`   r,   slide_windowc   s,   $""$zImagePatcher.slide_windowimgc                 C   sD   |j \}}||kr|S t||}t|j||fd}||d |S )Nr   r   r   )rA   maxr   newmodepaste)r)   r|   whrA   paddedr+   r+   r,   
square_pad   s   

zImagePatcher.square_pad	img_width
img_heightc                 C   s@   || }t ||dk r|dks|dk rt||}||fS ||fS )N    rR   g      ?)rT   r~   )r)   r   r   rationew_sizer+   r+   r,   get_image_size_for_padding   s
   
z'ImagePatcher.get_image_size_for_paddingc                 C   sD   t ||dkrdt || }t|| }t|| }||fS ||fS )Ni  )r~   rg   )r)   r   r   scale_factorr+   r+   r,   get_image_size_for_preprocess   s   z*ImagePatcher.get_image_size_for_preprocesswindow_sizec           
      C   s   || }|| }|dk r|}n|||  }|dkrt |d nt |}|| }|dk r.|}n|||  }	|	dkr>t |d nt |}|| }t |t |fS )Nr   g?rf   )
r)   r   r   r   w_ratioh_ratio	width_new	decimal_w
height_new	decimal_hr+   r+   r,   get_image_size_for_crop   s   z$ImagePatcher.get_image_size_for_cropr_   jthtwc                 C   s   | |||| || f}|S N)crop)r)   r|   r_   r   r   r   targetr+   r+   r,   
patch_crop   s   zImagePatcher.patch_cropc                 C   s   |  ||\}}| ||\}}| t||t||}|dkr"dS | |||\}}| ||||fg||fg\}\}}t|d | d }t|dkrYt|| dkrY|d8 }t||fS )Nr   r}   r   )r   r   rU   r~   rT   r   r{   rm   )r)   r   r   r   center_listrv   rx   	full_rowsr+   r+   r,   get_num_patches   s,   zImagePatcher.get_num_patchesNc                    s  |j \}}| ||\}}||ks||kr| |}|j \}}| ||\}}|||ftjj}| t	||t
||}|dkrF|g d fS | |||\}}||f||fkrb|||ftjj}n|}g }g  | ||||fg||fg\}	\}
}t|	D ]%\}}|\}}}}| |||||}|| |d |
 dkr | q~ r d t|d kr   ||t|dkrʇ fddtt|D fS d fS )Nr   r   r   c                    s   g | ]}| v qS r+   r+   r]   newlinesr+   r,   ra   
  rb   z)ImagePatcher.__call__.<locals>.<listcomp>)rA   r   r   r   resizer   
Resamplingr>   rU   r~   rT   r   r{   	enumerater   rp   rm   poprl   )r)   r|   r   r   new_img_widthnew_img_heightr   img_for_croppatchesr   rv   rx   patch_idcenter_lf_pointxypatch_wpatch_h	big_patchr+   r   r,   rL      s`   





zImagePatcher.__call__)rV   )r.   r/   r0   rg   rU   ro   tuplefloatr{   r   r   r   r   r   r   r   boolrL   r+   r+   r+   r,   rN   \   sZ    "
#	

	


 rN   c                       sr  e Zd Z		d$ fddZedefddZdededefd	d
Zdee	j	 dee
 fddZ	d%dee	j	 dedeej fddZdedee dB deeee f fddZdedeeee f fddZdededeee  deeee f fddZdededee defdd Z			d&deeeee f  deee	j	ee	j	 f  d!eeeef  defd"d#Z  ZS )'Step3VLProcessorr   Nc                    s~   t    || _t|tr|j}|| _d| _d| _t| jd| j| _	d| _
d| _d| _| j| j
 | _| j| j | _t | _d S )NrQ   rS   bilinear   Q   
<im_patch>)superrF   configr   r
   	tokenizer
image_sizerC   r4   image_preprocessornum_image_feature_sizenum_patch_feature_sizeimage_tokenimage_feature_placeholderpatch_feature_placeholderrN   patcher)r)   r   r   	__class__r+   r,   rF     s    


zStep3VLProcessor.__init__c                 C   s   | j  | j S r   )r   	get_vocabr   )r)   r+   r+   r,   image_token_id,  s   zStep3VLProcessor.image_token_idr   r   c                 C   s.   | j ||\}}|| jd  | j d | S )Nr   )r   r   r   r   )r)   r   r   num_patchesnum_newlinesr+   r+   r,   get_num_image_tokens0  s   z%Step3VLProcessor.get_num_image_tokensimagesc                 C   s"   g }|D ]
}| | | q|S r   )rp   r   )r)   r   resultr|   r+   r+   r,   _split_images:  s   zStep3VLProcessor._split_imagesFrK   c                    s    fdd|D S )Nc                    s   g | ]}j | d d qS )rK   rH   )r   )r^   r|   rK   r)   r+   r,   ra   E  s    zDStep3VLProcessor._convert_images_to_pixel_values.<locals>.<listcomp>r+   )r)   r   rK   r+   r   r,   _convert_images_to_pixel_values@  s   z0Step3VLProcessor._convert_images_to_pixel_valuesr   patch_newline_maskc                 C   s   d}g }t |D ]>}t||ksJ |d| j d7 }|| jdg| jg| j  | jdg  |rF|| rF|d7 }|| jd q||fS )N z<patch_start>z<patch_end>z<patch_newline>)	rl   rm   r   extendr   convert_tokens_to_idsr   r   rp   )r)   r   r   text	token_idsr_   r+   r+   r,   _get_patch_replJ  s&   
z Step3VLProcessor._get_patch_repl
num_imagesc                 C   sH   d| j  d}| jdg| jg| j  | jdg }|| || fS )Nz
<im_start>z<im_end>)r   r   r   r   r   )r)   r   r   r   r+   r+   r,   _get_image_repl`  s   z Step3VLProcessor._get_image_replpatch_new_line_idxc                 C   s@   |dkr|  ||\}}nd}g }| |\}}|| || fS )Nr   r   )r   r   )r)   r   r   r   
patch_replpatch_repl_ids
image_replimage_repl_idsr+   r+   r,   _get_image_repl_featuresl  s   
z)Step3VLProcessor._get_image_repl_featuresr   placeholderreplsc                 C   sh   | |}t|d t|krtd|d g}t|D ]\}}|| |||d   qd|S )Nr   zEThe number of placeholders does not match the number of replacements.r   r   )splitrm   
ValueErrorr   rp   join)r)   r   r   r   partsr   r_   replr+   r+   r,   replace_placeholder|  s   



z$Step3VLProcessor.replace_placeholderreturn_tensorsc                    sn  |d u rg }t |ts|g}|d u rg }t |ts|g}t|dkr*i }|}n|}g }	g }
g }g  g }g }|D ]C\}}}|	|g t|dkr[|
j|dd |t| dt||\}} | || |d ur|| q=t	
|	|d}|
rt	
|
|d< |rt	j|t	jd|d<  fd	d
|D }|}ti |||dS )Nr   Tr   r   )rH   r   patch_pixel_valuesrd   r   c                    s   g | ]
} |j qS r+   )r   r   )r^   timage_repl_str_lstr)   r+   r,   ra     s    z-Step3VLProcessor.__call__.<locals>.<listcomp>)tensor_type)r   ro   rm   r   r   r   r   rp   r   r   cattensorr   r	   )r)   r   r   r   argskwargsimage_inputstext_inputssplitted_images_datapixel_values_lstpatch_pixel_values_lstpatch_newline_mask_lstimage_repl_ids_lstr   raw_imgimg_patchesr   image_repl_strr   r+   r   r,   rL     st   








zStep3VLProcessor.__call__)r   NrM   )NNN)r.   r/   r0   rF   propertyrg   r   r   ro   r   ImageWithPatchesr   r   r   r3   r   r   strr   r   r   r   r   r   r   r	   rL   __classcell__r+   r+   r   r,   r     sd    
	






r   c                       sZ   e Zd ZeegZ fddZdd Zdd Zde	e
eef  dee	e B fd	d
Z  ZS )Step3VLImageProcessorc           	         sv   t ||}t j|||g|R i | d| _| jj | j | _t| j| jt	
dd|| _g d}g d}d S )Nr   z(?:<im_patch>))r   r   image_token_regexr6   r7   )r   r   rF   IM_TOKEN
_processorr   r   IM_TOKEN_IDr   recompilebuild	mm_tokens)	r)   	hf_configserver_argsr  r   r   	processorrD   rE   r   r+   r,   rF     s   
zStep3VLImageProcessor.__init__c                 C   s   d|  |diS rG   )r?   rI   r)   rJ   r+   r+   r,   
preprocess  s   z Step3VLImageProcessor.preprocessc                 C   s
   |  |S r   )r  r  r+   r+   r,   rL     s   
zStep3VLImageProcessor.__call__
image_data
input_textc           
         s@   | j |||j| jd}| || j\}}}	| || jjdS )N)promptr  
video_datamultimodal_tokens)	input_idsmm_itemsim_token_id)load_mm_datar  r  process_and_combine_mm_datatolistr   )
r)   r  r  request_objr   r   base_outputr  r  retr+   r+   r,   process_mm_data_async  s   
z+Step3VLImageProcessor.process_mm_data_async)r.   r/   r0   r   r   modelsrF   r  rL   r   r   r   bytesrg   r  r   r+   r+   r   r,   r     s    
r   )(rj   r  	itertoolsr   typingr   r   r   numpyr1   r   PILr   torchvisionr   torchvision.transformsr   transformersr	   r
   r   sglang.srt.models.step3_vlr   sglang.srt.models.step3_vl_10br   /sglang.srt.multimodal.processors.base_processorr   SGLangBaseProcessorr   r   ro   rg   r   nnModuler   r4   rN   r   r   r+   r+   r+   r,   <module>   s,    1 6 J