o
    
۾ib                     @   s:  U d dl Z d dlmZmZmZ d dlmZ d dl mZmZ d dl	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7 d dl8m9Z9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJ ddlKmLZLmMZMmNZN ddlOmPZPmQZQmRZRmSZS ddlTmUZUmVZV G dd deIZWG d d! d!eIZXeWeXB ZYeeZd"< e[eje\ej e\e] dB f Z^d#Z_e`eZd$< G d%d& d&ZaG d'd( d(ZbG d)d* d*ZcG d+d, d,e>ZdG d-d. d.e<ed ZeG d/d0 d0e=ed Zfd1d2 ZgG d3d4 d4ejhZiG d5d6 d6ejhZjG d7d8 d8ejhZkG d9d: d:ejhZlG d;d< d<ejhZmG d=d> d>ejhZne3joefedeed?G d@dA dAejheMeNZpdS )B    N)IterableMappingSequence)product)ceilsqrt)	AnnotatedAnyLiteral	TypeAlias)Image)
transforms)InterpolationMode)BatchFeaturePretrainedConfig
TensorType)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TokenizerLike)Step3VisionEncoderConfig)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)is_vit_use_data_parallelrun_dp_sharded_vision_modelc                   @   sl   e Zd ZU dZed ed< eeje	ddddf ed< eeje	ddd	d
f ed< eeje	df ed< dS )Step3VLImagePixelInputsa  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
        - bnp: Batch size * number of images * number of patches
        - hp: Height of patch
        - wp: Width of patch
    pixel_valuestypebn   hwbnphpwppatch_pixel_valuesnum_patchesN)
__name__
__module____qualname____doc__r
   __annotations__r   torchTensorr,    rJ   rJ   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/step3_vl.pyr7   <   s   
 r7   c                   @   s<   e Zd ZU dZdZed ed< eej	e
dddf ed< dS )	Step3VLImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    image_embedsr9   r:   fr<   dataN)rC   rD   rE   rF   r9   r
   rG   r   rH   rI   r,   rJ   rJ   rJ   rK   rL   N   s   
 rL   Step3VLImageInputs  MAX_IMAGE_SIZEc                   @   s    e Zd ZdddZd	ddZdS )
Step3VisionProcessorbicubicNc              	   C   s   g d}g d}|d ur|n|}t t  t ||t j||f|dkr&tjntjddg| _|d urTt t  t ||t j||f|dkrItjntjddg| _	d S d | _	d S )N)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?rT   T)interpolation	antialias)
r   ComposeToTensor	NormalizeResizer   BICUBICBILINEAR	transformpatch_transform)selfsizeinterpolation_mode
patch_sizemeanstdrJ   rJ   rK   __init__b   s>   

zStep3VisionProcessor.__init__Fc                 C   s,   |rd|  |diS d| |diS )Nr8   r   )r^   	unsqueezer]   )r_   imageis_patchrJ   rJ   rK   __call__   s   zStep3VisionProcessor.__call__)rT   NF)rC   rD   rE   re   ri   rJ   rJ   rJ   rK   rS   a   s    
%rS   c                   @   sh  e Zd Zd)deddfddZdededefd	d
Z	d*dededeeeef  deeeef  de	deeeeeeef  eeef f fddZ
dejdejfddZdededeeef fddZdededeeef fddZdededefddZdejded ed!ed"ef
d#d$Zdededeeef fd%d&Zdejdeejeej ee dB f fd'd(ZdS )+ImagePatcherTenable_patchreturnNc                 C   s
   || _ d S Nrl   )r_   rl   rJ   rJ   rK   re         
zImagePatcher.__init__longshortc                 C   s6   |dk r|| dkr|S dS || dkrt |dS dS )N  g      ?r        )min)r_   rq   rr   rJ   rJ   rK   determine_window_size   s   z"ImagePatcher.determine_window_size333333?widthheightsizesstepsimg_rate_thrc                    s  d|  krdksJ d J dg }t ||D ]\}}|\}	}
|\ ||	kr+dn	t||	  d }fddt|D }t|dkrT|d |	 |krT||	 |d< ||
krZdn	t||
   d } fddt|D }t|dkr|d |
 |kr||
 |d< tjtt||td}|d d ddgf |d d ddgf< |	tj
||| gdd	 qtj
|dd	}d
d |D ||ffS )Nr-   r   z#The `in_rate_thr` should lie in 0~1c                       g | ]} | qS rJ   rJ   .0i)step_wrJ   rK   
<listcomp>       z-ImagePatcher.slide_window.<locals>.<listcomp>c                    r~   rJ   rJ   r   )step_hrJ   rK   r      r   dtype)axisc              	   S   sH   g | ] }t |d  t |d t |d |d   t |d |d  fqS )r   r-      r;   int)r   boxrJ   rJ   rK   r      s    :)zipr   rangelennparraylistr   r   appendconcatenate)r_   ry   rz   r{   r|   r}   windowsr`   stepsize_wsize_hx_numx_starty_numy_startstartrJ   )r   r   rK   slide_window   s,   $  $zImagePatcher.slide_windowimgc                 C   sD   |j \}}||kr|S t||}t|j||fd}||d |S )Nr   r   r   )r`   maxr   newmodepaste)r_   r   r=   r<   r`   paddedrJ   rJ   rK   
square_pad   s   

zImagePatcher.square_pad	img_width
img_heightc                 C   s@   || }t ||dk r|dks|dk rt||}||fS ||fS )N    rt   g      ?)rv   r   )r_   r   r   rationew_sizerJ   rJ   rK   get_image_size_for_padding   s
   
z'ImagePatcher.get_image_size_for_paddingc                 C   s<   t ||tkrtt || }t|| }t|| }||fS rn   )r   rR   r   )r_   r   r   scale_factorrJ   rJ   rK   get_image_size_for_preprocess   s
   z*ImagePatcher.get_image_size_for_preprocesswindow_sizec           
      C   s   || }|| }|dk r|}n|||  }|dkrt |d nt |}|| }|dk r.|}n|||  }	|	dkr>t |d nt |}|| }t |t |fS )Nr-   g?r   )
r_   r   r   r   w_ratioh_ratio	width_new	decimal_w
height_new	decimal_hrJ   rJ   rK   get_image_size_for_crop   s   z$ImagePatcher.get_image_size_for_cropr   jthtwc                 C   s   | |||| || f}|S rn   )crop)r_   r   r   r   r   r   targetrJ   rJ   rK   
patch_crop   s   zImagePatcher.patch_cropc                 C   s   |  ||\}}| ||\}}| t||t||}|dks#| js%dS | |||\}}| ||||fg||fg\}\}}t|d | d }t|dkr\t|| dkr\|d8 }t||fS )Nr   r   r-   )	r   r   rw   r   rv   rl   r   r   r   )r_   r   r   r   center_listr   r   	full_rowsrJ   rJ   rK   get_num_patches   s,   zImagePatcher.get_num_patchesc                    s  |j \}}| ||\}}||ks||kr| |}|j \}}| ||\}}|||ftjj}| t	||t
||}|dksD| jsI|g d fS | |||\}}||f||fkre|||ftjj}n|}g }g  | ||||fg||fg\}	\}
}t|	D ]%\}}|\}}}}| |||||}|| |d |
 dkr | q r d t|d kr   ||t|dkr͇ fddtt|D fS d fS )Nr   r-   r   c                    s   g | ]}| v qS rJ   rJ   r   newlinesrJ   rK   r   ;  r   z)ImagePatcher.__call__.<locals>.<listcomp>)r`   r   r   r   resizer   
Resamplingr\   rw   r   rv   rl   r   r   	enumerater   r   r   popr   )r_   r   r   r   new_img_widthnew_img_heightr   img_for_croppatchesr   r   r   patch_idcenter_lf_pointxypatch_wpatch_h	big_patchrJ   r   rK   ri     s`   





zImagePatcher.__call__)T)rx   )rC   rD   rE   boolre   r   rw   r   tuplefloatr   r   r   r   r   r   r   r   ri   rJ   rJ   rJ   rK   rk      s\    "
#	

	

	
 rk   c                       sn  e Zd Zdededdf fddZedefddZd	ed
edefddZ	de
ej de
e fddZ	d&de
ej dede
ej fddZdede
e dB deee
e f fddZdedeee
e f fddZdedede
e dB deee
e f fddZdeded e
e defd!d"Z			d'dee
e B dB deje
ej B dB d#eeB dB defd$d%Z  ZS )(Step3VLProcessorconfig	tokenizerrm   Nc                    s   t    || _|| _d| _d| _t| jd| j| _d| _d| _	d| _
| j
| j | _| j
| j	 | _t| jjdd}t|d	| _d S )
Nrs   ru   bilinear   Q   
<im_patch>rl   Tro   )superre   r   r   
image_sizerb   rS   image_preprocessornum_image_feature_sizenum_patch_feature_sizeimage_tokenimage_feature_placeholderpatch_feature_placeholdergetattrvision_configrk   patcher)r_   r   r   rl   	__class__rJ   rK   re   B  s   

zStep3VLProcessor.__init__c                 C   s   | j  | j S rn   )r   	get_vocabr   r_   rJ   rJ   rK   image_token_id\  s   zStep3VLProcessor.image_token_idr   r   c                 C   s.   | j ||\}}|| jd  | j d | S )Nr   )r   r   r   r   )r_   r   r   rB   num_newlinesrJ   rJ   rK   get_num_image_tokens`  s   z%Step3VLProcessor.get_num_image_tokensimagesc                 C   s"   g }|D ]
}| | | q|S rn   )r   r   )r_   r   resultr   rJ   rJ   rK   _split_imagesj  s   zStep3VLProcessor._split_imagesFrh   c                    s    fdd|D S )Nc                    s   g | ]}j | d d qS )rh   r8   )r   r   r   rh   r_   rJ   rK   r   u  s    zDStep3VLProcessor._convert_images_to_pixel_values.<locals>.<listcomp>rJ   )r_   r   rh   rJ   r   rK   _convert_images_to_pixel_valuesp  s   z0Step3VLProcessor._convert_images_to_pixel_valuesrB   patch_newline_maskc                 C   s   d}g }t |D ]>}t||ksJ |d| j d7 }|| jdg| jg| j  | jdg  |rF|| rF|d7 }|| jd q||fS )N z<patch_start>z<patch_end>z<patch_newline>)	r   r   r   extendr   convert_tokens_to_idsr   r   r   )r_   rB   r   text	token_idsr   rJ   rJ   rK   _get_patch_replz  s&   
z Step3VLProcessor._get_patch_repl
num_imagesc                 C   sH   d| j  d}| jdg| jg| j  | jdg }|| || fS )Nz
<im_start>z<im_end>)r   r   r   r   r   )r_   r   r   r   rJ   rJ   rK   _get_image_repl  s   z Step3VLProcessor._get_image_replpatch_new_line_idxc                 C   s@   |dkr|  ||\}}nd}g }| |\}}|| || fS )Nr   r   )r   r   )r_   r   rB   r   
patch_replpatch_repl_ids
image_replimage_repl_idsrJ   rJ   rK   _get_image_repl_features  s   
z)Step3VLProcessor._get_image_repl_featuresr   placeholderreplsc                 C   sh   | |}t|d t|krtd|d g}t|D ]\}}|| |||d   qd|S )Nr-   zEThe number of placeholders does not match the number of replacements.r   r   )splitr   
ValueErrorr   r   join)r_   r   r  r  partsr   r   replrJ   rJ   rK   replace_placeholder  s   



z$Step3VLProcessor.replace_placeholderreturn_tensorsc                    sz  |d u rg }t |ts|g}|d u rg }t |ts|g}t|dkr*i }|}n|}g }g }g }	g  g }
g }|D ]C\}}}||g t|dkr[|j|dd |t| dt||\}} | |
| |d ur|	| q=t	
|}j}|||rt	
|n|dd||ft	j|	t	jdd} fdd	|D }|}ti |||d
S )Nr   Tr   r-   r;   r   )r8   rB   rA   r   c                    s   g | ]
} |j qS rJ   )r  r   )r   timage_repl_str_lstr_   rJ   rK   r     s    z-Step3VLProcessor.__call__.<locals>.<listcomp>)tensor_type)
isinstancer   r   r   r   r   r   r   r  rH   catrb   	new_emptytensorr   r   )r_   r   r   r  image_inputstext_inputssplitted_images_datapixel_values_lstpatch_pixel_values_lstpatch_newline_mask_lstimage_repl_ids_lstrB   raw_imgimg_patchesr   image_repl_strr  r8   rb   rJ   r  rK   ri     sp   








zStep3VLProcessor.__call__rj   )NNN)rC   rD   rE   r   r)   re   propertyr   r   r   r   r   ImageWithPatchesr   r   rH   rI   r   r   strr   r   r  r  r   r   ri   __classcell__rJ   rJ   r   rK   r   A  sl    
	







r   c                   @   s   e Zd ZdefddZdeeedB f fddZdefddZ	d	ed
eeef deeef fddZ
defddZdedefddZdS )Step3VLProcessingInform   c                 C   s   t |  |  S rn   )r   get_hf_configget_tokenizerr   rJ   rJ   rK   get_hf_processor  s   z&Step3VLProcessingInfo.get_hf_processorNc                 C   s   dd iS Nrg   rJ   r   rJ   rJ   rK   get_supported_mm_limits  s   z-Step3VLProcessingInfo.get_supported_mm_limitsc                 C   s    |   }||  j|  jS rn   )r&  r   !get_image_size_with_most_featuresry   rz   )r_   hf_processorrJ   rJ   rK   get_max_image_tokens  s
   z*Step3VLProcessingInfo.get_max_image_tokensseq_len	mm_countsc                 C   s   d|   iS r'  )r+  )r_   r,  r-  rJ   rJ   rK   get_mm_max_tokens_per_item  s   z0Step3VLProcessingInfo.get_mm_max_tokens_per_itemc                 C   s
   t ddS )NrQ   )r    r   rJ   rJ   rK   r)    rp   z7Step3VLProcessingInfo.get_image_size_with_most_featuresmm_datac                    sN   t |dks
d|vrtd|d }t|ttfs|g}t fdd|D S )Nr-   rg   z5mm_data could only contain one key 'image' for steo1oc                 3   s$    | ]}   |j|jV  qd S rn   )r&  r   ry   rz   r   r   rJ   rK   	<genexpr>(  s
    
z:Step3VLProcessingInfo.get_num_mm_tokens.<locals>.<genexpr>)r   r  r  r   r   sum)r_   r/  
image_datarJ   r   rK   get_num_mm_tokens   s   z'Step3VLProcessingInfo.get_num_mm_tokens)rC   rD   rE   r   r&  r   r!  r   r(  r+  r.  r    r)  r   r3  rJ   rJ   rJ   rK   r#    s    


r#  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Step3VLDummyInputsBuilderr-  rm   c                 C   s   | dd}d| S )Nrg   r   r   )get)r_   r-  r   rJ   rJ   rK   get_dummy_text/  s   z(Step3VLDummyInputsBuilder.get_dummy_textNr,  
mm_optionsc                 C   sB   | j  \}}|dd}|r|dnd }d| j||||diS )Nrg   r   )ry   rz   r   	overrides)infor)  r5  _get_dummy_images)r_   r,  r-  r7  target_widthtarget_heightr   image_overridesrJ   rJ   rK   get_dummy_mm_data3  s   z+Step3VLDummyInputsBuilder.get_dummy_mm_datarn   )
rC   rD   rE   r   r!  r   r6  r   r   r>  rJ   rJ   rJ   rK   r4  .  s    
r4  c                	   @   sX   e Zd Zdedeeef dedee	 fddZ
dedeeef deeef fdd	Zd
S )Step3VLMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsrm   c                    s@   | j jdi |  jdtf fdd}tdg|dgS )Nitem_idxc                    sd   d |  }t |d j}|dkr"|d j} d|| d }n	 ddd d }tj|dS )Nrg   rB   r   r   r-   )seqembed_token_id)r   rO   r  tolistr'   select_token_id)rC  out_itemrB   r   r  r*  image_placeholder_token_idrB  rJ   rK   get_replacement_step1oR  s   

zNStep3VLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_step1org   )modalityr   replacementrJ   )r9  r&  r   r   r%   )r_   r@  rA  rB  rK  rJ   rI  rK   _get_prompt_updatesI  s   z.Step3VLMultiModalProcessor._get_prompt_updates	hf_inputsc                 C   s>   | dtd}ttdtd|tdtd|dS )NrB   r   rg   )r8   rA   rB   r   )r5  rH   emptydictr   batchedflat_from_sizes)r_   rO  rA  rB   rJ   rJ   rK   _get_mm_fields_configi  s   z0Step3VLMultiModalProcessor._get_mm_fields_configN)rC   rD   rE   r!   r   r!  r	   r   r   r&   rN  r   objectr   rT  rJ   rJ   rJ   rK   r?  H  s"    

 

r?  c           
      C   s   |  d}| d}|d d |dd  }}tt|jd d }tt|}| j}||kr||d|||dddd	 }|
tj}tj|||fdddd	
|}|dddd}||| |}tj||gdd
}	|	d|| d |}	|	S | S )Nr   r   r-   r;   r   rT   TF)r`   r   rV   align_cornersdim)r`   squeezer   mathr   shaper   viewpermute
contiguoustorH   float32Finterpolater  )
abs_postgt_sizerX  abs_pos_new	cls_tokenold_pos_embedsrc_sizer   new_pos_embedvision_pos_embedrJ   rJ   rK   get_abs_pos|  s6   

rk  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Step3VisionEmbeddingsr   c                    s   t    || _|j| _|j| _|j| _tt	
d| j| _t|j| j| j| jdd| _| j| j d | _d| _t	j| jd | j| _| jdt	| jd ddd	 d S )
Nr-   T)in_channelsout_channelskernel_sizestridebiasr   rt   position_ids)r-   r   F)
persistent)r   re   r   hidden_size	embed_dimr   rb   nn	ParameterrH   randnclass_embeddingr   num_channelspatch_embeddingrB   pad_tp_size	Embeddingposition_embeddingregister_bufferarangeexpand)r_   r   r   rJ   rK   re     s.   

zStep3VisionEmbeddings.__init__r8   rm   c                 C   s   |j d }| |}|ddd}| j|dd}tj||gdd}|t| 	| j
|d }tj|d d dd d f dd| jd d|gdd}|S )Nr   r   r-   r   rW  )r[  r{  flatten	transposery  r  rH   r  rk  r~  rr  r`   rf   repeatr|  )r_   r8   
batch_sizepatch_embedsclass_embeds
embeddingsrJ   rJ   rK   forward  s"   
*zStep3VisionEmbeddings.forward)	rC   rD   rE   r*   re   rH   rI   r  r"  rJ   rJ   r   rK   rl    s    rl  c                       sD   e Zd ZdZ		ddedB def fddZdejfd	d
Z	  Z
S )Step3VisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   quant_configprefixc              	      s   t    || _|j| _|j| _| j| j | _| jd | _t	 }|r$dnt
 }| j| dks0J | j| | _| j| j | _t| j| j| jd|| d|d| _t| j| jd|| d|d| _t| j| j| j| dd	| _d S )
Ng      r-   r   Tz	.qkv_projrq  r  r  
disable_tpz	.out_projz.attnr  )r   re   r   rt  ru  num_attention_headstotal_num_headshead_dimscaler5   r   	num_headsq_sizer   qkv_projr   out_projr   attn)r_   r   r  r  use_data_paralleltp_sizer   rJ   rK   re     sD   
	
zStep3VisionAttention.__init__hidden_statesc           
      C   sP   |  \}}}| |\}}|jddd\}}}| |||}	| |	\}	}|	S )z#Input shape: Batch x Time x Channelr;   r   )chunksrX  )r`   r  chunkr  r  )
r_   r  bsztgt_len_qkvqkvattn_outputrJ   rJ   rK   r    s   zStep3VisionAttention.forwardNr   )rC   rD   rE   rF   r   r!  re   rH   rI   r  r"  rJ   rJ   r   rK   r    s    /r  c                       sF   e Zd Z		ddedB def fddZdejdejfd	d
Z  Z	S )Step3VisionMLPNr   r  r  c                    sf   t    || _t|j| _t }t|j|j	d|| d|d| _
t|j	|jd|| d|d| _d S )NTz.fc1r  z.fc2)r   re   r   r   
hidden_actactivation_fnr5   r   rt  intermediate_sizefc1r   fc2)r_   r   r  r  r  r   rJ   rK   re     s(   
zStep3VisionMLP.__init__r  rm   c                 C   s*   |  |\}}| |}| |\}}|S rn   )r  r  r  )r_   r  r  rJ   rJ   rK   r  2  s   
zStep3VisionMLP.forwardr  )
rC   rD   rE   r   r!  re   rH   rI   r  r"  rJ   rJ   r   rK   r    s    r  c                       sJ   e Zd Z		ddededB def fddZdejd	ej	fd
dZ
  ZS )Step3VisionEncoderLayerNr   r   r  r  c                    sj   t    |j| _t||| dd| _tj| j|jd| _	t
||| dd| _tj| j|jd| _d S )Nz
.self_attnr  )epsz.mlp)r   re   rt  ru  r  	self_attnrv  	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2r_   r   r  r  r   rJ   rK   re   :  s   
z Step3VisionEncoderLayer.__init__r  rm   c                 C   s,   ||  | | }|| | | }|S rn   )r  r  r  r  r_   r  rJ   rJ   rK   r  O  s   zStep3VisionEncoderLayer.forwardr  )rC   rD   rE   r*   r   r!  re   rH   rI   FloatTensorr  r"  rJ   rJ   r   rK   r  9  s    r  c                       s<   e Zd Z		d
dededB def fddZdd	 Z  ZS )Step3VisionEncoderNr   r   r  r  c                    s8   t     | _t fddt jD | _d S )Nc                    s$   g | ]}t   d | dqS )z.layers.r  )r  r   r   r  r  rJ   rK   r   b  s    z/Step3VisionEncoder.__init__.<locals>.<listcomp>)r   re   r   rv  
ModuleListr   num_hidden_layerslayersr  r   r  rK   re   Y  s   

zStep3VisionEncoder.__init__c                 C   s   |}| j D ]}||}q|S rn   )r  )r_   inputs_embedsr  encoder_layerrJ   rJ   rK   r  l  s   

zStep3VisionEncoder.forwardr  )	rC   rD   rE   r*   r   r!  re   r  r"  rJ   rJ   r   rK   r  X  s    r  c                       sD   e Zd Z		ddededB def fddZdejfd	d
Z	  Z
S )Step3VisionTransformerNr   r   r  r  c                    sD   t    || _t | _|j| _t|| _t||| dd| _	d S )Nz.transformerr  )
r   re   r   r5   r  r   rl  r  r  transformerr  r   rJ   rK   re   w  s   

zStep3VisionTransformer.__init__r8   c                 C   s0   |  |}| jrt|| j}|S | j|d}|S N)r  )r  r  r6   r  )r_   r8   r  rJ   rJ   rK   r    s   
zStep3VisionTransformer.forwardr  )rC   rD   rE   r*   r   r!  re   rH   rI   r  r"  rJ   rJ   r   rK   r  v  s    r  )r9  dummy_inputsc                       s  e Zd ZeddddZdZedededed	B fd
dZ	ddde
dedd	f fddZedd Zedd Zdeded	B fddZdejdejfddZdejdejfddZdedeejd f fd!d"Zdefd#d$Z		d7d	dd%d&ejd'ed	B d(ejd	B d)edejf
 fd*d+Z				d8d&ejd	B d,ejd-ed	B d.ejd	B dedejeB fd/d0Zd1ejdejd	B fd2d3Zd4eeeejf  fd5d6Z   Z!S )9Step3VLForConditionalGenerationzlanguage_model.model.zlanguage_model.lm_head.)zmodel.zlm_head.)orig_to_new_prefixTrL  r   rm   Nc                 C   s   | drdS td)Nrg   r   z Only image modality is supported)
startswithr  )clsrL  r   rJ   rJ   rK   get_placeholder_str  s   
z3Step3VLForConditionalGeneration.get_placeholder_strr   r  vllm_configr  c                   s   t    |jj}|jj}|| _|| _|jdk| _| |dA t	|j
d t|dd| _t|j
j|j
jd|jd| _t|j
j|j
jd dddd	| _tj|j
jd |j|jd
| _W d    n1 scw   Y  | | t||jt|dd| _W d    n1 sw   Y  | jj| _d S )NrO   rg   vision_modelr  r   )ro  rp  r;   r-   )ro  rp  padding)rq  language_model)r  	hf_configr  )r   re   model_configr  multimodal_configr   mm_encoder_tp_moder  _mark_tower_modelr  r   r4   r  r   rt  output_hidden_sizeunderstand_projector_stridevit_downsamplervit_downsampler2rv  Linearprojector_biasvit_large_projector_mark_language_modelr3   text_configr  make_empty_intermediate_tensors)r_   r  r  r   r  r   rJ   rK   re     sP   




z(Step3VLForConditionalGeneration.__init__c                 C      t |  jS rn   )next
parametersdevicer   rJ   rJ   rK   r       z&Step3VLForConditionalGeneration.devicec                 C   r  rn   )r  r  r   r   rJ   rJ   rK   r     r  z%Step3VLForConditionalGeneration.dtypekwargsc                 K   s   | dd }| dd }| dd }| dd }|d u r"|d u r"d S |d ur:|d ur:td|| j|| j|dS |d urHtd|| jdS td)Nr8   rA   rB   rM   )r9   r8   rA   rB   )r9   rM   z This line should be unreachable.)r   r7   r_  r   rL   AssertionError)r_   r  r8   rA   rB   rM   rJ   rJ   rK   _parse_and_validate_image_input  s&   


z?Step3VLForConditionalGeneration._parse_and_validate_image_inputimage_featuresc                 C   s|   |j d d \}}tt|}|ddd|d||}| |}| |}|d}|||dddd}| |}|S )Nr   r   r-   r   )	r[  r   r   r]  r\  r  r  r`   r  )r_   r  BPHWn_dimrJ   rJ   rK   _process_image_features  s   



z7Step3VLForConditionalGeneration._process_image_featuresinput_tensorc                 C   s   |  |d d dd f S )Nrt   )r  )r_   r  rJ   rJ   rK   _get_vision_model_output  s   z8Step3VLForConditionalGeneration._get_vision_model_outputimage_input.c                 C   s  |d dkr|d }n|  |d }t|d dkr!|  |d nd }|d }| |}|d ur5| |nd }g }d}t|D ]B\}}g }	|dkr]||||  }
|	|
d|
jd  |	|| d|jd  ||7 }|t|	dkr|t|	n|	d  q?|S )	Nr9   rM   r8   rA   r   rB   r   r-   )	r  r   r  r   r   r\  r[  rH   r  )r_   r  r  patch_image_featuresrB   merged_image_featurescur_patch_idxr   	num_patchcur_featurepatch_slicerJ   rJ   rK   _process_image_input	  s8   



z4Step3VLForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrJ   )r  r  )r_   r  r  vision_embeddingsrJ   rJ   rK   embed_multimodal.  s
   
z0Step3VLForConditionalGeneration.embed_multimodal)is_multimodalhandle_oov_mm_token	input_idsmultimodal_embeddingsr  r  c                   s0   |d u s|d u rt  |S t  j||||dS )N)r  r  r  )r   embed_input_ids)r_   r  r  r  r  r   rJ   rK   r  5  s   
z/Step3VLForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsr  c                 K   s"   |d urd }| j ||||d}|S r  )r  )r_   r  r  r  r  r  r  rJ   rJ   rK   r  I  s   z'Step3VLForConditionalGeneration.forwardr  c                 C   s   | j |S rn   )r  compute_logitsr  rJ   rJ   rK   r  Z  s   z.Step3VLForConditionalGeneration.compute_logitsweightsc                 C   s   t | }|j|| jdS )N)mapper)r1   load_weightshf_to_vllm_mapper)r_   r  loaderrJ   rJ   rK   r  `  s   z,Step3VLForConditionalGeneration.load_weightsrn   )NN)"rC   rD   rE   r2   r   supports_encoder_tp_dataclassmethodr!  r   r  r   re   r  r  r   rU  rP   r  rH   rI   r  r  r   r  r.   r  r   r  r(   r  r  r   r  r"  rJ   rJ   r   rK   r    s~     .



%


$r  )qrZ  collections.abcr   r   r   	itertoolsr   r   r   typingr   r	   r
   r   numpyr   rH   torch.nnrv  torch.nn.functional
functionalra  PILr   torchvisionr   !torchvision.transforms.functionalr   transformersr   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser    r!   vllm.multimodal.processingr"   r#   r$   r%   r&   r'   vllm.sequencer(   vllm.tokenizersr)   vllm.transformers_utils.configsr*   vllm.utils.tensor_schemar+   r,   
interfacesr.   r/   r0   utilsr1   r2   r3   r4   visionr5   r6   r7   rL   rP   rG   r   r   r   r   rR   r   rS   rk   r   r#  r4  r?  rk  Modulerl  r  r  r  r  r  register_processorr  rJ   rJ   rJ   rK   <module>   st   
 - 4 E)4 5E#