o
    
۾i|                     @   s:  U d dl mZmZmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZC dZDdZEdZFdZGdZHG dd de:ZIG dd de:ZJeIeJB ZKeeLd < d!eMfd"d#ZNd$eOd%ePeQeMeMf  d&eMd'eMd(eMd)eQeMeMf fd*d+ZRd,eMd-eMd.eSd/eSd)eQeMeMf f
d0d1ZTd2eMd3eMd)ePeQeMeMf  fd4d5ZUd6eMd7eMd%ePeQeMeMf  d(eMd/eSd)eQeMeMeMf fd8d9ZVd:ejd%ePeQeMeMf  d(eMd/eSd)ePej f
d;d<ZWd:ejd!eMd2eMd3eMd/eSd)ejXfd=d>ZYG d?d@ d@ZZG dAdB dBe1Z[G dCdD dDe/e[ Z\G dEdF dFe0e[ Z]e"j^e]e[e\dGG dHdI dIe
j_e>e?Z`dS )J    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)Image)BatchFeaturePretrainedConfig
TensorType)
VllmConfig)BaseDummyOptions)ReplicatedLinear)QuantizationConfig)	AWQConfig)InternVisionModelInternVisionPatchModel)MULTIMODAL_REGISTRYconvert_image_mode)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TokenizerLike)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixz<img>z</img>z<IMG_CONTEXT>)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?c                   @   sT   e Zd ZU dZdZed ed< eej	e
ddddf ed< eej	e
d	f ed
< dS )SkyworkR1VImagePixelInputsz
    Dimensions:
        - bnp: Batch size * number of images * (1 + num_patches)
        - c: Number of channels (3)
        - h: Height
        - w: Width
        - bn: Batch size * number of images
    pixel_valuestypebnp   hwpixel_values_flatbnnum_patchesN)__name__
__module____qualname____doc__r0   r   __annotations__r   torchTensorr&    r?   r?   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/skyworkr1v.pyr.   @   s   
 	r.   c                   @   sF   e Zd ZU dZdZed ed< eej	e
ej	 B edddf ed< dS )	SkyworkR1VImageEmbeddingInputsz
    Dimensions:
        - ni: Number of images
        - ifs: Image feature size
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    image_embedsr0   niifshsdataN)r8   r9   r:   r;   r0   r   r<   r   r=   r>   listr&   r?   r?   r?   r@   rA   W   s   
 
rA   SkyworkR1VImageInputs
input_sizec              	   C   sF   t t}}ttdd tj| | ftjjdt tj	||dgS )Nc                 S   s
   t | dS )NRGBr   )imgr?   r?   r@   <lambda>r   s   
 z!build_transform.<locals>.<lambda>)interpolation)meanstd)
IMAGENET_MEANIMAGENET_STDTComposeLambdaResizeInterpolationModeBICUBICToTensor	Normalize)rI   MEANSTDr?   r?   r@   build_transformn   s   
r\   aspect_ratiotarget_ratioswidthheight
image_sizereturnc                C   s|   t d}d}|| }|D ]/}|d |d  }	t| |	 }
|
|k r%|
}|}q|
|kr;|d| | |d  |d  kr;|}q|S )Ninf)r'   r'   r   r'         ?)floatabs)r]   r^   r_   r`   ra   best_ratio_diff
best_ratioarearatiotarget_aspect_ratio
ratio_diffr?   r?   r@   find_closest_aspect_ratio}   s    rm   min_dynamic_patchmax_dynamic_patchdynamic_image_sizeuse_thumbnailc                 C   s4   |r| nd} |r
|nd}|r|dkr|d7 }| |fS )Nr'   r?   rn   ro   rp   rq   r?   r?   r@   resolve_skyworkr1v_min_max_num   s
   rs   min_nummax_numc                    s.    fddt  d D }t|dd dS )Nc                    sX   h | ](}t d |d  D ]}t d |d  D ]}||   kr" krn n||fqqqS )r'   )range).0nijru   rt   r?   r@   	<setcomp>   s    
z/get_skyworkr1v_target_ratios.<locals>.<setcomp>r'   c                 S   s   | d | d  S )Nr   r'   r?   )xr?   r?   r@   rL      s    z.get_skyworkr1v_target_ratios.<locals>.<lambda>)key)rv   sorted)rt   ru   r^   r?   r{   r@   get_skyworkr1v_target_ratios   s   r   
orig_widthorig_heightc           
      C   s`   | | }t ||| ||d}||d  }||d  }|d |d  }	|r+|	dkr+|	d7 }	|	||fS )N)r_   r`   ra   r   r'   )rm   )
r   r   r^   ra   rq   r]   rk   target_widthtarget_heightblocksr?   r?   r@   calculate_skyworkr1v_targets   s   	
r   imagec                C   s   | j \}}t||||dd\}}}| ||f}	g }
t|D ].}|||  | |||  | |||  d | |||  d | f}|	|}|
| qt|
|ksUJ |rit|
dkri| ||f}|
| |
S )NF)r   r   r^   ra   rq   r'   )sizer   resizerv   cropappendlen)r   r^   ra   rq   r   r   r   r   r   resized_imgprocessed_imagesry   box	split_imgthumbnail_imgr?   r?   r@   dynamic_preprocess_skyworkr1v   s.   
	

r   c                   s@   t ||}t|d t| |||d}t fdd|D }|S )N)rI   )r^   ra   rq   c                    s   g | ]} |qS r?   r?   rw   r   	transformr?   r@   
<listcomp>      z4image_to_pixel_values_skyworkr1v.<locals>.<listcomp>)r   r\   r   r=   stack)r   rI   rt   ru   rq   r^   imagesr/   r?   r   r@    image_to_pixel_values_skyworkr1v   s   

r   c                       s  e Zd ZdZdddddedededB dedB dedB d	df fd
dZe	d	efddZ
dededB d	ee fddZddddddedB dedB dedB dedB d	eeef f
ddZddddddedB dedB dedB dedB d	eeeef  f
ddZdeded	efddZ			d#deej dedB dedB dedB d	eej f
ddZ						d$deee B dB dejeej B dB dedB dedB dedB d eeB dB d	efd!d"Z  ZS )%SkyworkR1VProcessorz
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    Nrn   ro   rp   config	tokenizerrn   ro   rp   rb   c                   s   t    || _|| _|jj}|jj}|d u r|j}t|t	s!J |d u r(|j
}t|t	s/J |d u r6|j}t|ts=J t	|| d |jd  | _|| _|| _|| _
|| _|j| _d S )N   )super__init__r   r   vision_configra   
patch_sizern   
isinstanceintro   rp   booldownsample_rationum_image_tokenrq   )selfr   r   rn   ro   rp   ra   r   	__class__r?   r@   r     s,   
	zSkyworkR1VProcessor.__init__c                 C   s   | j  t S N)r   	get_vocabIMG_CONTEXTr   r?   r?   r@   image_token_id@  s   z"SkyworkR1VProcessor.image_token_idfeature_sizer7   c                 C   s    t | }t| t }t|t S r   )r   	IMG_STARTIMG_ENDr"   select_text)r   r   r7   repl_features	repl_fullr?   r?   r@   get_image_replD  s   z"SkyworkR1VProcessor.get_image_replrr   rq   c                C   sX   |d u r| j n|}|d u r| jn|}|d u r| jn|}|d u r"| jn|}t||||dS Nrr   )rn   ro   rp   rq   rs   )r   rn   ro   rp   rq   r?   r?   r@   resolve_min_max_numN  s   	z'SkyworkR1VProcessor.resolve_min_max_numc                C   s    | j ||||d\}}t||S r   )r   r   )r   rn   ro   rp   rq   rt   ru   r?   r?   r@   resolve_target_ratiosj  s   

z)SkyworkR1VProcessor.resolve_target_ratiosimage_widthimage_heightc                C   s2   | j dd}t||| j|| jd\}}}|| j S )NF)rq   )r   r   ra   r^   rq   )r   r   ra   rq   r   )r   r   r   r^   r7   _r?   r?   r@   get_num_image_tokens{  s   
z(SkyworkR1VProcessor.get_num_image_tokensr   c                    s,   j |||dd\  fdd|D S )NFrr   c              	      s"   g | ]}t |j jd qS ))rI   rt   ru   rq   )r   ra   rq   r   ru   rt   r   r?   r@   r     s    zCSkyworkR1VProcessor._images_to_pixel_values_lst.<locals>.<listcomp>)r   )r   r   rn   ro   rp   r?   r   r@   _images_to_pixel_values_lst  s   
z/SkyworkR1VProcessor._images_to_pixel_values_lsttextreturn_tensorsc                    s   |d u rg }t |ts|g}|d u rg }t |ts|g}t|dkr%i }n7| j||||d}t|tdd |D d}|D ]}	|	jd }
|
| j }| 	||
  fdd|D }q@| 
|}i ||}t||dS )Nr   r   c                 S   s   g | ]}t |qS r?   )r   )rw   itemr?   r?   r@   r     r   z0SkyworkR1VProcessor.__call__.<locals>.<listcomp>)r5   image_num_patchesc                    s   g | ]
}| d  jdqS )<image>r'   )replacefull)rw   t
image_replr?   r@   r     s    )tensor_type)r   rG   r   r   r=   cattensorshaper   r   r   r	   )r   r   r   rn   ro   rp   r   image_inputspixel_values_lstr/   r7   r   text_inputscombined_outputsr?   r   r@   __call__  s:   	




zSkyworkR1VProcessor.__call__)NNN)NNNNNN)r8   r9   r:   r;   r
   r$   r   r   r   propertyr   r"   strr   tupler   rG   r   r   r   r=   r>   r   r   r	   r   __classcell__r?   r?   r   r@   r     s    &






r   c                   @   sd   e Zd ZdedefddZdeeedB f fddZ	ded	ed
edB defddZ
defddZdS )SkyworkR1VProcessingInfokwargsrb   c                 K   s"   | j jtf|  |  d|S )N)r   r   )ctxinit_processorr   get_hf_configget_tokenizer)r   r   r?   r?   r@   get_hf_processor  s   z)SkyworkR1VProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nr   r?   r   r?   r?   r@   get_supported_mm_limits  s   z0SkyworkR1VProcessingInfo.get_supported_mm_limitsr   r   	processorc                C   s   |d u r|   }|j||dS )N)r   r   )r   r   )r   r   r   r   r?   r?   r@   r     s   z-SkyworkR1VProcessingInfo.get_num_image_tokensc                 C   s   |   }|j}| }d\}}|D ]!\}}|| || }}	| j||	|d}
|
|kr2|
}t||	d}q|dks;|d u r?td|S )N)r   Nr   r   r   )r_   r`   r   z(Cannot have a largest feature size of 0!)r   ra   r   r   r   
ValueError)r   r   	base_sizer^   largest_feature_sizelargest_feature_pinpointwrhrr_   r`   	feat_sizer?   r?   r@   !get_image_size_with_most_features  s$   z:SkyworkR1VProcessingInfo.get_image_size_with_most_features)r8   r9   r:   objectr   r   r   r   r   r   r   r   r   r?   r?   r?   r@   r     s    
r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )SkyworkR1VDummyInputsBuilder	mm_countsrb   c                 C   s   | dd}d| S )Nr   r   r   )get)r   r   
num_imagesr?   r?   r@   get_dummy_text  s   z+SkyworkR1VDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | j  \}}|dd}|r|dnd }d| j||||diS )Nr   r   )r_   r`   r   	overrides)infor   r   _get_dummy_images)r   r   r   r   r   r   r   image_overridesr?   r?   r@   get_dummy_mm_data  s   z.SkyworkR1VDummyInputsBuilder.get_dummy_mm_datar   )
r8   r9   r:   r   r   r   r   r   r   r   r?   r?   r?   r@   r     s    	
r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )SkyworkR1VMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrb   c                    s>   t  j||||d}| jjdi |}|j}t||d< |S )N)r   r   r   r   r   r?   )r   _call_hf_processorr   r   r   r=   r   )r   r   r   r   r   processed_outputshf_processorr   r   r?   r@   r   )  s   z0SkyworkR1VMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   sF   | dtd}t|}ttd|tdtdtd|dS )Nr   r   r   )r5   r   rB   r   )	r   r=   emptyr   dictr   flat_from_sizesbatchedshared)r   r   r   r   r   r?   r?   r@   _get_mm_fields_configA  s   
z3SkyworkR1VMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   j jd	i | | }d|v r"|d ttjsJ  nd|v r0d gt|d  ng dtf fdd}t	dd|dgS )
Nr   rB   item_idxc                    sp    dttf}t|tr|| }n|| }jj|j|j	 d}|  }|d ur2t|t
s2J  ||S )Nr   r   )	get_itemsr   r   r   get_feature_sizeget_image_sizer   r   r_   r`   r   r   )r	  r   r   ra   r7   r   r   r  r   r?   r@   get_replacement_skyworkr1vf  s   

zUSkyworkR1VMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_skyworkr1vr   r   )modalitytargetreplacementr?   )
r   r   get_datar   r=   r>   tolistr   r   r    )r   r  r   r  out_mm_datar  r?   r  r@   _get_prompt_updatesR  s    
z1SkyworkR1VMultiModalProcessor._get_prompt_updates)r8   r9   r:   r   r   r   r	   r   r   r  r   r   r   r!   r  r   r?   r?   r   r@   r   (  s8    






r   )r   dummy_inputsc                       s  e Zd ZededededB fddZddd	ed
eddf fddZde	de
fddZde	de
dB ded
efddZ	d:de	de
d
edejfddZd;ddZdejdejfddZdededB fddZdedejeej B eejd f B fd!d"Zd#ejddfd$d%Zdedefd&d'Z	d<dd(d)d#ejd*edB d+ejdB d,edejf
 fd-d.Z		d=d#ejdB d/ejd0edB d1ejdB dedefd2d3Z d4ejdejdB fd5d6Z!d7e"eeejf  de#e fd8d9Z$  Z%S )>SkyworkR1VChatModelr  ry   rb   Nc                 C   s   | drdS td)Nr   r   z Only image modality is supported)
startswithr   )clsr  ry   r?   r?   r@   get_placeholder_str  s   
z'SkyworkR1VChatModel.get_placeholder_str prefixvllm_configr  c          	         sR  t    |jj}|j}|jj}|| _|| _| || |jp"|j	j
}|j	j}|| _t|| d |jd  | _|j| _|j| _|jjd }|dk| _| |d" | j||| jt|dd| _| j||t|dd| _W d    n1 svw   Y  | | t||jt|d	d
| _W d    n1 sw   Y  d | _d | _| jj| _d S )Nr   r   SkyworkLM2VEForCausalLMr   vision_model)quant_configis_monor  mlp1r  language_model)r  	hf_configr  )r   r   model_configr%  r!  multimodal_configr   _patch_quant_configforce_image_sizer   ra   r   r   r   r   
ps_versiontext_configarchitecturesr"  _mark_tower_model_init_vision_modelr-   r   
_init_mlp1r#  _mark_language_modelr,   r$  img_context_token_idvisual_token_maskmake_empty_intermediate_tensors)	r   r  r  r   r!  r'  ra   r   llm_arch_namer   r?   r@   r     sN   



zSkyworkR1VChatModel.__init__r   r!  c                 C   sF   t |tr|j}t|dd }|js|d ur!|jd d S d S d S d S )Nquantization_configr   )r   r   r+  getattrmodules_to_not_convertr   )r   r   r!  r+  llm_quant_configr?   r?   r@   r(    s   
z'SkyworkR1VChatModel._patch_quant_configr"  c                C   sH   |s|j }|dk r|jj| d }n|d }t|j|||dS t|jS )Nr   r'   )r!  num_hidden_layers_overrider  )select_layerr   num_hidden_layersr   r   )r   r   r!  r"  r  vision_feature_layerr;  r?   r?   r@   r.    s   
z&SkyworkR1VChatModel._init_vision_modelc                 C   sv   |j j}|jj}tt|td| j d  t|td| j d  |d|| ddt	 t||d|| ddS )Nr'   r   Fz.1)return_biasr!  r  z.3)
r   hidden_sizer+  nn
Sequential	LayerNormr   r   r   GELU)r   r   r!  r  vit_hidden_sizellm_hidden_sizer?   r?   r@   r/    s(   zSkyworkR1VChatModel._init_mlp1rd   c              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }| jdkr@	 |S |dddd }|S )Nr   r   r'   r2   v1)r   viewr   permute
contiguousr*  )r   r}   scale_factorrx   r4   r3   cr?   r?   r@   pixel_shuffle  s    


z!SkyworkR1VChatModel.pixel_shuffler/   c                 C   s   | j |d}|d d dd d d f }t|jd d  }}||jd ||d}| j|| jd}||jd d|jd }| |}|S )N)r/   r'   rd   r   )rI  )r   r   r   reshaperK  r   r#  )r   r/   
vit_embedsr3   r4   r?   r?   r@   extract_feature  s   
z#SkyworkR1VChatModel.extract_featurer   c                 K   s   | dd }| dd }| dd }|d u r|d u rd S |d ur&td|dS |d }t|tjr8|   }t|ts?J || _	|d urWt
d||| jjj| jjjddS td	)
Nr5   r   rB   )r0   rF   r   r/   )r3   r4   )r0   r5   r7   resolve_bindingsz This line should be unreachable.)poprA   r   r=   r>   flattenuniquer   r   r1  r.   r   r   ra   AssertionError)r   r   r5   r   rB   r   r?   r?   r@   _parse_and_validate_image_input   s2   
z3SkyworkR1VChatModel._parse_and_validate_image_inputimage_input.c                    s   |d dkr
|d S |  |d }|d }t|dkr'|d| jjjdS |jd  |d| jjj} fd	d
|D }||S )Nr0   rB   rF   r5   r7   r'   rL  r   c                    s   g | ]}|  qS r?   r?   )rw   r7   r   r?   r@   r   Y  s    z<SkyworkR1VChatModel._process_image_input.<locals>.<listcomp>)	rO  r   rF  r   r+  r>  	unsqueezer   split)r   rV  rB   r7   image_feature_sizesr?   rW  r@   _process_image_inputD  s   


z(SkyworkR1VChatModel._process_image_input	input_idsc                 C   s(   | j r|| jkdd| _d S d | _d S )NrL  r'   )r"  r1  rM  r2  )r   r\  r?   r?   r@   _set_visual_token_mask^  s
   


z*SkyworkR1VChatModel._set_visual_token_maskc                 K   s&   | j di |}|d u rg S | |S )Nr?   )rU  r[  )r   r   rV  r?   r?   r@   embed_multimodalf  s   
z$SkyworkR1VChatModel.embed_multimodalF)is_multimodalhandle_oov_mm_tokenmultimodal_embeddingsr_  r`  c                   sN   |d urt |dkr| | |d u s|d u rt |S t j||||dS )Nr   )ra  r_  r`  )r   r]  r   embed_input_ids)r   r\  ra  r_  r`  r   r?   r@   rb  m  s   
z#SkyworkR1VChatModel.embed_input_ids	positionsintermediate_tensorsinputs_embedsc                 K   sP   |d urd }||||d}| j d ur|d| j i d | _ | jjdi |}|S )N)r\  rc  rd  re  r2  r?   )r2  updater$  model)r   r\  rc  rd  re  r   forward_kwargshidden_statesr?   r?   r@   forward  s   
zSkyworkR1VChatModel.forwardri  c                 C   s   | j |S r   )r$  compute_logits)r   ri  r?   r?   r@   rk    s   z"SkyworkR1VChatModel.compute_logitsweightsc                 C   s   g d}t | |d}||S )N)action_embedtemporal_embedtrack_embedtrack_embed_decoder	box_tokencg_criterioncg_modelloc_encoderloc_decodersamtemporal_tokentrack_token)skip_prefixes)r+   load_weights)r   rl  ry  loaderr?   r?   r@   rz    s   
z SkyworkR1VChatModel.load_weights)r  )rd   r   )NN)&r8   r9   r:   classmethodr   r   r  r   r   r
   r   r(  r   r.  r?  Moduler/  rK  r=   r>   rO  r   rH   rU  rG   r   r[  r]  r(   r^  rb  r#   rj  rk  r   setrz  r   r?   r?   r   r@   r    s     /




$



,r  )acollections.abcr   r   r   typingr   r   r   r=   torch.nnr?  torchvision.transforms
transformsrR   PILr   transformersr	   r
   r   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.quantization.awqr   %vllm.model_executor.models.intern_vitr   r   vllm.multimodalr   vllm.multimodal.imager   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   r   r    r!   r"   vllm.sequencer#   vllm.tokenizersr$   vllm.utils.tensor_schemar%   r&   
interfacesr(   r)   r*   utilsr+   r,   r-   r   r   r   rP   rQ   r.   rA   rH   r<   r   r\   re   rG   r   rm   r   rs   r   r   r   r>   r   r   r   r   r   register_processorr}  r  r?   r?   r?   r@   <module>   s   	 







*
 I4\