o
    -ird                     @   sV  U d dl mZmZmZ d dlmZmZmZmZ d dl	Z
d dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z!m"Z" d dl#m$Z$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@mAZAmBZBmCZCmDZD eeEZFdZGeddddddddddd 
ZH	!d8d"ed#edB d$eId%e8fd&d'ZJG d(d) d)e5ZKG d*d+ d+e5ZLeKeLB ZMeeNd,< G d-d. d.ejOZPG d/d0 d0e,ZQG d1d2 d2e)eQ ZRG d3d4 d4e+eQ ZSejTeSeQeRd5G d6d7 d7ejOe;e<e=ZUdS )9    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliasN)BatchFeatureCLIPVisionConfigPretrainedConfigProcessorMixin)
VllmConfig)BaseDummyOptions)init_logger)QuantizationConfig)VocabParallelEmbedding)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilder)BaseMultiModalProcessorBaseProcessingInfoMultiModalPromptUpdatesPlaceholderFeaturesInfoPromptReplacementPromptUpdateResolvedPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPPSupportsQuant_require_is_multimodal)AutoWeightsLoaderWeightsMapper_merge_multimodal_embeddingsinit_vllm_registered_modelmaybe_prefixi,}  g        
quick_gelu   P  i               i   )
dropout
hidden_acthidden_size
image_sizeintermediate_sizenum_attention_headsnum_channelsnum_hidden_layers
patch_sizeprojection_dim 	hf_configquant_configprefixreturnc                 C   sF   t }| jdd}|dk r|j| d }n|d }t||||d}|S )N	layer_idxr   r%   )rD   num_hidden_layers_overriderE   )!CLIP_VIT_LARGE_PATCH14_336_CONFIGimg_processorgetr?   r&   )rC   rD   rE   clip_configrG   r?   rK    rN   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/phi3v.py_init_img_processor`   s   rP   c                
   @   sl   e Zd ZU dZdZed ed< eej	e
ej	 B eddddd	dhd
f ed< eej	dB eddf ed< dS )Phi3VImagePixelInputsz
    Dimensions:
        - b: Batch size
        - n: Number of images
        - p: Number of patches
        - h: Height of each patch
        - w: Width of each patch
    pixel_values)rR   image_embedstypebnpr5   hw)dynamic_dimsN   image_sizes__name__
__module____qualname____doc__rT   r   __annotations__r   torchTensorlistr$   rN   rN   rN   rO   rQ   x   s   
 	 rQ   c                   @   sF   e Zd ZU dZdZed ed< eej	e
ej	 B edddf ed< dS )	Phi3VImageEmbeddingInputsz
    Dimensions:
        - b: Batch size
        - n: Number of images
        - f: Image feature size (e.g., number of tokens per image)
        - h: Hidden size (must match language model backbone)
    rS   rT   rU   frW   dataNr\   rN   rN   rN   rO   re      s   
 
re   Phi3VImageInputsc                	       s   e Zd ZdZ	ddededB deddf fdd	Zd
ej	dej	fddZ
dej	dejdej	fddZdd Zdd Zdd Z  ZS )Phi3HDImageEmbeddingz'Phi3 Image embedding with HD transform.rB   configrD   NrE   rF   c           
   	      s4  t    t|dr|jn|j}t||| dd| _|jd }|jd | _|| _|j	
dd| _|j	
dd| _|j	
d	d
| _| jrG| jsIJ ttdd| jd g| _ttddd| jd g| _|}d}t|d |g}td|D ]}	|t t||g qztj| | _|j
dd| _d S )Nn_embdz.img_processorrD   rE   image_dim_outnum_img_tokensuse_hd_transformFwith_learnable_separatorhd_transform_orderglb_subr%      rZ   type_featurepatch)super__init__hasattrrk   r:   rP   rK   rn   rm   
embd_layerrL   ro   rp   rq   nn	Parameterrb   emptyglb_GNsub_GNLinearrangeextendGELU
Sequentialimg_projectionrt   )
selfrj   rD   rE   r:   rm   dim_projectiondepthlayers_	__class__rN   rO   rw      s2   

 zPhi3HDImageEmbedding.__init__
img_embedsc                 C   sD   | j }| |}|dkr|d d dd f }|S |dkr|S t|)Nru   r%   	cls_patch)rt   rK   NotImplementedError)r   r   rt   img_featurepatch_featurerN   rN   rO   get_img_features   s   
z%Phi3HDImageEmbedding.get_img_featuresrR   r[   c           
      C   sH   |j \}}}}}|dd}| |}|||d| j}| ||}	|	S )z
        process image and return vision embeddings.

        pixel_values: (num_images, num_crops, c, h, w)
        output: (num_images, num_img_tokens, hidden_size)
        r   r%   )shapeflattenr   reshaperm   hd_feature_transform)
r   rR   r[   
num_images	num_cropscrW   rX   img_featuresimage_features_projrN   rN   rO   forward   s   	

zPhi3HDImageEmbedding.forwardc                 C   s(  | j dksJ d| j  dt| jtjr$| jd jj}| jd jj}n
| jjj}| jjj}|dddf }| |dd}| 	|}g }t
|D ]I\}	}
|
\}}|d }|d }|| }||	dd| f }| |||}| 	|}t|d| jd||	 g}| |||}|| qH|S )zH
        image_features: (num_images, num_crops+1, 24*24, 1024)
        sub_glbzhd_transform_order `z` not implementedr   Nr%   r3   )rq   
isinstancer   rz   r   biasdevicedtypereshape_hd_patches_2x2mergeadd_image_newline	enumeraterb   catsqueezer}   toappend)r   image_featuresr[   target_devicetarget_dtypeglobal_image_featuresglobal_image_features_hd global_image_features_hd_newlinebatch_image_features_projiimg_sizerW   rX   h_cropw_cropr   sub_image_featuressub_image_features_hdsub_image_features_hd_newlineimage_embeddingsimg_projrN   rN   rO   r      sR   


	
z)Phi3HDImageEmbedding.hd_feature_transformc           
      C   s   |j \}}}|dkr|dkr|||  dksJ |||  }t|d }|||||||d d|d d|dddddd	|d
d| ||||d |d d
dddddd	||| d || d d| }	|	S )z
        image_features: (num_images*num_crops, 24*24, 1024)
        output: (num_images, h_crop*12, w_crop*12, 4096)
        where h_crop*w_crop == num_crops
        i@  r2   r   g      ?rZ   r%   r5   rs      r   )r   intr   permute)
r   r   r   r   NLCr   Himage_features_hdrN   rN   rO   r   0  s"   $z0Phi3HDImageEmbedding.reshape_hd_patches_2x2mergec                 C   s@   |j \}}}}| j||dd}tj||gdd|d|}|S )z
        image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
        output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
        r   rZ   )dim)r   r~   expandrb   r   r   )r   r   r   rW   rX   hid_dimnewline_embeddingsimage_features_hd_newlinerN   rN   rO   r   I  s   
z&Phi3HDImageEmbedding.add_image_newlinerB   )r]   r^   r_   r`   r   r   strrw   rb   FloatTensorr   rc   r   r   r   r   __classcell__rN   rN   r   rO   ri      s.    ,
;ri   c                	   @   sX   e Zd ZdeeedB f fddZdddedededB defd	d
Zde	fddZ
dS )Phi3VProcessingInforF   Nc                 C   s   dd iS NimagerN   r   rN   rN   rO   get_supported_mm_limitsZ  s   z+Phi3VProcessingInfo.get_supported_mm_limits)	processorimage_widthimage_heightr   c                C   s   |d u r|   }|j||dS )N)widthheight)get_hf_processor%calc_num_image_tokens_from_image_size)r   r   r   r   rN   rN   rO   get_num_image_tokens]  s   z(Phi3VProcessingInfo.get_num_image_tokensc                 C   s   t dddS )Ni@  2   )r   r   )r   r   rN   rN   rO   !get_image_size_with_most_featuresl  s   z5Phi3VProcessingInfo.get_image_size_with_most_features)r]   r^   r_   r   r   r   r   r   r   r   r   rN   rN   rN   rO   r   Y  s    
r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Phi3VDummyInputsBuilder	mm_countsrF   c                 C   s.   | dd}| j }|j}d|d | S )Nr   r   rB   )rL   infor   
img_tokensjoin)r   r   r   hf_processorimage_tokensrN   rN   rO   get_dummy_textr  s   
z&Phi3VDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j \}}|r| dnd }d| j||||diS )Nr   r   )r   r   r   	overrides)rL   r   r   _get_dummy_images)r   r   r   r   r   target_widthtarget_heightimage_overridesrN   rN   rO   get_dummy_mm_dataz  s   z)Phi3VDummyInputsBuilder.get_dummy_mm_dataN)
r]   r^   r_   r   r   r   r   r   r   r   rN   rN   rN   rO   r   q  s    
r   c                       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZdededef fddZdee dedeee eeee f f f fddZ  ZS )Phi3VMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrF   c                    s@   t  j||||d}|d }t|tjsJ ||dk t |S )N)r   r   r   r   	input_idsr   )rv   _call_hf_processorr   rb   rc   masked_fill__IMAGE_TOKEN_ID)r   r   r   r   r   processed_outputsr   r   rN   rO   r     s   z+Phi3VMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s    t tdtdtddS )Nr   )rR   r[   rS   )dictr   batched)r   r   r   rN   rN   rO   _get_mm_fields_config  s
   z.Phi3VMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s@   j jdi |  j}dtf fdd}td|j|dgS )Nitem_idxc                    sP    dttf}t|tr|| }n|| }jj|j|j	 d}t
g| S )Nr   )r   r   r   )	get_itemsr   r   r   get_feature_sizeget_image_sizer   r   r   r   r   )r   imagesnum_image_tokensr;   r   r   r   rN   rO   get_replacement_phi3v  s   


zKPhi3VMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_phi3vr   )modalitytargetreplacementrN   )r   r   r   r   r   __getitem__)r   r   r   r   r   r  rN   r  rO   _get_prompt_updates  s   z,Phi3VMultiModalProcessor._get_prompt_updatescached_updatenew_item_idxc                    s:   t  ||}|jdkr| j }|j}||| }|S r   )rv   _recompute_cached_prompt_updater  r   r   r   with_target)r   r  r	  
new_updater   r   r   rN   rO   r
    s   

z8Phi3VMultiModalProcessor._recompute_cached_prompt_update	token_idsmm_prompt_updatesc           
         s^  t |r|| j  t |r|d  jkr|dd  } |} j D ]$}t|tr5|	| d|}q$t|t
rH|D ]}|	| d|}q<q$d} fddt||D } fddt||D }t |t |krr|g  dd t||D }t j||d	\}}	t |r|d d
  jdddkr|d g|d
d  }dd |	 D }	||	fS )Nr   r%    z<\|image_\d+\|>c                    s   g | ]} |j qS rN   r   .0chunk	tokenizerrN   rO   
<listcomp>  s    
zBPhi3VMultiModalProcessor._apply_prompt_updates.<locals>.<listcomp>c                    s   g | ]	} |d dj qS )Fadd_special_tokensr  r  r  rN   rO   r    s    c                 S   s$   g | ]}|D ]	}|D ]}|q
qqS rN   rN   )r  sublisteleerN   rN   rO   r    s    
)r  r  rZ   z<s> <|image|>Fr  c                 S   s    i | ]\}}|d d |D qS )c              	   S   s,   g | ]}t |j|j|jd  |j|jdqS )r%   )r  r   	start_idxtokensis_embed)r   r  r   r  r  r  )r  rV   rN   rN   rO   r    s    zMPhi3VMultiModalProcessor._apply_prompt_updates.<locals>.<dictcomp>.<listcomp>rN   )r  r  psrN   rN   rO   
<dictcomp>  s    zBPhi3VMultiModalProcessor._apply_prompt_updates.<locals>.<dictcomp>)lenr   get_tokenizerbos_token_iddecodespecial_tokens_mapvaluesr   r   replacerd   resplitfindallr   ziprv   _apply_prompt_updatesencodeitems)
r   r  r  textspecial_tokensspecial_tokenpatternprompt_chunks
image_tagsplaceholdersr   r  rO   r,    sF   









z.Phi3VMultiModalProcessor._apply_prompt_updates)r]   r^   r_   r   r   objectr	   r   r   r   r   r   r   r   r    r  r!   r   r
  rd   r   tupler   r,  r   rN   rN   r   rO   r     sT    







"r   )r   dummy_inputsc                       sB  e Zd ZeddddddZededed	ed
B fddZddde	def fddZ
ded	ed
B fddZded	ejfddZded	efddZ	
d.d
dddejded
B dejd
B d ed	ejf
d!d"Z	
	
d/dejd#ejd$ed
B d%ejd
B def
d&d'Zd(ejd	ejd
B fd)d*Zd+eeeejf  d	ee fd,d-Z  ZS )0Phi3VForCausalLMembed_tokenszvision_embed_tokens.zlanguage_model.lm_head.zlanguage_model.model.)zmodel.vision_embed_tokens.wtezmodel.vision_embed_tokens.zlm_head.zmodel.)orig_to_new_prefixr  r   rF   Nc                 C   s   | drd| dS td)Nr   z<|image_z|>z Only image modality is supported)
startswith
ValueError)clsr  r   rN   rN   rO   get_placeholder_str9  s   
z$Phi3VForCausalLM.get_placeholder_strrB   )rE   vllm_configrE   c                   s   t    |jj}|j}|jj}|| _|| _t| _| 	|d! t
|j|j|t|dd| _t||t|dd| _W d    n1 sCw   Y  | | t|ddgd| _W d    n1 saw   Y  | jj| _d S )Nr   zmodel.embed_tokensrl   zmodel.vision_embed_tokensrB   LlamaForCausalLM)r@  rE   architectures)rv   rw   model_configrC   rD   multimodal_configrj   r   image_token_id_mark_tower_modelr   
vocab_sizer:   r0   r:  ri   vision_embed_tokens_mark_language_modelr/   language_modelmake_empty_intermediate_tensors)r   r@  rE   rj   rD   rD  r   rN   rO   rw   @  s:   


zPhi3VForCausalLM.__init__kwargsc                 K   sv   | dd }| dd }| dd }|d u r|d u rd S |d ur-td||tjtjddS |d ur7td|dS td)NrR   r[   rS   )rW   rX   )rT   rR   r[   resolve_bindings)rT   rg   z This line should be unreachable.)poprQ   rJ   r;   re   AssertionError)r   rL  rR   r[   rS   rN   rN   rO   _parse_and_validate_image_inputf  s(   
z0Phi3VForCausalLM._parse_and_validate_image_inputimage_inputc                 C   s,   |d dkr
|d S |  |d |d }|S )NrT   rS   rg   rR   r[   )rH  )r   rQ  rS   rN   rN   rO   _process_image_input  s   z%Phi3VForCausalLM._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrN   )rP  rR  )r   rL  rQ  vision_embeddingsrN   rN   rO   embed_multimodal  s
   
z!Phi3VForCausalLM.embed_multimodalFis_multimodalhandle_oov_mm_tokenr   multimodal_embeddingsrV  rW  c                C   s>   | j || j||d}|d u st|dkr|S t||t|dS )NrU  r   )inputs_embedsrX  rV  )_embed_text_input_idsr:  r!  r.   r+   )r   r   rX  rV  rW  rY  rN   rN   rO   embed_input_ids  s   z Phi3VForCausalLM.embed_input_ids	positionsintermediate_tensorsrY  c                 K   s$   |d urd }| j j||||d}|S )N)rY  )rJ  model)r   r   r\  r]  rY  rL  hidden_statesrN   rN   rO   r     s   zPhi3VForCausalLM.forwardr_  c                 C   s   | j |S r   )rJ  compute_logits)r   r_  rN   rN   rO   r`    s   zPhi3VForCausalLM.compute_logitsweightsc                 C   s:   t | }|j|| jd}d|vr| jjj| _|d |S )N)mapperzembed_tokens.weight)r,   load_weightshf_to_vllm_mapperrJ  r^  r:  add)r   ra  loaderautoloaded_weightsrN   rN   rO   rc    s   
zPhi3VForCausalLM.load_weightsr   )NN)r]   r^   r_   r-   rd  classmethodr   r   r?  r   rw   r6  rh   rP  rb   rc   rR  r'   rT  boolr[  r"   r   r`  r   r7  setrc  r   rN   rN   r   rO   r9  *  sp    	&





,r9  r   )Vcollections.abcr   r   r   typingr   r   r   r   regexr(  rb   torch.nnrz   transformersr	   r
   r   r   vllm.configr   vllm.config.multimodalr   vllm.loggerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   $vllm.multimodal.processing.processorr   r   r   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   clipr&   
interfacesr'   r(   r)   r*   r+   utilsr,   r-   r.   r/   r0   r]   loggerr   rJ   r   rP   rQ   re   rh   ra   Moduleri   r   r   r   register_processorr9  rN   rN   rN   rO   <module>   sx   $	
 6 