o
    
۾ij                  	   @   s|  U d dl mZmZmZ d dlmZ d dlmZmZm	Z	 d dl
Zd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZE e%eFZGG dd de:ZHeHZIe	eJd< ded eKd!eKd"ejLdB fd#d$ZMd%eKd&eKd'eKd"ejNfd(d)ZOG d*d+ d+ejPZQG d,d- d-ejPZRG d.d/ d/eAZSG d0d1 d1e2ZTG d2d3 d3e0eT ZUG d4d5 d5e1eT ZVe'jWeVeTeUd6G d7d8 d8ejPe>e?ZXdS )9    )IterableMappingSequence)partial)	AnnotatedLiteral	TypeAliasN)	rearrange)Image)LayerNorm2d)resample_abs_pos_embed)RegStage)nn)BatchFeature)BaseModelOutput)Qwen2VLVisionConfig)
VllmConfig)BaseDummyOptions)init_logger)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)resolve_obj_by_qualname)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)Qwen2VisionTransformer)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixc                   @   sN   e Zd ZU dZed ed< eeje	ddf ed< eeje	ddf ed< d	S )
KananaVImagePixelInputsz
    Dimensions:
        - np: The total number of patches over all images in the batch
        - cps: Number of channels * patch_size * patch_size
        - ni: Number of images
    pixel_valuestypenpcpsni   vision_grid_thwN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr#    r;   r;   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/kanana_v.pyr,   0   s   
 r,   KananaVImageInputsconfignum_input_tokensvision_hidden_sizereturnc                 C   s8   | j rttd||}tjj|ddd |S d}|S )z:Build positional embeddings for the visual encoder output.r$   g        g{Gz?)meanstdN)pos_embr   	Parameterr9   zerosinittrunc_normal_)r>   r?   r@   rD   r;   r;   r<   build_pos_embedsH   s   rI   depthhidden_sizeoutput_hidden_sizec                 C   sH   t ||g}td| D ]}|t   |t || qt j| S )z6Simple SiLU-activated MLP used as a projector readout.r$   )r   LinearrangeappendSiLU
Sequential)rJ   rK   rL   layers_r;   r;   r<   	build_mlpW   s
   
rT   c                       sH   e Zd ZdZdeddf fddZ	ddejd	edejfd
dZ	  Z
S )
PatchMergez9Merge neighboring patches spatially to reduce resolution.
merge_sizerA   Nc                    s   t    || _d S N)super__init__rV   )selfrV   	__class__r;   r<   rY   g   s   

zPatchMerge.__init__Fxchannel_lastc                 C   s4   |rt |d}|j\}}}}t |d| j| jd}|S )z+Merge patches by `merge_size x merge_size`.zB H W D -> B D H Wz$B D (H h2) (W w2) -> B (D h2 w2) H W)h2w2)r	   shaperV   )rZ   r]   r^   rS   HWmerged_xr;   r;   r<   forwardk   s   
zPatchMerge.forward)F)r4   r5   r6   r7   intrY   r9   r:   boolre   __classcell__r;   r;   r[   r<   rU   d   s    rU   c                       s   e Zd ZdZdededdf fddZd fdd	Zdd
dZde	j
de	j
dedefddZde	j
deeef de	j
fddZ  ZS )DynamicCAbstractorz,Dynamic C-Abstractor based on RegNet blocks.r>   r?   rA   Nc                    sb   t    t|dsJ d|| _|j| _|j| _|dkr |j}|| _t|||j| _	| 
  d S )NrV   zmerge_size must be provided.)rX   rY   hasattrr>   rV   pos_emb_sizer?   rI   encoder_hidden_sizerD   	build_net)rZ   r>   r?   r[   r;   r<   rY      s   
zDynamicCAbstractor.__init__c                    s   |sd S | j d urAtd}d }|D ]}||r|} nq|d us$J || }|d| j dd krA|d d dd f ||< t j|g|R i | d S )Nz[\w,.]*abstractor[\w,.]*pos_embr$   )rD   recompilematchsizerX   _load_from_state_dict)rZ   
state_dictargskwargskey_repos_emb_keykeyrD   r[   r;   r<   rs      s   


z(DynamicCAbstractor._load_from_state_dictc           
      C   s   | j j}| j j}| j j}| j j}| j j}ttddtj	t
d}||||}t| jd}||| jd | |}	|rIt|||	g| _t|||| _d S || _t|||| _d S )Nr$   )stridedilation	act_layer
norm_layer)rV      )r>   rm   rK   rL   rJ   	mlp_depthr   r   r   rP   r   rU   rV   
ModuleListnetrT   readout)
rZ   rm   rK   rL   rJ   r   RegBlocks1samplers2r;   r;   r<   rn      s8   zDynamicCAbstractor.build_netflattened_visual_embedsgrid_thwunused_kwargsc                 K   s   t j|dd}t || }g }t||D ]V\}}|\}}	}
|dks'J dt|d||	|
d}|dddf }| jdur^t| jtt	| j
d gd	 |	|
fdd
}t|d|	|
d}|| }| j||	|
fd}|| qt j|dd}t|dS )z>Apply the dynamic abstractor over flattened visual embeddings.r$   dimz(T must be 1. Video is not supported yet.z(t h w) d -> 1 t h w d)thwNr   g      ?r~   )posembold_sizenew_sizenum_prefix_tokensz1 (h w) d -> 1 h w dr   r   )
input_size)last_hidden_state)r9   prodsplittolistzipr	   rD   r   tuplerf   rl   _forwardrO   catr   )rZ   r   r   r   n_token_locsplit_visual_embeds_visual_embeds	_grid_thwTrb   rc   reshaped_visual_embeds_local_pos_embr;   r;   r<   re      s>   



zDynamicCAbstractor.forwardr]   r   c                 C   sn   |\}}t |d||d}| jjr&| jd |}| jd |}| jd |}n| |}t |d}| |}|S )Nz1 h w d -> 1 d h wr   r   r$   r~   z1 d h w -> (h w) d)r	   r>   rJ   r   r   )rZ   r]   r   r   r   r;   r;   r<   r      s   


zDynamicCAbstractor._forward)rA   N)r4   r5   r6   r7   r   rf   rY   rs   rn   r9   r:   objectr   re   r   r   rh   r;   r;   r[   r<   ri   }   s6    
"
,
ri   c                       s   e Zd ZdZdeddf fddZededd fddZ		dd	ej	d
ej	de
dB de
dB deeB f
ddZdefddZ  ZS )CustomQwen2VLVEzThin wrapper around the Qwen2-VL used as a vision encoder.

    This mirrors the original HF-based vision encoder used in Kanana-V, but
    reuses vLLM's optimized `Qwen2VisionTransformer` building blocks.
    r>   rA   Nc                    s2   t  j|t|ddd dd t| dr| `d S d S )Nrms_norm_epsgư> )vision_confignorm_epsquant_configprefixmerger)rX   rY   getattrrk   r   )rZ   r>   r[   r;   r<   rY     s   


zCustomQwen2VLVE.__init__c                 C   s   | |S )z:Drop-in replacement for the HF `_from_config` constructor.r;   )clsr>   r;   r;   r<   _from_config  s   zCustomQwen2VLVE._from_configr-   r   output_hidden_statesreturn_dictc                 C   sj  |sJ d|j | j| jd}| |}t|tr$|}tj|tjd}n
|	 }|
  }| |\}}	t|dddf |dddf  |dddf jdtjd}
ttjdtjd|
g}
t|
j | jd	d
}
|d}| |
}|r{dnd}| jD ]}|r||df }|||
||	|d}q|d}|r||f }|stdd ||fD S t||dS )a2  Run the vision transformer and optionally return intermediate states.

        Unlike the base `Qwen2VisionTransformer`, this wrapper exposes the
        pre-merger patch-level representations and a HF-style `BaseModelOutput`
        so that the existing projector / abstractor code can be reused.
        z#Only return_dict=True is supported.)devicedtype)r   Nr$   r~   r   )axisr   T)non_blockingr;   )
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                 s   s    | ]	}|d ur|V  qd S rW   r;   ).0vr;   r;   r<   	<genexpr>e  s    z*CustomQwen2VLVE.forward.<locals>.<genexpr>)r   hidden_states)tor   r   patch_embed
isinstancelistr/   arrayint32r   cpunumpyrot_pos_embrepeatcumsumconcatenaterF   r9   
from_numpy	unsqueezecompute_attn_mask_seqlenblockssqueezer   r   )rZ   r-   r   r   r   r]   grid_thw_listgrid_thw_npr   r   r   r   encoder_statesblkr   r;   r;   r<   re   "  sT   






	
zCustomQwen2VLVE.forwardc                 C   s   dS )Nrj   r;   rZ   r;   r;   r<   get_num_tokensk  s   zCustomQwen2VLVE.get_num_tokensNN)r4   r5   r6   r7   r   rY   classmethodr   r9   r:   rg   r   r   re   rf   r   rh   r;   r;   r[   r<   r     s&    
Ir   c                   @   s   e Zd ZdeeedB f fddZdefddZddd	d
ededede	de
eef f
ddZdedeeef deeef fddZdS )KananaVProcessingInforA   Nc                 C   s   dd iS )Nimager;   r   r;   r;   r<   get_supported_mm_limitsq  s   z-KananaVProcessingInfo.get_supported_mm_limitsc                 C   s   | j dddd\}}|S )N'  r$   image_widthimage_height
num_frames)_get_vision_info)rZ   max_image_sizerS   r;   r;   r<   !get_image_size_with_most_featurest  s   
z7KananaVProcessingInfo.get_image_size_with_most_featuresr$   T)r   	do_resizer   r   r   r   c                C   s   | j  j}tt|j d}|  }|j}|j}	|j	}
|j
}|r8||||	|
 |j|jd\}}t||d}nt||d}|||  }t|| d}|j|	 }|j|	 }|| | }||
d  }||fS )Nz.smart_resize)heightwidthfactor
min_pixels
max_pixels)r   r   r$   r~   )ctxget_hf_processorimage_processorr!   r.   r5   get_hf_configr   
patch_sizespatial_merge_sizetemporal_patch_sizer   r   r   maxr   r   )rZ   r   r   r   r   r   smart_resize	hf_configr   r   rV   r   resized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokensr;   r;   r<   r   |  s4   


z&KananaVProcessingInfo._get_vision_infoseq_len	mm_countsc                 C   s(   |   \}}| j||ddd }d|iS )Nr$   r   r   )r   r   )rZ   r   r   target_widthtarget_heightr   r;   r;   r<   get_mm_max_tokens_per_item  s   z0KananaVProcessingInfo.get_mm_max_tokens_per_item)r4   r5   r6   r   strrf   r   r   r   rg   r   r   r   r;   r;   r;   r<   r   p  s.    

,

r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )KananaVDummyInputsBuilderr   rA   c                 C   s   | dd}d| S )Nr   r   <image>)get)rZ   r   
num_imagesr;   r;   r<   get_dummy_text  s   z(KananaVDummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s    | dd}d| jdd|diS )Nr   r   r   )r   r   r  )r  _get_dummy_images)rZ   r   r   r  r  r;   r;   r<   get_dummy_mm_data  s
   z+KananaVDummyInputsBuilder.get_dummy_mm_datarW   )
r4   r5   r6   r   r   rf   r  r   r   r  r;   r;   r;   r<   r    s    
r  c                
   @   s   e Zd ZdZedefddZdedeee	f deee	f deee	f de
f
d	d
Zdedeee	f dedee fddZde
deee	f deeef fddZdS )KananaVMultiModalProcessorz6vLLM multimodal processor for Kanana-V (text + image).rA   c                 C   s   | j  jjd S )Nr$   )infor   text_configeos_token_idr   r;   r;   r<   media_token_id  s   z)KananaVMultiModalProcessor.media_token_idpromptmm_data	mm_kwargs
tok_kwargsc                    s  |r| dg s| j |}tt|gdddS | dg }g }t|d tjs1dd |D }| j j	fdd|D }d	d |D }	d
d |D   fdd d D  |	D ]
}
|
|
jd  q[tj|	dd}	| j }|| jgd }|d|}||}t|}t|}t d }|jdd }ttdd |D }t|| jk  }||kr||krg }d}| D ]"}|| jkr||k r|| jgt||   |d7 }q|
| q||}t|d|	t d t d t|d}t|ddS )z7Run the underlying HF processor on text and image data.images)	input_idspt)tensor_typer   c                 S   s   g | ]}t |qS r;   )r
   	fromarrayr   r   r;   r;   r<   
<listcomp>  s    zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<listcomp>c                    s   g | ]} |qS r;   r;   r  )r   r;   r<   r        c                 S      g | ]}|d  qS )r-   r;   r   or;   r;   r<   r    r  c                 S   r  
image_metar;   r  r;   r;   r<   r    r  c                    s    i | ]   fd dD qS )c                    s   g | ]}|  qS r;   r;   )r   dkr;   r<   r    r  zLKananaVMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>.<listcomp>r;   )r   r  r   r<   
<dictcomp>  s     zAKananaVMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>r   r  image_token_thwr$   c                 s   s    | ]}t |V  qd S rW   )rf   )r   r]   r;   r;   r<   r     s    z@KananaVMultiModalProcessor._call_hf_processor.<locals>.<genexpr>r3   )r  r-   r3   r#  pixel_sizes)r  r
  get_tokenizerencoder   dictr   r
   r   r   rO   ra   r9   concatconvert_ids_to_tokensr  replacetensorlenr   r   rf   sumitemextend
new_tensorr   )rZ   r  r  r  r  
prompt_idsimage_inputsr$  processor_outputr-   pixel_value	tokenizermedia_tokenprompt_replacedr  r  r#  per_image_token_countsexpected_totaln_placeholdersexpandedimg_itokcombined_outputsr;   )r  r   r<   _call_hf_processor  sX   	




z-KananaVMultiModalProcessor._call_hf_processormm_itemshf_processor_mm_kwargsout_mm_kwargsc                    s,   dt dtt  f fdd}tdd|dgS )NidxrA   c                    sB    d |  }|d j }t|tjsJ t|  }jg| S )Nr   r#  )datar   r9   r:   rf   r   r.  r  )rC  out_itemr#  
num_tokensrB  rZ   r;   r<   get_replacement  s
   
zGKananaVMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   r  )modalitytargetreplacement)rf   r   r   )rZ   r@  rA  rB  rH  r;   rG  r<   _get_prompt_updates  s   	z.KananaVMultiModalProcessor._get_prompt_updates	hf_inputsc                 C   s8   | dtd}ttd|tdtdd}|S )Nr$  r   r   )r-   r3   r#  )r  r9   emptyr'  r   flat_from_sizesbatched)rZ   rM  rA  r$  mm_fields_configr;   r;   r<   _get_mm_fields_config+  s   
z0KananaVMultiModalProcessor._get_mm_fields_configN)r4   r5   r6   r7   propertyrf   r  r   r   r   r   r?  r   r   r   r   rL  r   rR  r;   r;   r;   r<   r	    s>    



E



r	  )r
  dummy_inputsc                
       s~  e Zd ZededededB fddZddd	ed
ef fddZde	de
dB fddZde
dejfddZdeej deee B dejfddZ	d.dejdedB dejfddZ	d.dejdedB dejfddZ	d.dejdedB dejfddZde	defd d!Z		d/d"ejdB d#ejd$edB d%ejdB fd&d'Zd(ejdejfd)d*Zd+eeeejf  dee fd,d-Z  ZS )0KananaVForConditionalGenerationrI  irA   Nc                 C   s   | drdS td| )Nr   r  zUnsupported modality: )
startswith
ValueError)r   rI  rV  r;   r;   r<   get_placeholder_str@  s   
z3KananaVForConditionalGeneration.get_placeholder_strr   )r   vllm_configr   c                   s   t    |jj}|| _| |d t|j| _	t
|j| j	 d| _W d    n1 s/w   Y  | | t||jt|ddgd| _W d    n1 sRw   Y  | jj| _d S )Nr   )r?   modelLlamaForCausalLM)rZ  r   r   architectures)rX   rY   model_configr   r>   _mark_tower_modelr   r   r   vision_modelri   projector_configr   
abstractor_mark_language_modelr*   r  r+   language_modelmake_empty_intermediate_tensors)rZ   rZ  r   r>   r[   r;   r<   rY   G  s(   


	z(KananaVForConditionalGeneration.__init__rv   c                 K   s   | dd }| dd }|d u rd S |d u rtdt|tjr?|jdkr&n|jdkr2|dd}ntd|j d	|j d
t|}t|tjrV|jdkrU|dd}nt|}t	d||dS )Nr-   r3   z9vision_grid_thw is required when pixel_values is providedr~   r2   r   r$   z:pixel_values should be 2D or batched 3D tensor. Got ndim: z (shape=))r.   r-   r3   )
poprX  r   r9   r:   ndimflattenra   r(  r,   )rZ   rv   r-   r3   r;   r;   r<   _parse_and_validate_image_input`  s>   




z?KananaVForConditionalGeneration._parse_and_validate_image_inputimage_inputc                 C   s   |d }|d }d|i}|  ||}| jj}|d}d}d}	t|D ]1}
||
 d ||
 d | ||
 d | }}}|| | }||	|	|  }||f7 }|	|7 }	q#|S )Nr-   r3   r   r;   r$   r~   )forward_and_project_visionrb  rV   rr   rN   )rZ   rk  r-   r3   image_metasvisual_embedsrV   
batch_sizemulti_modal_embeddingssample_indexrV  r   r   r   rF  visual_embedr;   r;   r<   _process_image_input  s$   




z4KananaVForConditionalGeneration._process_image_inputv_outputlayer_indexc                 C   s8   t |ttfrtj|ddd d |f }|S || }|S )Nr$   r   )r   r   r   r9   stack)rZ   rt  ru  visual_featuresr;   r;   r<   _get_visual_feature_at  s   
z6KananaVForConditionalGeneration._get_visual_feature_atr-   rm  c                 C   s>   |dd|d d}| j di |}| jjj}| |j|}|S )NTr3   )r-   r   r   r   r;   )r`  r>   ra  feature_layer_indexrx  r   )rZ   r-   rm  vision_model_args	v_outputsru  rw  r;   r;   r<   forward_vision  s   
z.KananaVForConditionalGeneration.forward_visionrw  c                 C   s   | j ||d dd }|S )Nr3   )r   r   )rb  )rZ   rw  rm  rn  r;   r;   r<   forward_projector  s   z1KananaVForConditionalGeneration.forward_projectorc                 C   s,   |d usJ | j ||d}| j||d}|S )N)rm  )r|  r}  )rZ   r-   rm  rw  rn  r;   r;   r<   rl    s   z:KananaVForConditionalGeneration.forward_and_project_visionc                 K   s&   | j di |}|d u rg S | |S )Nr;   )rj  rs  )rZ   rv   rk  r;   r;   r<   embed_multimodal  s   
z0KananaVForConditionalGeneration.embed_multimodalr  	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r  r  r  r  )rd  )rZ   r  r  r  r  rv   r   r;   r;   r<   re     s   z'KananaVForConditionalGeneration.forwardr   c                 C   s   | j |S rW   )rd  compute_logits)rZ   r   r;   r;   r<   r    s   z.KananaVForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S rW   )r)   load_weights)rZ   r  loaderr;   r;   r<   r    s   
z,KananaVForConditionalGeneration.load_weightsrW   r   )r4   r5   r6   r   r   rf   rY  r   rY   r   r=   rj  r9   r:   rs  r   rx  r'  r|  r}  rl  r%   r~  r    re   r  r   r   setr  rh   r;   r;   r[   r<   rU  :  sn    
*






,rU  )Ycollections.abcr   r   r   	functoolsr   typingr   r   r   r   r/   regexro   r9   einopsr	   PILr
   timm.layersr   timm.layers.pos_embedr   timm.models.regnetr   r   transformersr   transformers.modeling_outputsr   3transformers.models.qwen2_vl.configuration_qwen2_vlr   vllm.configr   vllm.config.multimodalr   vllm.loggerr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r   vllm.sequencer    vllm.utils.import_utilsr!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   qwen2_vlr(   utilsr)   r*   r+   r4   loggerr,   r=   r8   rf   rE   rI   rQ   rT   ModulerU   ri   r   r   r  r	  register_processorrU  r;   r;   r;   r<   <module>   s|   

 hFq