o
    
۾iv;                     @   s  d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlmZm Z m!Z!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; eG dd dZ<G dd dej=Z>G dd de3Z?e?Z@G dd de*ZAG dd  d e(eA ZBG d!d" d"e)eA ZCejDeCeAeBd#G d$d% d%ej=eeZEdS )&    N)IterableMappingSequence)	dataclass)	AnnotatedAnyLiteral)nn)BatchFeature)GELUActivation)
VllmConfig)BaseDummyOptions)ReplicatedLinear)SupportsMultiModal
SupportsPP)MoonVitPretrainedModel)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsNestedTensors)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)KimiVLConfigMoonViTConfig)TensorSchemaTensorShape   )AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefix)is_vit_use_data_parallel!run_dp_sharded_mrope_vision_modelc                   @   s&   e Zd ZU dZeed< dZeed< dS )MaxImageTokenMetai   widthheightN)__name__
__module____qualname__r+   int__annotations__r,    r2   r2   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/kimi_vl.pyr*   X   s   
 r*   c                       s@   e Zd Z	d
dedef fddZdejdejfdd	Z  Z	S )KimiVLMultiModalProjector configprefixc                    s   t    t | _|jj|jjd  |jjd  | _tjj	|jjdd| _
t| j| jdt|dd| _t| j|jjdt|dd| _t | _d S )	Nr   r$   gh㈵>)epsTlinear_1)biasr7   linear_2)super__init__r(   use_data_parallelvision_confighidden_sizemerge_kernel_sizetorchr	   	LayerNormpre_normr   r'   r9   text_configr;   r   act)selfr6   r7   	__class__r2   r3   r=   _   s,   


z"KimiVLMultiModalProjector.__init__image_featuresreturnc                 C   s>   |  |d| j}| |\}}| |}| |\}}|S )N)rD   viewr@   r9   rF   r;   )rG   rJ   hidden_states_r2   r2   r3   forward|   s
   
z!KimiVLMultiModalProjector.forwardr5   )
r-   r.   r/   r    strr=   rB   TensorrP   __classcell__r2   r2   rH   r3   r4   ^   s    r4   c                   @   s`   e Zd ZU dZdZed ed< eej	e
ej	 B eddddf ed< eej	eddf ed	< d
S )KimiVLImagePixelInputsz
    Dimensions:
        - nc: Number of channels
        - np: Number of patches
        - ps: Patch size
        - ni: Number of images
    pixel_valuestypenp   psni   image_grid_hwsN)r-   r.   r/   __doc__rW   r   r1   r   rB   rS   listr#   r2   r2   r2   r3   rU      s   
 rU   c                   @   sV   e Zd Zdd ZdeeedB f fddZdededefd	d
Ze	defddZ
dS )KimiVLProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr    rG   r2   r2   r3   rc         z"KimiVLProcessingInfo.get_hf_configrK   Nc                 C   s   dd iS )Nimager2   rd   r2   r2   r3   get_supported_mm_limits   s   z,KimiVLProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s.  |   }|jj}|jj}|jj}|}|}t|ts J d| t|ts,J d| |d us4J d|| ||  |kr]t||| ||   }	t||	 t||	 }
}|
|}}|\}}|| |||   ||  }|| |||   ||  }|| |d |  }|| |d |  }t|| S )Nz#height must be int, current height z!width must be int, current width zkernel_size must be specifiedr   r$   )	get_hf_processorimage_processor
patch_sizerA   in_token_limit
isinstancer0   mathsqrt)rG   rh   ri   hf_processorrl   kernel_sizerm   r,   r+   scalenew_wnew_hkernel_heightkernel_width
pad_height	pad_widthtoken_heighttoken_widthr2   r2   r3   get_num_image_tokens   s2   
z)KimiVLProcessingInfo.get_num_image_tokensc                 C   s
   |   jS ra   )rc   media_placeholder_token_idrd   r2   r2   r3   image_token_id   s   
z#KimiVLProcessingInfo.image_token_id)r-   r.   r/   rc   r   rR   r0   rg   r|   propertyr~   r2   r2   r2   r3   r`      s    
%r`   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )KimiVLDummyInputsBuilder	mm_countsrK   c                 C   s$   | dd}| j }|j}|| S )Nrf   r   )getinforj   image_token)rG   r   
num_images	processorr   r2   r2   r3   get_dummy_text   s   
z'KimiVLDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   s8   | dd}|r| dnd }d| jtjtj||diS )Nrf   r   )r+   r,   r   	overrides)r   _get_dummy_imagesr*   r+   r,   )rG   r   r   r   r   image_overridesr2   r2   r3   get_dummy_mm_data   s   z*KimiVLDummyInputsBuilder.get_dummy_mm_datara   )
r-   r.   r/   r   rR   r0   r   r   r   r   r2   r2   r2   r3   r      s    
r   c                	   @   sX   e Zd Zdedeeef deeef fddZde	deee
f dedee fdd	Zd
S )KimiVLMultiModalProcessor	hf_inputshf_processor_mm_kwargsrK   c                 C   s6   | dtd}|d}ttd|tddS )Nr]   )r   r\   rL   rf   )rV   r]   )r   rB   emptyproddictr   flat_from_sizesbatched)rG   r   r   r]   image_grid_sizesr2   r2   r3   _get_mm_fields_config   s   
z/KimiVLMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s0   j j dtf fdd}td g|dgS )Nitem_idxc                    sN    dttf}t|tr|| }n|| }jj|j|j	d} g| S )Nrf   )rh   ri   )
	get_itemsr   r   rn   get_feature_sizeget_image_sizer   r|   r+   r,   )r   imagesnum_image_tokens
image_sizer~   r   rG   r2   r3   get_replacement  s   


zFKimiVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacementrf   )modalitytargetreplacement)r   r~   r0   r   )rG   r   r   r   r   r2   r   r3   _get_prompt_updates   s   z-KimiVLMultiModalProcessor._get_prompt_updatesN)r-   r.   r/   r
   r   rR   objectr   r   r   r   r   r   r   r   r2   r2   r2   r3   r      s"    



r   )r   dummy_inputsc                       s  e Zd ZdZededededB fddZ	d$d	ed
eddf fddZ	de
dedB fddZe dedejfddZdedejfddZde
dedB fddZ		d%dejdB dejdedB dejdB de
defddZdejdejfdd Zd!eeeejf  fd"d#Z  ZS )&KimiVLForConditionalGenerationTr   irK   Nc                 C   s   | drdS td)Nrf   z?<|media_start|>image<|media_content|><|media_pad|><|media_end|>z Only image modality is supported)
startswith
ValueError)clsr   r   r2   r2   r3   get_placeholder_str%  s   
z2KimiVLForConditionalGeneration.get_placeholder_strr5   vllm_configr7   c                    s  t    |j}|j}|j}|| _|| _t|jtsJ |j	j
dk| _|jj| _| |d t|jt|dd| _t|t|dd| _W d    n1 sNw   Y  | | t||jt|ddgd	| _W d    n1 sqw   Y  | jj| _| jj| _d S )
Ndatarf   vision_tower)r7   multi_modal_projector)r6   r7   language_modelDeepseekV2ForCausalLM)r   	hf_configr7   architectures)r<   r=   model_configr   quant_configr6   rn   r?   r!   multimodal_configmm_encoder_tp_moder>   rE   r@   _mark_tower_modelr   r'   r   r4   r   _mark_language_modelr&   r   make_empty_intermediate_tensorsr}   media_placeholder)rG   r   r7   r   r6   r   rH   r2   r3   r=   ,  s>   





	z'KimiVLForConditionalGeneration.__init__kwargsc                 K   s2   | dd }| dd }|d u rd S td||dS )NrV   r]   )rW   rV   r]   )poprU   )rG   r   rV   r]   r2   r2   r3   _parse_and_validate_image_inputW  s   z>KimiVLForConditionalGeneration._parse_and_validate_image_inputinputsc                 C   s8   |d }|d }| j rt| j|| ddS | ||S )NrV   r]   rope_2d)	rope_type)r>   r)   r   tolist)rG   r   rV   r]   r2   r2   r3   _process_image_pixelsh  s   z4KimiVLForConditionalGeneration._process_image_pixelsimage_inputc                 C   sP   |d dksJ |  |}t|ttfsJ dd |D }| t||S )NrW   rV   c                 S   s   g | ]}|j d  qS )r   )shape).0xr2   r2   r3   
<listcomp>z  s    zGKimiVLForConditionalGeneration._process_image_input.<locals>.<listcomp>)r   rn   r_   tupler   rB   catsplit)rG   r   rJ   lengthsr2   r2   r3   _process_image_inputv  s
   
z3KimiVLForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rd S | |}|S )Nr2   )r   r   )rG   r   r   vision_embeddingsr2   r2   r3   embed_multimodal}  s
   
z/KimiVLForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r   r   r   r   )r   )rG   r   r   r   r   r   rN   r2   r2   r3   rP     s   z&KimiVLForConditionalGeneration.forwardrN   c                 K   s   | j |S ra   )r   compute_logits)rG   rN   r   r2   r2   r3   r     re   z-KimiVLForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S ra   )r%   load_weights)rG   r   loaderr2   r2   r3   r     s   
z+KimiVLForConditionalGeneration.load_weightsrQ   )NN)r-   r.   r/   supports_encoder_tp_dataclassmethodrR   r0   r   r   r=   r   KimiVLImageInputsr   rB   inference_moderU   rS   r   r   r   r   r   rP   r   r   r   r   rT   r2   r2   rH   r3   r     sL    	+

$r   )Fro   collections.abcr   r   r   dataclassesr   typingr   r   r   rB   r	   transformersr
   transformers.activationsr   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   %vllm.model_executor.models.interfacesr   r   "vllm.model_executor.models.moonvitr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   vllm.sequencer   vllm.transformers_utils.configsr    r!   vllm.utils.tensor_schemar"   r#   utilsr%   r&   r'   visionr(   r)   r*   Moduler4   rU   r   r`   r   r   register_processorr   r2   r2   r2   r3   <module>   sH   ,&13