"""
Kimi-K2.5 Model Implementation for vLLM.

Kimi-K2.5 extends Kimi-K2 with vision support.

This module defines:
- KimiK25ProcessingInfo/KimiK25MultiModalProcessor: Processing logic
- KimiK25ForConditionalGeneration: Main model class
"""
# NOTE: Recovered from a compiled-bytecode artifact. Docstrings, names, and
# signatures are original; numeric constants and parts of the method bodies
# are best-effort reconstructions and are marked inline where uncertain.

import copy
from collections.abc import Iterable, Mapping, Sequence
from dataclasses import dataclass
from typing import Annotated, Any, Literal

import torch
from torch import nn
from transformers import BatchFeature
from transformers.processing_utils import ProcessorMixin

from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
                                                   SupportsPP, SupportsQuant)
from vllm.model_executor.models.kimi_k25_vit import (
    KimiK25MultiModalProjector, MoonViT3dPretrainedModel,
    vision_tower_forward)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                    MultiModalKwargsItems, NestedTensors,
                                    VisionChunk, VisionChunkImage,
                                    VisionChunkVideo)
from vllm.multimodal.parse import (MultiModalDataItems,
                                   VisionChunkProcessorItems)
from vllm.multimodal.processing import (BaseDummyInputsBuilder,
                                        BaseMultiModalProcessor,
                                        BaseProcessingInfo,
                                        InputProcessingContext,
                                        PromptReplacement, PromptUpdate)
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiK25Config
from vllm.transformers_utils.processor import cached_get_image_processor
from vllm.utils.tensor_schema import TensorSchema, TensorShape

from .utils import (AutoWeightsLoader, WeightsMapper,
                    init_vllm_registered_model, maybe_prefix)

logger = init_logger(__name__)


@dataclass
class MaxImageTokenMeta:
    # Worst-case image size used when building dummy inputs for profiling.
    # The original numeric defaults could not be recovered from the
    # artifact; 1024 x 1024 is a stand-in.
    width: int = 1024
    height: int = 1024


class KimiK25MediaPixelInputs(TensorSchema):
    """
    Media input schema for K2-VL model.

    Dimensions:
        - np: Number of patches (flattened from all media items)
        - ps: Patch size
        - nm: Number of media items
    """

    type: Literal["pixel_values"]
    pixel_values: Annotated[torch.Tensor | list[torch.Tensor],
                            TensorShape("np", "ps")]
    grid_thws: Annotated[torch.Tensor, TensorShape("nm", 3)]


class MoonshotKimiVAutoProcessor(ProcessorMixin):
    attributes = ["tokenizer"]
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        media_processor=None,
        tokenizer=None,
        media_token_id: int | None = None,
    ) -> None:
        super().__init__(tokenizer)
        self.media_processor = media_processor
        self.media_token_id = media_token_id
        assert self.media_token_id is not None

    def __call__(
        self,
        vision_chunks: list[VisionChunk] | None = None,
        text: list[int] | str | None = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Args:
            vision_chunks: List of VisionChunk items to be processed.
                For image: VisionChunkImage with type='image',
                image=PIL.Image.
                For video_chunk: VisionChunkVideo with type='video_chunk',
                video_chunk=list[PIL.Image].
            text: The token ids to be fed to a model (required).

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- list of token ids to be fed to a model.
            - **pixel_values** -- Pixel values to be fed to a model.
              Returned when `vision_chunks` is not `None`.
            - **grid_thws** -- list of 3D media grids for the LLM. Returned
              when `vision_chunks` is not `None`.
        """
        mm_inputs = {}
        input_ids = (self.tokenizer.encode(text)
                     if isinstance(text, str) else text)
        if vision_chunks is not None:
            assert isinstance(vision_chunks, list)
            mm_inputs = self.media_processor.preprocess(vision_chunks)
            num_tokens_per_chunk = [
                self.media_processor.media_tokens_calculator(chunk)
                for chunk in vision_chunks
            ]
            # Widen each media placeholder to one pad token per token that
            # the corresponding chunk will occupy in the LLM.
            new_input_ids = []
            for token in input_ids:
                if token == self.media_token_id:
                    new_input_ids.extend(
                        [self.media_token_id] * num_tokens_per_chunk.pop(0))
                else:
                    new_input_ids.append(token)
            input_ids = new_input_ids
        return BatchFeature(
            data={"input_ids": torch.tensor([input_ids]), **mm_inputs})


class KimiK25ProcessingInfo(BaseProcessingInfo):
    """Processing information for Kimi-K2.5 model.

    Provides configuration and utilities for processing both
    images and video-chunks.
    """

    def __init__(self, ctx: InputProcessingContext) -> None:
        super().__init__(ctx)
        self.hf_config = self.get_hf_config()
        self.media_token_id = self.hf_config.media_placeholder_token_id
        self.media_processor = cached_get_image_processor(
            self.ctx.model_config.model, trust_remote_code=True)
        self.hf_processor = MoonshotKimiVAutoProcessor(
            media_processor=self.media_processor,
            tokenizer=self.get_tokenizer(),
            media_token_id=self.media_token_id,
        )
        self.media_tokens_calculator = (
            self.media_processor.media_tokens_calculator)

    def get_hf_processor(self) -> MoonshotKimiVAutoProcessor:
        return self.hf_processor

    def get_hf_config(self) -> KimiK25Config:
        return self.ctx.get_hf_config(KimiK25Config)

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"vision_chunk": None}


class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
    """Builds dummy inputs for Kimi-K2.5 model profiling."""

    def __init__(self, info: KimiK25ProcessingInfo) -> None:
        super().__init__(info)
        self.media_token_id = info.media_token_id
        self.frame_per_chunk = info.media_processor.num_frames_per_chunk

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_media = mm_counts.get("vision_chunk", 0)
        return "<|media_pad|>" * num_media

    def get_dummy_mm_items(self) -> list[VisionChunk]:
        # Build one maximal video chunk and one maximal image, then keep
        # whichever occupies more media tokens, so profiling reserves the
        # worst case.
        dummy_frames = self._get_dummy_images(
            height=MaxImageTokenMeta.height,
            width=MaxImageTokenMeta.width,
            num_images=self.frame_per_chunk)
        video_chunk_dummy_item = VisionChunkVideo(type="video_chunk",
                                                  video_chunk=dummy_frames)
        video_chunk_num_tokens = self.info.media_tokens_calculator(
            video_chunk_dummy_item)
        image_dummy_item = VisionChunkImage(
            type="image",
            image=self._get_dummy_images(height=MaxImageTokenMeta.height,
                                         width=MaxImageTokenMeta.width,
                                         num_images=1)[0])
        image_num_tokens = self.info.media_tokens_calculator(image_dummy_item)
        if video_chunk_num_tokens >= image_num_tokens:
            return [video_chunk_dummy_item]
        return [image_dummy_item]

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        dummy_items = self.get_dummy_mm_items()
        return {"vision_chunk": dummy_items}


class KimiK25MultiModalProcessor(
        BaseMultiModalProcessor[KimiK25ProcessingInfo]):
    """Multi-modal processor for Kimi-K2.5.

    Handles both image and video-chunk modalities.
    """

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, Any],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Indicates how to slice media input into multiple items.

        pixel_values: [N, 3, patch_size, patch_size], all patches collected
            from B media items.
        grid_thws: [B, 3], each row [N_t, N_h, N_w] gives the grid size in
            the time/height/width direction for the corresponding item.

        Multiplying out [N_t, N_h, N_w] gives the number of patches for each
        media item, so slicing pixel_values[start:start + N_t*N_h*N_w]
        yields the patches of one item.
        """
        grid_thws = hf_inputs.get("grid_thws", torch.empty((0, 3)))
        grid_sizes = grid_thws.prod(-1)
        return dict(
            pixel_values=MultiModalFieldConfig.flat_from_sizes(
                "vision_chunk", grid_sizes),
            grid_thws=MultiModalFieldConfig.batched("vision_chunk"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        media_token_id = hf_config.media_placeholder_token_id

        def get_replacement(item_idx: int):
            media = mm_items.get_items(
                "vision_chunk", VisionChunkProcessorItems).get(item_idx)
            num_media_token = self.info.media_tokens_calculator(media)
            return [media_token_id] * num_media_token

        return [
            PromptReplacement(
                modality="vision_chunk",
                target=[media_token_id],
                replacement=get_replacement,
            )
        ]

    def split_video_chunks(self, video):
        return self.info.media_processor.split_video_chunks(video)


@MULTIMODAL_REGISTRY.register_processor(
    KimiK25MultiModalProcessor,
    info=KimiK25ProcessingInfo,
    dummy_inputs=KimiK25DummyInputsBuilder)
class KimiK25ForConditionalGeneration(nn.Module, SupportsMultiModal,
                                      SupportsPP, SupportsQuant):
    """Kimi-K2.5 model for conditional generation.

    Supports both image and video-chunk modalities.
    Video-chunks are temporal segments (typically 4 frames) that are
    processed with temporal pooling.
    """

    supports_encoder_tp_data = True

    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
        "language_model.layers.": "language_model.model.layers.",
        "mm_projector.proj.0": "mm_projector.linear_1",
        "mm_projector.proj.2": "mm_projector.linear_2",
    })

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality == "image":
            return ("<|media_begin|>image<|media_content|>"
                    "<|media_pad|><|media_end|>")
        if modality == "video":
            return "<|kimi_k25_video_placeholder|>"
        raise ValueError(f"Unsupported modality: {modality}")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__()
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config
        self.config = hf_config
        quant_config = vllm_config.quant_config
        self.use_data_parallel = (
            model_config.multimodal_config.mm_encoder_tp_mode == "data")
        self.hidden_size = hf_config.text_config.hidden_size
        # Device selection reconstructed; the vision modules are moved onto
        # the current platform's device with the model dtype.
        self.device = torch.device(current_platform.device_type)

        self.vision_tower = MoonViT3dPretrainedModel(
            hf_config.vision_config,
            prefix=maybe_prefix(prefix, "vision_tower"))
        self.vision_tower = self.vision_tower.to(device=self.device,
                                                 dtype=model_config.dtype)
        self.mm_projector = KimiK25MultiModalProjector(
            config=hf_config,
            use_data_parallel=self.use_data_parallel,
            prefix=maybe_prefix(prefix, "mm_projector"))
        self.mm_projector = self.mm_projector.to(device=self.device,
                                                 dtype=model_config.dtype)

        self.quant_config = quant_config
        # The language model is initialized against the text sub-config of
        # the composite HF config.
        sub_vllm_config = copy.deepcopy(vllm_config)
        sub_vllm_config.model_config.hf_config = hf_config.text_config
        self.language_model = init_vllm_registered_model(
            vllm_config=sub_vllm_config,
            hf_config=hf_config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["DeepseekV2ForCausalLM"],
        )
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)
        self.media_placeholder = hf_config.media_placeholder_token_id

    def _parse_and_validate_media_input(
            self, **kwargs: object) -> KimiK25MediaPixelInputs | None:
        pixel_values = kwargs.pop("pixel_values", None)
        grid_thws = kwargs.pop("grid_thws", None)
        if pixel_values is None:
            return None
        if isinstance(pixel_values, list):
            pixel_values = torch.cat(pixel_values, dim=0)
        # Collapse a leading batch dimension, if present, so pixel_values is
        # a flat stack of patches. (The exact rank checks are reconstructed;
        # the artifact tests two ndim values.)
        if len(pixel_values.shape) == 3 or len(pixel_values.shape) == 5:
            pixel_values = pixel_values.reshape(
                pixel_values.shape[0] * pixel_values.shape[1],
                *pixel_values.shape[2:])
        target_dtype = next(self.vision_tower.parameters()).dtype
        pixel_values = pixel_values.to(target_dtype)
        assert isinstance(grid_thws, torch.Tensor), (
            f"expect grid_thws to be a tensor, get {type(grid_thws)}")
        assert grid_thws.ndim == 2 and grid_thws.size(1) == 3, (
            f"unexpected shape for grid_thws: {grid_thws.shape}")
        return KimiK25MediaPixelInputs(
            type="pixel_values",
            pixel_values=pixel_values,
            grid_thws=grid_thws,
        )

    def _process_media_input(
            self,
            media_input: KimiK25MediaPixelInputs) -> list[torch.Tensor]:
        # Keyword names for the two trailing arguments are reconstructed.
        media_features = vision_tower_forward(
            self.vision_tower,
            media_input["pixel_values"],
            media_input["grid_thws"],
            mm_projector=self.mm_projector,
            use_data_parallel=self.use_data_parallel,
        )
        return media_features

    def embed_multimodal(self, **kwargs) -> NestedTensors | None:
        media_input = self._parse_and_validate_media_input(**kwargs)
        if media_input is None:
            return None
        vision_embeddings = self._process_media_input(media_input)
        return vision_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        if inputs_embeds is not None:
            input_ids = None
        hidden_states = self.language_model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,
                       **kwargs) -> torch.Tensor:
        logits = self.language_model.compute_logits(hidden_states)
        return logits

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
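
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the recovered module): how the media
# placeholder expansion in MoonshotKimiVAutoProcessor.__call__ behaves.
# All token ids below are made up for the example.
# ---------------------------------------------------------------------------
def _demo_media_token_expansion() -> None:
    media_token_id = 99  # hypothetical placeholder id
    num_tokens_per_chunk = [3, 2]  # tokens occupied by two media chunks
    input_ids = [1, 99, 5, 99, 7]  # prompt with one placeholder per chunk

    expanded: list[int] = []
    per_chunk = list(num_tokens_per_chunk)
    for token in input_ids:
        if token == media_token_id:
            # Each placeholder is widened to one pad token per LLM token
            # that the corresponding chunk produces, in order.
            expanded.extend([media_token_id] * per_chunk.pop(0))
        else:
            expanded.append(token)

    assert expanded == [1, 99, 99, 99, 5, 99, 99, 7]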
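
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the recovered module): the slicing rule
# described in _get_mm_fields_config. Each item's patch count is the product
# of its [N_t, N_h, N_w] grid, so running offsets over grid_thws.prod(-1)
# give the bounds that flat_from_sizes applies to the flat patch stack.
# The tensor sizes below are made up for the example.
# ---------------------------------------------------------------------------
def _demo_grid_thws_slicing() -> None:
    grid_thws = torch.tensor([[1, 2, 2], [2, 2, 2]])  # two media items
    grid_sizes = grid_thws.prod(-1)                   # tensor([4, 8])
    pixel_values = torch.randn(int(grid_sizes.sum()), 16)  # [12, ps]

    start = 0
    per_item = []
    for size in grid_sizes.tolist():
        per_item.append(pixel_values[start:start + size])
        start += size

    assert [t.shape[0] for t in per_item] == [4, 8]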
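
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the recovered module): the kind of prefix
# rewrite hf_to_vllm_mapper performs when checkpoint weights are loaded.
# The remap helper below is a simplified stand-in for WeightsMapper.
# ---------------------------------------------------------------------------
def _demo_weight_prefix_remap() -> None:
    orig_to_new_prefix = {
        "language_model.layers.": "language_model.model.layers.",
        "mm_projector.proj.0": "mm_projector.linear_1",
        "mm_projector.proj.2": "mm_projector.linear_2",
    }

    def remap(name: str) -> str:
        for old, new in orig_to_new_prefix.items():
            if name.startswith(old):
                return new + name[len(old):]
        return name

    assert (remap("language_model.layers.0.mlp.gate.weight")
            == "language_model.model.layers.0.mlp.gate.weight")
    assert (remap("mm_projector.proj.0.weight")
            == "mm_projector.linear_1.weight")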