o
    
۾i                     @   s@   d Z ddlmZ ddlmZ G dd deZG dd deZdS )	z
Kimi-K2.5 Model Configuration.

This configuration supports video-chunk as an internal modality type.
A video-chunk is the smallest independently processable unit of video.
    )DeepseekV3Config)PretrainedConfigc                !       s   e Zd ZdZ											
						d#dedededededededededeeef dededededB ded ef  fd!d"Z  Z	S )$KimiK25VisionConfigkimi_k25_vision   @      divided_fixed             r   spatial_temporal	sd2_tpoolpatchmergerNgeluh㈵>
patch_sizeinit_pos_emb_heightinit_pos_emb_widthinit_pos_emb_timepos_emb_typenum_attention_headsnum_hidden_layershidden_sizeintermediate_sizemerge_kernel_sizevideo_attn_type
merge_typemm_projector_typemm_hidden_sizeprojector_hidden_actprojector_ln_epsc                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|| _|| _|d ur8|| _n|| _|| _|| _d S )N )super__init__r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   )selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   kwargs	__class__r%   \/home/ubuntu/.local/lib/python3.10/site-packages/vllm/transformers_utils/configs/kimi_k25.pyr'      s&   
zKimiK25VisionConfig.__init__)r   r   r   r   r	   r
   r   r   r   r   r   r   r   Nr   r   )
__name__
__module____qualname__
model_typeintstrtuplefloatr'   __classcell__r%   r%   r*   r,   r      sf    	

r   c                       s   e Zd ZdZdZ							dd	eeB dB d
eeB dB dededede	de
f fddZedefddZedefddZ  ZS )KimiK25Configa*  Kimi-K2.5 model configuration.

    Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks.
    A video-chunk consists of multiple consecutive frames
    that are processed together with temporal pooling.

    Args:
        vision_config: Configuration for the vision tower and projector.
        text_config: Configuration for the text model (DeepseekV3).
        ignore_index: The ignore index for the loss function.
        media_placeholder_token_id: The token ID for media placeholders.
        pad_token_id: The token ID for padding.
    kimi_k25N r   F<|kimi_k25_video_placeholder|>vision_configtext_configignore_indexmedia_placeholder_token_idpad_token_iduse_unified_vision_chunkvideo_placeholderc           	         s   |d u rt  }nt|trt di |}|| _|d u rt }nt|tr+tdi |}|| _| jj| jjkr<| jj| j_|| _|| _	|| _
|| _t| jdd d urV| jj| _t jdd|i| d S )Nquantization_configr?   r%   )r   
isinstancedictr;   r   r<   r"   r   r=   r>   r@   rA   getattrrB   r&   r'   )	r(   r;   r<   r=   r>   r?   r@   rA   r)   r*   r%   r,   r'   P   s&   


zKimiK25Config.__init__returnc                 C      | j jS )z3Get hidden size from text config for compatibility.)r<   r   r(   r%   r%   r,   r   y      zKimiK25Config.hidden_sizec                 C   rG   )z2Get vocab size from text config for compatibility.)r<   
vocab_sizerH   r%   r%   r,   rJ   ~   rI   zKimiK25Config.vocab_size)NNr8   r9   r   Fr:   )r-   r.   r/   __doc__r0   rD   r   r   r1   boolr2   r'   propertyr   rJ   r5   r%   r%   r*   r,   r6   ?   s:    

)r6   N)rK   transformersr    transformers.configuration_utilsr   r   r6   r%   r%   r%   r,   <module>   s
   1