o
    ÔÙ¾i  ã                   @   s@   d Z ddlmZ ddlmZ G dd„ deƒZG dd„ deƒZdS )	z
Kimi K25 Model Configuration.
é    )ÚDeepseekV3Config)ÚPretrainedConfigc                #       sš   e Zd ZdZdZ										
								d&dedededededededededeeef dededed edB d!ed"ed#ef"‡ fd$d%„Z	‡  Z
S )'ÚKimiK25VisionConfiga›  Vision configuration for K2-VL (vision tower + mm projector).

    Args:
        Vision Tower Parameters:
            patch_size: Patch size for vision tower.
            init_pos_emb_height: Initial position embedding height.
            init_pos_emb_width: Initial position embedding width.
            init_pos_emb_time: Initial position embedding time dimension.
            pos_emb_type: Type of position embedding.
            num_attention_heads: Number of attention heads in vision tower.
            num_hidden_layers: Number of hidden layers in vision tower.
            hidden_size: Hidden size of vision tower.
            intermediate_size: Intermediate size in vision tower FFN.
            merge_kernel_size: Kernel size for spatial patch merging.
            video_attn_type: Type of video attention.
            merge_type: Type of merge operation.

        MM Projector Parameters:
            mm_projector_type: Type of multimodal projector.
            mm_hidden_size: Hidden size for projector (defaults to hidden_size).
            projector_hidden_act: Activation function for projector.
            projector_ln_eps: Layer norm epsilon for projector.
    Úkimi_k25é   é@   é   Údivided_fixedé   é   é€  éÐ  ©é   r   Úspatial_temporalÚ	sd2_tpoolÚpatchmergerNÚgeluçñhãˆµøä>é   Ú
patch_sizeÚinit_pos_emb_heightÚinit_pos_emb_widthÚinit_pos_emb_timeÚpos_emb_typeÚnum_attention_headsÚnum_hidden_layersÚhidden_sizeÚintermediate_sizeÚmerge_kernel_sizeÚvideo_attn_typeÚ
merge_typeÚmm_projector_typeÚmm_hidden_sizeÚprojector_hidden_actÚprojector_ln_epsÚtext_hidden_sizec                    sŒ   t ƒ jdi |¤Ž || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|| _|| _|d ur8|| _n|| _|| _|| _|| _d S )N© )ÚsuperÚ__init__r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   )Úselfr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   Úkwargs©Ú	__class__r'   úO/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/kimi_k25.pyr)   $   s(   
zKimiK25VisionConfig.__init__)r   r   r   r   r	   r
   r   r   r   r   r   r   r   Nr   r   r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typeÚintÚstrÚtupleÚfloatr)   Ú__classcell__r'   r'   r,   r.   r   	   sn    ìýüûúùø	÷
öõ
ôóòðïîíìr   c                       s†   e Zd ZdZdZ							dd	eeB dB d
eeB dB dededede	de
f‡ fdd„Zedefdd„ƒZedefdd„ƒZ‡  ZS )ÚKimiK25Configap  K2-VL model configuration.

    K2-VL extends Kimi-VL with video support using video-chunks.
    A video-chunk consists of multiple consecutive frames (default: 4)
    that are processed together with temporal pooling.

    Args:
        text_config: Configuration for the text model (DeepseekV3).

        Vision Tower Parameters:
            patch_size: Patch size for vision tower.
            init_pos_emb_height: Initial position embedding height.
            init_pos_emb_width: Initial position embedding width.
            init_pos_emb_time: Initial position embedding time dimension.
            pos_emb_type: Type of position embedding.
            vt_num_attention_heads: Number of attention heads in vision tower.
            vt_num_hidden_layers: Number of hidden layers in vision tower.
            vt_hidden_size: Hidden size of vision tower.
            vt_intermediate_size: Intermediate size in vision tower FFN.
            merge_kernel_size: Kernel size for spatial patch merging.
            video_attn_type: Type of video attention.
            merge_type: Type of merge operation.

        Video-Chunk Parameters:
            temporal_merge_kernel_size: Number of frames per video chunk.
                Default is 4, meaning 4 frames are merged into 1 chunk.
            sample_fps: Video sampling frame rate.
            timestamp_mode: Format for chunk timestamps.

        MM Projector Parameters:
            mm_projector_type: Type of multimodal projector.
            mm_hidden_size: Hidden size from vision tower.
            projector_hidden_act: Activation function for projector.
            projector_ln_eps: Layer norm epsilon for projector.

        Other Parameters:
            ignore_index: The ignore index for the loss function.
            media_placeholder_token_id: The token ID for media placeholders.
            pad_token_id: The token ID for padding.
    r   Néœÿÿÿé r   Fú<|kimi_k25_video_placeholder|>Útext_configÚvision_configÚignore_indexÚmedia_placeholder_token_idÚpad_token_idÚuse_unified_vision_chunkÚvideo_placeholderc           	         sª   |d u rt ƒ }nt|tƒrt di |¤Ž}|d u rtƒ }nt|tƒr(tdi |¤Ž}|| _|| _|| _|| _|| _|| _	t
| jdd ƒd urH| jj| _tƒ jdd|i|¤Ž d S )NÚquantization_configrA   r'   )r   Ú
isinstanceÚdictr   r>   r=   r?   r@   rB   rC   ÚgetattrrD   r(   r)   )	r*   r=   r>   r?   r@   rA   rB   rC   r+   r,   r'   r.   r)   €   s"   


zKimiK25Config.__init__Úreturnc                 C   ó   | j jS )z3Get hidden size from text config for compatibility.)r=   r   ©r*   r'   r'   r.   r   £   ó   zKimiK25Config.hidden_sizec                 C   rI   )z2Get vocab size from text config for compatibility.)r=   Ú
vocab_sizerJ   r'   r'   r.   rL   ¨   rK   zKimiK25Config.vocab_size)NNr:   r;   r   Fr<   )r/   r0   r1   r2   r3   rF   r   r   r4   Úboolr5   r)   Úpropertyr   rL   r8   r'   r'   r,   r.   r9   T   s:    )÷
þ
ýûúùø	÷#r9   N)r2   Útransformersr   Ú transformers.configuration_utilsr   r   r9   r'   r'   r'   r.   Ú<module>   s
    K