o
    پi|                     @   s   d Z ddlmZmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ G dd dejZegZdS )zAInference-only Dots-VL model compatible with HuggingFace weights.    )IterableListOptionalTupleN)nn)DotsVLMConfig)get_pp_group)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatchPPProxyTensors)default_weight_loader)DeepseekV2ForCausalLM   )DotsVisionTransformerc                       s   e Zd ZdZ	ddedee ddf fddZded	e	j
fd
dZdeeee	j
f  fddZedd Zdee defddZdee de	j
fddZ	dde	j
de	j
dedee de	j
f
ddZ  ZS )DotsVLMForCausalLMz"DotsVLM model for sglang inferenceNconfigquant_configreturnc                    sL   t    || _|j| _|j| _t | _|j	st
|j|| _t|j| _d S N)super__init__r   
im_span_idimage_token_idvideo_span_idvideo_token_idr   pp_groupencoder_onlyr   language_configlanguage_modelr   vision_configvision_tower)selfr   r   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/dots_vlm.pyr   +   s   
zDotsVLMForCausalLM.__init__nameloaded_weightc           	         s  | j jj}|dkr|S | j jjd|v r\|jddd\}}}|dr,||jd g n|dr6|g ntd|  fd	d
}||||||}}}tj	|||gdd}d|v rt|
|jd | }tj	||gdd}d|v s|d|v r|
| }tj	||gdd}|S )z$pad attn qkv weights for dummy headsr   zattn.qkv_proj   dimz.weightz.biaszUnsupported weight with name=c                    s,   t j| ddf|  gddddS )Nr   r/   r-   r   )torchcat	unflatten	new_zerosflatten)xdummy_shapehead_dimr(   r)   <lambda>L   s    
z>DotsVLMForCausalLM._pad_vit_attn_dummy_heads.<locals>.<lambda>zattn.proj.weightzattn.q_norm.weightzattn.k_norm.weight)r   r#   num_dummy_headsr8   chunkendswithshapeRuntimeErrorr0   r1   r3   )	r%   r*   r+   r:   wqwkwvpad_funcpadded_weightr(   r6   r)   _pad_vit_attn_dummy_heads=   s.   




z,DotsVLMForCausalLM._pad_vit_attn_dummy_headsweightsc                 C   s   t |}g }g }|D ]\}}|dr!|dd}|||f q
|||f q
| jjsct|}t| jdd}| D ]%\}}||vrMt	d| d|| }	t
|	dt}
| ||}|
|	| q=| jjsq|rs| j| d	S d	S d	S )
zBLoad weights for the model, separating vision and language weightszvision_tower.z	attn.qkv.zattn.qkv_proj.F)remove_duplicatezWeight z not found in params_dictweight_loaderN)list
startswithreplaceappendr   language_onlydictnamed_parametersitems
ValueErrorgetattrr   rD   r    r"   load_weights)r%   rE   vision_weightslanguage_weightsr*   r+   vision_namevision_state_dictparams_dictparamrG   r(   r(   r)   rR   [   s*   
zDotsVLMForCausalLM.load_weightsc                 C   s
   t |S r   )r   $get_model_config_for_expert_location)clsr   r(   r(   r)   rY   {   s   
z7DotsVLMForCausalLM.get_model_config_for_expert_location	input_ids	mm_inputsc                 C   s   t  }|||}|S )z$Pad input_ids with multimodal tokens)r
   pad_input_tokens)r%   r[   r\   patternpadded_input_idsr(   r(   r)   pad_input_ids   s   z DotsVLMForCausalLM.pad_input_idsrO   c                 C   s   t jdd |D dd| jj}t jdd |D dd| jj}| dks1J d| | dks@J d| | ||}|jt j	kr_t
| jjd	r_| jjjjj}||}|S )
Nc                 S      g | ]}|j qS r(   )feature.0itemr(   r(   r)   
<listcomp>       z8DotsVLMForCausalLM.get_image_feature.<locals>.<listcomp>r   r-   c                 S   ra   r(   )image_grid_thwrc   r(   r(   r)   rf      rg      zpixel_values.dim()=zimage_grid_thw.dim()=embed_tokens)r0   r1   typer$   dtypeconcattodevicer.   bfloat16hasattrr"   modelrj   weight)r%   rO   pixel_valuesrh   image_embedstarget_dtyper(   r(   r)   get_image_feature   s"   

z$DotsVLMForCausalLM.get_image_feature	positionsforward_batchpp_proxy_tensorsc                 C   s6   | j jrt|||| | jd}|S | j||||d}|S )N)r[   rx   ry   multimodal_modelr"   )r[   rx   ry   rz   )r   is_first_rankr   r"   )r%   r[   rx   ry   rz   hidden_statesr(   r(   r)   forward   s    zDotsVLMForCausalLM.forwardr   )__name__
__module____qualname____doc__r   r   r	   r   strr0   TensorrD   r   r   rR   classmethodrY   r   intr   r`   r   rw   r   r   r~   __classcell__r(   r(   r&   r)   r   (   s8     
r   ) r   typingr   r   r   r   r0   r   sglang.srt.configs.dots_vlmr   sglang.srt.distributedr   *sglang.srt.layers.quantization.base_configr	   sglang.srt.managers.mm_utilsr
   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.deepseek_v2r   dots_vlm_vitr   Moduler   
EntryClassr(   r(   r(   r)   <module>   s     
