o
    پizv                     @   s6  d dl mZmZmZmZmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD eB ZEG dd dejFZGG dd  d ejFZHG d!d" d"ejFZIG d#d$ d$ejFZJeIejKd%ZLG d&d' d'ejFZMG d(d) d)ejFZNG d*d+ d+eZOG d,d- d-ejFZPePZQdS ).    )IterableListOptionalTupleUnionN)nn)PretrainedConfigPreTrainedModel)BaseModelOutputBaseModelOutputWithPooling)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)envs)
get_act_fn)vision_utils)SingletonCacheVisionAttention)ColumnParallelLinearRowParallelLinear)FusedMoE)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)DropPath)GptOssForCausalLM)InternLM2ForCausalLM)Qwen2ForCausalLM)Qwen3ForCausalLM)Qwen3MoeForCausalLM)InternViTCudaGraphRunner)run_dp_sharded_vision_model)get_global_server_args)is_cuda)loggerc                	       sd   e Zd Z			ddededeejj f fddZ		ddej
d	ej
d
eej
 dej
fddZ  ZS )InternAttentionNFquant_configuse_data_parallel
aux_streamc                    s   t    || _|j| _|j| _| j| j | _| jd | _t	| j| j| jd|t
|ddt
|ddp6t
|ddt
|dd	t
|d
dpFt
|ddd||d| _t|j| _d S )Ng      Tdropout        qkv_biasFattention_biasnum_dummy_headsr   qk_normalizationuse_qk_norm)	embed_dim	num_headsprojection_sizeuse_qkv_parallelr*   r-   r/   r1   r2   flatten_batchr+   r,   )super__init__confighidden_sizer4   num_attention_headsr5   head_dimscaler   getattrattnr   Dropoutr-   	proj_drop)selfr;   r*   r+   r,   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/internvl.pyr:   4   s.   




zInternAttention.__init__hidden_states
cu_seqlens	output_wsreturnc                 C   s   | j |||d}| |}|S )NrJ   rK   )rA   rC   )rD   rI   rJ   rK   outoutsrG   rG   rH   forwardU   s   
zInternAttention.forwardNFNN)__name__
__module____qualname__r   boolr   torchcudaStreamr:   TensorrP   __classcell__rG   rG   rE   rH   r)   3   s*    
%r)   c                       s@   e Zd Zdef fddZdd Zdejdejfdd	Z	  Z
S )
InternVisionEmbeddingsr;   c                    s   t    || _|j| _t|jtr|jn|jd | _t|jtr$|jn|jd | _t	
tdd| j| _t	jd| j| j| jd| _| j| j d | _| jd | _t	
td| j| j| _d S )Nr         )in_channelsout_channelskernel_sizestride   )r9   r:   r;   r<   r4   
isinstance
image_sizeint
patch_sizer   	ParameterrW   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embedding)rD   r;   rE   rG   rH   r:   a   s2   



zInternVisionEmbeddings.__init__c                 C   sn   |j }| d| j| j | j| j ddddd}tj|||fddddd|| ddd|}|S )	Nr]   r   r^   rc   bicubicFsizemodealign_corners)	dtypefloatreshapere   rg   permuteFinterpolateto)rD   	pos_embedHWtarget_dtyperG   rG   rH   _get_pos_embed   s    


z%InternVisionEmbeddings._get_pos_embedpixel_valuesrL   c              	   C   s   | j jj}|  |}|j\}}}}|ddd}| j|dd|}t	j
||gdd}	t	j
| jd d d dd d f | | jd d dd d d f ||gdd}
|	|
| }	|	S )Nrc   r]   rp   dim)rl   weightrv   shapeflatten	transposerj   expandr|   rW   catro   r   )rD   r   r   patch_embeds
batch_size_heightwidthclass_embeds
embeddingsro   rG   rG   rH   rP      s    
$zInternVisionEmbeddings.forward)rS   rT   rU   r   r:   r   rW   FloatTensorrZ   rP   r[   rG   rG   rE   rH   r\   `   s    !r\   c                       s&   e Zd Zd fdd	Zdd Z  ZS )InternRMSNormư>c                    s&   t    tt|| _|| _d S rR   )r9   r:   r   rh   rW   onesr   variance_epsilon)rD   r<   epsrE   rG   rH   r:      s   

zInternRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nrc   rp   T)keepdim)	rv   r|   rW   float32powmeanrsqrtr   r   )rD   rI   input_dtypevariancerG   rG   rH   rP      s
   zInternRMSNorm.forward)r   )rS   rT   rU   r:   rP   r[   rG   rG   rE   rH   r      s    r   c                       s@   e Zd Z	d
dedef fddZdejdejfdd	Z  Z	S )	InternMLPFr;   r+   c                    s|   t    |r	dnt | _|rdnt | _|| _t|j| _	t
|j|jdd | j| jd| _t|j|jdd | j| jd| _d S )Nr]   r   T)biasr*   tp_sizetp_rank)r9   r:   r   r   r   r   r;   r   
hidden_actactr   r<   intermediate_sizefc1r   fc2)rD   r;   r+   rE   rG   rH   r:      s,   
zInternMLP.__init__rI   rL   c                 C   s*   |  |\}}| |}| |\}}|S rR   )r   r   r   )rD   rI   r   rG   rG   rH   rP      s   
zInternMLP.forward)F)
rS   rT   rU   r   rV   r:   rW   rZ   rP   r[   rG   rG   rE   rH   r      s    r   )rms_norm
layer_normc                       s   e Zd Z			ddededededeej	j
 f
 fdd	Z	dd
ejdejdeej deejeej eeej  f fddZ  ZS )InternVisionEncoderLayerNFr;   drop_path_rater*   r+   r,   c                    s   t    |j| _|j| _|j| _t||||d| _t||| _	t
| j | j|jd| _t
| j | j|jd| _t|jt| j | _t|jt| j | _|dkrZt|nt | _|dkrjt|| _d S t | _d S )N)r;   r*   r+   r,   )r   r.   )r9   r:   r<   r4   r   	norm_typer)   rA   r   mlpNORM2FNlayer_norm_epsnorm1norm2r   rh   initializer_factorrW   r   ls1ls2r   Identity
drop_path1
drop_path2)rD   r;   r   r*   r+   r,   rE   rG   rH   r:      s*   
z!InternVisionEncoderLayer.__init__rI   rJ   rK   rL   c                 C   sZ   ||  | j| ||j||d| j  }|| | | ||j| j	  }|S )z
        Args:
            hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
        rM   )
r   rA   r   r|   rv   r   r   r   r   r   )rD   rI   rJ   rK   rG   rG   rH   rP     s   	z InternVisionEncoderLayer.forwardrQ   rR   )rS   rT   rU   r   rw   r   rV   r   rW   rX   rY   r:   rZ   r   r   rP   r[   rG   rG   rE   rH   r      s<    
#r   c                	       sd   e Zd ZdZ		ddedee def fddZ		dd	ee d
ee de	e
ef fddZ  ZS )InternVisionEncodera  
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InternEncoderLayer`].

    Args:
        config (`InternConfig`):
            The corresponding vision configuration for the `InternEncoder`.
    NFr;   r*   r+   c                    s   t    | _dd tdjjD totj	
 | _| jr#d ntr*tj nd  t fddtjD | _d | _| jrMt| | _d S d S )Nc                 S   s   g | ]}|  qS rG   )item).0xrG   rG   rH   
<listcomp>6  s    z0InternVisionEncoder.__init__.<locals>.<listcomp>r   c                    s    g | ]}t |  qS rG   )r   )r   idxr,   r;   dprr*   r+   rG   rH   r   @  s    )r9   r:   r;   rW   linspacer   num_hidden_layers_is_cudar   SGLANG_VIT_ENABLE_CUDA_GRAPHget	enable_cgrX   rY   r   
ModuleListrangelayerscuda_graph_runnerr$   rD   r;   r*   r+   rE   r   rH   r:   -  s"   
	zInternVisionEncoder.__init__output_hidden_statesreturn_dictrL   c           
      C   s   | j r|s|j|jd }| j|}|s|fS t|ddS |dur%|n| jj}|dur/|n| jj	}|r7dnd}|}t
 }t| jD ]\}}|rN||f }|||d}	|	}qC|r^||f }|sktdd ||fD S t||dS )a-  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        )deviceN)last_hidden_staterI   rG   )rJ   c                 s   s    | ]	}|d ur|V  qd S rR   rG   )r   vrG   rG   rH   	<genexpr>|  s    z.InternVisionEncoder.forward.<locals>.<genexpr>)r   r|   r   
contiguousr   runr
   r;   r   use_return_dictr   	enumerater   tuple)
rD   inputs_embedsr   r   rI   encoder_statesrJ   r   encoder_layerlayer_outputsrG   rG   rH   rP   L  s6   


zInternVisionEncoder.forwardNF)NN)rS   rT   rU   __doc__r   r   r   rV   r:   r   r   r
   rP   r[   rG   rG   rE   rH   r   #  s(    "
r   c                       s   e Zd ZdZdZeZdgZ		ddedee	 de
f fd	d
Zdd Zdd Z				ddeej dee
 dee
 deej deeef f
ddZ  ZS )InternVisionModelr   Tr   NFr;   r*   r+   c                    s4   t  | || _|| _t|| _t|||| _d S rR   )r9   r:   r;   r+   r\   r   r   encoderr   rE   rG   rH   r:     s   zInternVisionModel.__init__c           	      C   s   | j j}|j\}}}|d d d dd d f }|d d dd d d f d|| || ddddd}tj| || ddd}||j	d|dddd}t
j||gdd	}t|| j _|| j _td
|| d S )Nr]   rp   r   r^   rc   rq   Frr   r   z)Resized position embeddings from {} to {})r   ro   r   rx   ry   rz   r{   rw   r|   rv   rW   r   r   rh   re   r(   infoformat)	rD   old_sizenew_sizerg   pos_embr   rn   r4   cls_embrG   rG   rH   resize_pos_embeddings  s(    
z'InternVisionModel.resize_pos_embeddingsc                 C   s   | j S rR   )r   )rD   rG   rG   rH   get_input_embeddings  s   z&InternVisionModel.get_input_embeddingsr   r   pixel_embedsrL   c           	      C   s  |j | j| jd}|d ur|n| jj}|d ur|n| jj}|d u r)|d u r)td|d ur0|}nt|jdkr=| 	|}ntd|j | j
rQt|| j}|}n| j|||d}|j}|d d dd d f }|ss||f|dd   S | j
r~t||d d dS t|||j|jdS )	N)r   rv   z0You have to specify pixel_values or pixel_embeds   zwrong pixel_values size: )r   r   r   r   r]   )r   pooler_outputrI   
attentions)r|   r   rv   r;   r   r   
ValueErrorlenr   r   r+   r%   r   r   r   rI   r   )	rD   r   r   r   r   rI   encoder_outputsr   pooled_outputrG   rG   rH   rP     sN   zInternVisionModel.forwardr   )NNNN)rS   rT   rU   main_input_name_supports_flash_attn_2r   config_class_no_split_modulesr   r   rV   r:   r   r   rW   r   r   r   r   rP   r[   rG   rG   rE   rH   r     s>    
r   c                       s   e Zd Z		ddedee ddf fddZdd	d
Zdd Zde	e
 fddZde	e
 fddZe 	d dejdejdedejdejf
ddZde	e defddZdeeeejf  fddZ  ZS )!InternVLChatModelNTr;   r*   rL   c              	      s  t    || _t j| _|| _t| j |j	p|j
j}|j
j}|| _|j| _|j| _t|| d |jd  | _|j| _|j| _|rFdnd|j
_|rNdnd|j_td| j  td| j  t|j
| jd| _|jjd	 d
kr~t|j|d| _nO|jjd	 dkrt|j|d| _n>|jjd	 dkrt|j|d| _n-|jjd	 dkrt|j|d| _n|jjd	 dkrt |j|d| _nt!|jjd	  d|j
j"}|jj"}t#$t#%|td| j d  t#&|td| j d  |t#' t#&||| _(t)j*| j+t)j,| j-i| _.| jj/| _/d S )Nrc   TFflash_attention_2eagerznum_image_token: zps_version: )r+   r   r!   )r;   r*   r    r#   r   r"   z is not implemented.r]   )0r9   r:   r;   r&   mm_enable_dp_encoderr+   r*   r   "update_vit_attn_dummy_heads_configforce_image_sizevision_configre   rg   select_layertemplaterf   downsample_rationum_image_token
ps_versionuse_flash_attn
llm_config_attn_implementationr(   r   r   vision_modelarchitecturesr!   language_modelr    r#   r   r"   NotImplementedErrorr<   r   
Sequential	LayerNormLinearGELUmlp1r   IMAGEget_image_featureVIDEOget_video_feature external_mm_data_embedding_funcsmodel)rD   r;   r*   r   re   rg   vit_hidden_sizellm_hidden_sizerE   rG   rH   r:     sz   









zInternVLChatModel.__init__      ?c              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }| jdkrDtd |S |dddd }|S )Nr   rc   r]   r^   v1ziIn ps_version 'v1', the height and width have not been swapped back, which results in a transposed image.)rs   viewrf   ry   r   r   r(   warn)rD   r   scale_factornwhcrG   rG   rH   pixel_shuffle;  s     


zInternVLChatModel.pixel_shufflec                 C   s   | j dkr| j|dddj}n| j|dddj| j  }|d d dd d d f }t|jd d  }}||jd ||d}| j|| jd}||jd d|jd }| 	|}|S )	Nrp   FT)r   r   r   r]   r  r   )r  )
r   r  r   rI   rf   r   rx   r  r   r	  )rD   r   
vit_embedsr  r  rG   rG   rH   extract_featureQ  s$   

z!InternVLChatModel.extract_featureitemsc                 C   s"   t dd |D }| |}|S )z
        Projects the last hidden state from the vision model into language model space.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        c                 S      g | ]}|j qS rG   featurer   r   rG   rG   rH   r   j      z7InternVLChatModel.get_image_feature.<locals>.<listcomp>rW   r   r  )rD   r  r   image_featuresrG   rG   rH   r  c  s   
z#InternVLChatModel.get_image_featurec                 C   s&   t jdd |D dd}| |}|S )Nc                 S   r  rG   r   r"  rG   rG   rH   r   q  r#  z7InternVLChatModel.get_video_feature.<locals>.<listcomp>r   r   r$  )rD   r  r   video_featuresrG   rG   rH   r  n  s   
z#InternVLChatModel.get_video_feature	input_ids	positionsforward_batchinput_embedsc                 C   s   t ||| j| | j|d}|S )N)r'  r)  r  multimodal_modeldata_embedding_funcsr(  )r   r  r  )rD   r'  r(  r)  r*  rI   rG   rG   rH   rP   u  s   		zInternVLChatModel.forward	mm_inputsc                 C   s*   |j }|j}||fg}t|}|||S rR   )im_start_id	im_end_idr   pad_input_tokens)rD   r'  r-  r.  r/  media_token_pairshelperrG   rG   rH   pad_input_ids  s
   
zInternVLChatModel.pad_input_idsweightsc              	   C   sz  g }d| j jjv rddg}n.d| j jjv rg d}n"d| j jjv r1g d}tjddd	| j jd
}nd| j jjv r<g d}t|  }|D ]\}}d|v rMqD|D ]-\}}}	||vrYqOd|v r^qO|||}|	drn||vrnqO|| }
|
j
}||
||	  nd|v r|dd}|dd}|D ]$}|\}}}}	||vrq|||}|| }
|
j
}||
|||	|d  n|	dr||vrqD|| }
d|v r"| j }|j|j }|j|j }|dd| ||jd }tj||ddgdd\}}}|d|jd }|d|jd }|d|jd }|
j
}||
|d ||
|d ||
|d qDt|
dt}d|v r5t| j ||}||
| qDd S )Nr    )gate_up_projw1r   )r5  w3r]   r!   ))qkv_projq_projq)r8  k_projk)r8  v_projr   )r5  	gate_projr   )r5  up_projr]   r#   r>  	down_projr?  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertsr"   zrotary_emb.inv_freqzmlp.expertsz.biasr  zattn.z
attn.attn.zqkv.z	qkv_proj.)shard_id	expert_idwqkvrp   rc   r]   r   r:  r<  r   weight_loader)r;   r   r  r   make_expert_params_mappingrD  dictnamed_parametersreplaceendswithrH  r=   num_key_value_headsr<   r  r   rW   splitrx   r@   r   r   pad_vit_attn_dummy_heads)rD   r4  expert_params_mappingstacked_params_mappingparams_dictnameloaded_weight
param_nameweight_namerE  paramrH  mappingrF  r;   	kv_groupsr>   wqwkwvrG   rG   rH   load_weights  s   
		



zInternVLChatModel.load_weights)NT)r  rR   )rS   rT   rU   r   r   r   r:   r  r  r   r   r  r  rW   no_gradrZ   r   rP   rf   r   r3  r   r   strr^  r[   rG   rG   rE   rH   r     s:    
N$
r   )Rtypingr   r   r   r   r   rW   torch.nn.functionalr   
functionalrz   transformersr   r	   transformers.modeling_outputsr
   r   sglang.srt.distributedr   r   sglang.srt.environr   sglang.srt.layers.activationr   sglang.srt.layers.attentionr   "sglang.srt.layers.attention.visionr   r   sglang.srt.layers.linearr   r   ,sglang.srt.layers.moe.fused_moe_triton.layerr   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   $sglang.srt.models.deepseek_janus_pror   sglang.srt.models.gpt_ossr   sglang.srt.models.internlm2r    sglang.srt.models.qwen2r!   sglang.srt.models.qwen3r"   sglang.srt.models.qwen3_moer#   4sglang.srt.multimodal.internvl_vit_cuda_graph_runnerr$   sglang.srt.multimodal.mm_utilsr%   sglang.srt.server_argsr&   sglang.srt.utilsr'   sglang.utilsr(   r   Moduler)   r\   r   r   r  r   r   r   r   r   
EntryClassrG   rG   rG   rH   <module>   sX    -J&@_j  %