o
    پi-                     @   s   d dl mZmZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# G dd dej$Z%e%Z&dS )    )IterableListOptionalTupleN)nn)PretrainedConfig)vision_utils)FusedMoE)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)InternVisionModel)Qwen2ForCausalLM)Qwen3ForCausalLM)Qwen3MoeForCausalLM)loggerc                       s   e Zd Z		ddedee ddf fddZdd	d
Zdd Zde	e
 fddZe 	d dejdejdedejdejf
ddZde	e defddZdd Zdeeeejf  fddZ  ZS )! InternS1ForConditionalGenerationNTconfigquant_configreturnc              	      s  t    || _|| _t| j t|dd p|jj}|jj	}t
|tr(|d }t
|tr1|d }|| _	|j| _t|| d |jd  | _|j| _|rNdnd|j_|rVdnd|j_td| j  t|j| _|jjd d	krzt|j|d
| _n-|jjd dkrt|j|d
| _n|jjd dkrt|j|d
| _nt|jjd  d|jj}|jj}t t!|td| j d  t"|td| j d  |t# t"||| _$d S )Nforce_image_sizer      TFflash_attention_2eagerznum_image_token: r   )r   r   r   r   z is not implemented.   )%super__init__r   r   r   "update_vit_attn_dummy_heads_configgetattrvision_config
image_size
patch_size
isinstancelistvision_feature_layerselect_layerintdownsample_rationum_image_tokenuse_flash_attntext_config_attn_implementationr   infor   vision_modelarchitecturesr   language_modelr   r   NotImplementedErrorhidden_sizer   
Sequential	LayerNormLinearGELUmlp1)selfr   r   r.   r%   r&   vit_hidden_sizellm_hidden_size	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/interns1.pyr!      s^   








z)InternS1ForConditionalGeneration.__init__      ?c              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }|dddd }|S )Nr   r   r      )sizeviewr+   permute
contiguous)r<   xscale_factornwhcrA   rA   rB   pixel_shuffle[   s    

z.InternS1ForConditionalGeneration.pixel_shufflec                 C   s   | j dkr| j|dddj}n| j|dddj| j  }|d d dd d d f }t|jd d  }}||jd ||d}| j|| jd}||jd d|jd }| 	|}|S )	NFT)pixel_valuesoutput_hidden_statesreturn_dictr   rC   r   )rJ   )
r*   r2   last_hidden_statehidden_statesr+   shapereshaperO   r,   r;   )r<   rQ   
vit_embedsrM   rL   rA   rA   rB   extract_featurek   s$   

z0InternS1ForConditionalGeneration.extract_featureitemsc                 C   s"   t dd |D }| |}|S )z
        Projects the last hidden state from the vision model into language model space.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        c                 S   s   g | ]}|j qS rA   )feature).0itemrA   rA   rB   
<listcomp>   s    zFInternS1ForConditionalGeneration.get_image_feature.<locals>.<listcomp>)torchcatrY   )r<   rZ   rQ   image_featuresrA   rA   rB   get_image_feature}   s   
z2InternS1ForConditionalGeneration.get_image_feature	input_ids	positionsforward_batchinput_embedsc                 C   s    t ||| jtj| ji|d}|S )N)rc   re   r4   data_embedding_funcsrd   )r   r4   r   IMAGErb   )r<   rc   rd   re   rf   hsrA   rA   rB   forward   s   	
z(InternS1ForConditionalGeneration.forward	mm_inputsc                 C   s*   |j }|j}||fg}t|}|||S N)im_start_id	im_end_idr   pad_input_tokens)r<   rc   rk   rm   rn   media_token_pairshelperrA   rA   rB   pad_input_ids   s
   
z.InternS1ForConditionalGeneration.pad_input_idsc                 C   s   ddddddddd	d
dd}||v r|| }n| dr'd|tdd   }n| dr6d|tdd   }| dre|dd}|dd}|dd}|dd}|dd}|dd}|dd}|S ) Nzlanguage_model.lm_head.weightzmlp1.0.biaszmlp1.0.weightzmlp1.1.biaszmlp1.1.weightzmlp1.3.biaszmlp1.3.weightz'vision_model.embeddings.class_embeddingz,vision_model.embeddings.patch_embedding.biasz.vision_model.embeddings.patch_embedding.weightz*vision_model.embeddings.position_embedding)zlm_head.weightz+model.multi_modal_projector.layer_norm.biasz-model.multi_modal_projector.layer_norm.weightz)model.multi_modal_projector.linear_1.biasz+model.multi_modal_projector.linear_1.weightz)model.multi_modal_projector.linear_2.biasz+model.multi_modal_projector.linear_2.weightz'model.vision_tower.embeddings.cls_tokenz>model.vision_tower.embeddings.patch_embeddings.projection.biasz@model.vision_tower.embeddings.patch_embeddings.projection.weightz1model.vision_tower.embeddings.position_embeddingszmodel.language_model.zlanguage_model.model.zmodel.vision_tower.zvision_model.zvision_model.encoder.layerz.layer.z.layers.z.attention.z.attn.attn.z.projection_layer.z.proj.z	.lambda_1z.ls1z	.lambda_2z.ls2z.layernorm_before.z.norm1.z.layernorm_after.z.norm2.)
startswithlenreplace)r<   name	names_maprA   rA   rB   _mapping_interns1_name   s6   



z7InternS1ForConditionalGeneration._mapping_interns1_nameweightsc              	   C   sZ  g d}g }d| j jjv rtjddd| j jd}t|  }|D ]\}}d|v r)q | |}d|v r:t	
| j ||}|D ]-\}}}	||vrFq<d	|v rKq<|||}|d
r[||vr[q<|| }
|
j}||
||	  n@|D ]$}|\}}}}	||vryql|||}|| }
|
j}||
|||	|d  n|d
r||vrq || }
t|
dt}||
| q d S )N))qkv_projq_projq)rz   k_projk)rz   v_projv)gate_up_proj	gate_projr   )r   up_projr   r   r   	down_projr   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertszrotary_emb.inv_freqr2   zmlp.expertsz.bias)shard_id	expert_idweight_loader)r   r/   r3   r	   make_expert_params_mappingr   dictnamed_parametersrx   r   pad_vit_attn_dummy_headsru   endswithr   r#   r   )r<   ry   stacked_params_mappingexpert_params_mappingparams_dictrv   loaded_weight
param_nameweight_namer   paramr   mappingr   rA   rA   rB   load_weights   sl   

z-InternS1ForConditionalGeneration.load_weights)NT)rC   rl   )__name__
__module____qualname__r   r   r
   r!   rO   rY   r   r   rb   r_   no_gradTensorr   rj   r+   r   rr   rx   r   r   strr   __classcell__rA   rA   r?   rB   r      s:    
>
$ r   )'typingr   r   r   r   r_   r   transformersr   sglang.srt.layers.attentionr   ,sglang.srt.layers.moe.fused_moe_triton.layerr	   *sglang.srt.layers.quantization.base_configr
   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.internvlr   sglang.srt.models.qwen2r   sglang.srt.models.qwen3r   sglang.srt.models.qwen3_moer   sglang.utilsr   Moduler   
EntryClassrA   rA   rA   rB   <module>   s&     w