o
    پiE"                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dl m!Z! e "e#Z$G dd deZ%e%gZ&dS )    N)Iterable)NemotronH_Nano_VL_V2_Config)ReLU2)RMSNorm)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)NemotronHForCausalLM)
RadioModel)EVS	EVSConfig)
add_prefixc                
       s   e Zd ZedefddZ		d&dededB deddf fd	d
Zde	e
 defddZd'dejdedejfddZdd Zdd Zde	e fddZde	e fddZe 	d(dejdejded efd!d"Zd#eeeejf  fd$d%Z  ZS ))NemotronH_Nano_VL_V2configc                 C   s   t | jdS )N)video_pruning_rate)r   r   r    r   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/nano_nemotron_vl.pycreate_evs_config.   s   z&NemotronH_Nano_VL_V2.create_evs_configN quant_configprefixreturnc              
      s   t  | |j| _t|j|td|d| _t| d	| jj
j| _|j}|td| j d  | _|j}|jj}tt| jddtj| j|dd	t tj||dd		| jj
j| _|| _
d S )
Nlanguage_model)r   r   r   r         gh㈵>)hidden_sizeepsF)bias)super__init__downsample_ratior   
llm_configr   r   r   create_radio_configtor   dtypevision_modelvit_hidden_sizeintrmsnorm_hidden_sizeprojector_hidden_sizer!   nn
Sequentialr   Linearr   torch_dtypemlp1)selfr   r   r   r,   vision_projection_hidden_sizellm_hidden_size	__class__r   r   r%   2   s<   
zNemotronH_Nano_VL_V2.__init__	input_ids	mm_inputsc                 C   s*   |j }|j}||fg}t|}|||S N)im_start_id	im_end_idr   pad_input_tokens)r5   r:   r;   r=   r>   media_token_pairshelperr   r   r   pad_input_idsX   s
   
z"NemotronH_Nano_VL_V2.pad_input_ids      ?xscale_factorc              	   C   s   |  \}}}}|||t|| t|| }|dddd }||t|| t|| t|||  }| jjdkrH|dddd }|S )Nr   r    r      v1)sizeviewr-   permute
contiguousr   
ps_version)r5   rD   rE   nwhcr   r   r   pixel_shuffleb   s"   



z"NemotronH_Nano_VL_V2.pixel_shufflec                 C   s
   | j  S r<   )r   get_input_embeddings)r5   r   r   r   rR   y      
z)NemotronH_Nano_VL_V2.get_input_embeddingsc           	      C   s   d}|j d }g }td||D ]K}| ||||  }|jtjd}t|j d d  }}||j d ||d}| j|| j	d}|
d| j}| |}|
|d| j}|| qtj|dd}|S )	N   r   )r*   r   rC   )rE   )dim)shaperanger+   r)   torchbfloat16r-   reshaperQ   r&   rI   r.   r4   appendcat)	r5   pixel_valuesmicro_batch_sizerM   vit_embeds_listi
vit_embedsrO   rN   r   r   r   extract_feature|   s"   

z$NemotronH_Nano_VL_V2.extract_featureitemsc                 C   "   t dd |D }| |}|S )z
        Projects the last hidden state from the vision model into language model space.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        c                 S      g | ]}|j qS r   feature.0itemr   r   r   
<listcomp>       z:NemotronH_Nano_VL_V2.get_image_feature.<locals>.<listcomp>rY   r]   rc   )r5   rd   r^   image_featuresr   r   r   get_image_feature      
z&NemotronH_Nano_VL_V2.get_image_featurec                 C   re   )z
        Projects the last hidden state from the video model into language model space.

        Returns:
            video_features (`torch.Tensor`): Video feature tensor of shape `(num_videos, video_length, embed_dim)`).
        c                 S   rf   r   rg   ri   r   r   r   rl      rm   z:NemotronH_Nano_VL_V2.get_video_feature.<locals>.<listcomp>rn   )r5   rd   r^   video_featuresr   r   r   get_video_feature   rq   z&NemotronH_Nano_VL_V2.get_video_featureF	positionsforward_batchget_embeddingc              	   C   s*   t ||| j| tj| jtj| ji|d}|S )N)r:   ru   r   multimodal_modeldata_embedding_funcsrt   )r   r   r	   IMAGErp   VIDEOrs   )r5   r:   rt   ru   rv   hidden_statesr   r   r   forward   s   zNemotronH_Nano_VL_V2.forwardweightsc              	   C   s.  t | j }dtdtfdd}dtttjf fdd}dtdtfdd	}g }g }|D ]]\}}	||rE|d
	|
d
dd  |	f q+|||	frud
	|
d
dd  }
||
 }t  t||	 W d    n1 sow   Y  q+||r|tdd  }|||	f q+| j| | j| d S )Nnamer   c                 S   
   |  dS )Nr   
startswithr~   r   r   r   is_llm   rS   z1NemotronH_Nano_VL_V2.load_weights.<locals>.is_llmweightc                 S   s   | d  dS )Nr   r4   r   )r   r   r   r   is_adapter_weights   s   z=NemotronH_Nano_VL_V2.load_weights.<locals>.is_adapter_weightsc                 S   r   )Nzvision_model.radio_model.r   r   r   r   r   is_vision_weights   rS   z<NemotronH_Nano_VL_V2.load_weights.<locals>.is_vision_weights.r   zvision_model.)dictr4   named_parametersstrbooltuplerY   Tensorr\   joinsplitno_gradr   lenr   load_weightsr+   )r5   r}   adapter_dictr   r   r   llm_weightsvision_weightsr~   rN   trimmed_nameparamhf_keyr   r   r   r      s,   $
z!NemotronH_Nano_VL_V2.load_weights)Nr   )rC   )F)__name__
__module____qualname__staticmethodr   r   r   r   r%   listr-   r   rB   rY   r   floatrQ   rR   rc   r
   rp   rs   r   r   r   r|   r   r   r   __classcell__r   r   r8   r   r   -   s@    &
$r   )'loggingtypingr   rY   torch.nnr0   #sglang.srt.configs.nano_nemotron_vlr   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr	   r
   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.nemotron_hr   sglang.srt.models.radior   sglang.srt.multimodal.evsr   r   sglang.srt.utilsr   	getLoggerr   loggerr   
EntryClassr   r   r   r   <module>   s(   
 
4