o
    پi>                     @   st  d dl Z d dlmZ d dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlm  m  mZ d dlm  m  mZ d dlm  mZ d d	lm Z  d d
l!m"Z" d dlm#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ dZ,G dd deZ-G dd dej.Z/G dd dej.Z0G dd dej.Z1e1gZ2dS )    N)Iterable)Any)Tensor)PretrainedConfig)BaseModelOutputWithPooling)Qwen2Config)SiglipVisionConfigSiglipVisionModel)LogitsProcessorOutput)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokens)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)Qwen2ForCausalLMi  c                
       sl   e Zd ZdZeedZdZddddddee	e
f dB dee	e
f dB dedB d	edB f fd
dZ  ZS )NVILALiteConfig
nvila_lite)text_configvision_config
AutoConfigN)r   r   image_token_idvideo_token_idr   r   r   r   c                   sv   |d urt di |nt  | _|d urtdi |nt | _|d ur$|nd| _|d ur-|nd| _t jdi | d S )N )r   r   r   r   r   r   super__init__)selfr   r   r   r   kwargs	__class__r   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/nvila_lite.pyr   (   s   
zNVILALiteConfig.__init__)__name__
__module____qualname__
model_typer   r   sub_configs_auto_classdictstrr   intr   __classcell__r   r   r   r!   r       s&    r   c                   @   s   e Zd ZdedefddZdS )+NVILALiteMultiModalProjectorDownsampleBlockxreturnc              	   C   s   |j \}}}t|}|||||}d|d  d }|dkr/t|ddd|d|f}|| }|||d d|d d|}|dddddd }||dd| }|S )	N   r               r   	   )shapemathisqrtreshapeFpadpermute
contiguous)r   r-   
batch_sizesequence_lengthhidden_size	feat_sizefeatures	pad_afterr   r   r!   forwardA   s   
z3NVILALiteMultiModalProjectorDownsampleBlock.forwardN)r"   r#   r$   r   rC   r   r   r   r!   r,   @   s    r,   c                       s4   e Zd Zdef fddZdedefddZ  ZS )NVILALiteMultiModalProjectorconfigc                    sv   t    tt ttd ttd td t ttd ttd |j	j
t t|j	j
|j	j
| _d S )Nr4   r/   )r   r   nn
Sequentialr,   	LayerNormMM_HIDDEN_SIZELinearGELUr   r?   layers)r   rE   r   r   r!   r   W   s   

z%NVILALiteMultiModalProjector.__init__r-   r.   c                 C   s
   |  |S N)rL   )r   r-   r   r   r!   rC   e   s   
z$NVILALiteMultiModalProjector.forward)r"   r#   r$   r   r   r   rC   r+   r   r   r   r!   rD   V   s    rD   c                       s   e Zd Z		ddededB deddf fddZ		dd
ededede	de
f
ddZdee defddZdeeeef  ddfddZd
ee dedee fddZ  ZS )!NVILALiteForConditionalGenerationN rE   quant_configprefixr.   c                    sD   t    || _t|j| _t|| _t|j	|t
d|d| _d S )Nllm)rE   rP   rQ   )r   r   rE   r	   r   vision_towerrD   mm_projectorr   r   utils
add_prefixrR   )r   rE   rP   rQ   r   r   r!   r   j   s   


z*NVILALiteForConditionalGeneration.__init__F	input_ids	positionsforward_batchget_embeddingc                 C   s:   t j||| jtj| jtj| ji||d}t|tsJ |S )N)rW   rY   language_modeldata_embedding_funcsrZ   rX   )	mm_utilsgeneral_mm_embed_routinerR   r   IMAGEget_image_featureVIDEO
isinstancer
   )r   rW   rX   rY   rZ   outputr   r   r!   rC   |   s   z)NVILALiteForConditionalGeneration.forwardmm_inputc                 C   sX   t jdd |D dd}| j|dd}|jd usJ |jd }| |}t|d}|S )	Nc                 S   s   g | ]}t |jqS r   )torchtensorfeature).0r-   r   r   r!   
<listcomp>   s    zGNVILALiteForConditionalGeneration.get_image_feature.<locals>.<listcomp>r   )dimT)output_hidden_stateszn p d -> (n p) d)re   catrS   hidden_statesrT   einops	rearrange)r   rd   pixel_valuesvision_tower_outputvision_featuresr   r   r!   r`      s   

z3NVILALiteForConditionalGeneration.get_image_featureweightsc                 C   sh   t |  }|D ])\}}|dr!| j|tdd  |fg q|| }t|dtj}||| qd S )Nzllm.weight_loader)	r(   named_parameters
startswithrR   load_weightslengetattrweight_utilsdefault_weight_loader)r   rt   params_dictnameloaded_weightparamru   r   r   r!   rx      s   
 z.NVILALiteForConditionalGeneration.load_weights	mm_inputsc                 C   s   t  }|||S rM   )r   pad_input_tokens)r   rW   r   patternr   r   r!   pad_input_ids   s   z/NVILALiteForConditionalGeneration.pad_input_ids)NrO   )F)r"   r#   r$   r   r   r)   r   r   r   boolr
   rC   listr   r`   r   tuplerx   r*   r   r   r+   r   r   r   r!   rN   i   sB    
rN   )3r6   collections.abcr   typingr   ro   re   torch.nnrF   torch.nn.functional
functionalr9   r    transformers.configuration_utilsr   transformers.modeling_outputsr   -transformers.models.qwen2.configuration_qwen2r   transformers.models.siglipr   r	   sglang.srt.managers.mm_utilssrtmanagersr]   $sglang.srt.model_loader.weight_utilsmodel_loaderr{   sglang.srt.utilsrU   "sglang.srt.layers.logits_processorr
   *sglang.srt.layers.quantization.base_configr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.models.qwen2r   rI   r   Moduler,   rD   rN   
EntryClassr   r   r   r!   <module>   s6     
O