o
    پi-/                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
m	  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlm  m  mZ d dlm  m  mZ d dlm  mZ d d	l m!Z! d d
l"m#Z# d dlm$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z, dZ-G dd deZ.G dd de	j/Z0G dd de	j/Z1G dd de	j/Z2dd Z3dd Z4dd Z5e2gZ6dS )    N)Iterable)Any)Tensor)PretrainedConfig)BaseModelOutputWithPooling)Qwen2Config)SiglipVisionConfigSiglipVisionModel)LogitsProcessorOutput)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokens)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)Qwen2ForCausalLMi  c                
       sl   e Zd ZdZeedZdZddddddee	e
f dB dee	e
f dB dedB d	edB f fd
dZ  ZS )NVILAConfignvila)text_configvision_config
AutoConfigN)r   r   image_token_idvideo_token_idr   r   r   r   c                   sv   |d urt di |nt  | _|d urtdi |nt | _|d ur$|nd| _|d ur-|nd| _t jdi | d S )N )r   r   r   r   r   r   super__init__)selfr   r   r   r   kwargs	__class__r   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/nvila.pyr   )   s   
zNVILAConfig.__init__)__name__
__module____qualname__
model_typer   r   sub_configs_auto_classdictstrr   intr   __classcell__r   r   r   r!   r   !   s&    r   c                   @   s   e Zd ZdedefddZdS )'NVILAMultiModalProjectorDownsampleBlockxreturnc              	   C   s   |j \}}}t|}|||||}|d }|dkr+t|ddd|d|f}|| }|||d d|d d|}|dddddd }||dd| }|S )N   r               r   )shapemathisqrtreshapeFpadpermute
contiguous)r   r-   
batch_sizesequence_lengthhidden_size	feat_sizefeatures	pad_afterr   r   r!   forwardB   s   
z/NVILAMultiModalProjectorDownsampleBlock.forwardN)r"   r#   r$   r   rB   r   r   r   r!   r,   A   s    r,   c                       s4   e Zd Zdef fddZdedefddZ  ZS )NVILAMultiModalProjectorconfigc              
      sR   t    tt ttd ttd |jj	t
 t|jj	|jj	| _d S )Nr2   )r   r   nn
Sequentialr,   	LayerNormMM_HIDDEN_SIZELinearr   r>   GELUlayers)r   rD   r   r   r!   r   X   s   

z!NVILAMultiModalProjector.__init__r-   r.   c                 C   s
   |  |S N)rK   )r   r-   r   r   r!   rB   c   s   
z NVILAMultiModalProjector.forward)r"   r#   r$   r   r   r   rB   r+   r   r   r   r!   rC   W   s    rC   c                       s   e Zd Z		ddededB deddf fddZ		dd
ededede	de
f
ddZdee defddZdeeeef  ddfddZd
ee dedee fddZ  ZS )NVILAForConditionalGenerationN rD   quant_configprefixr.   c                    sD   t    || _t|j| _t|| _t|j	|t
d|d| _d S )Nllm)rD   rO   rP   )r   r   rD   r	   r   vision_towerrC   mm_projectorr   r   utils
add_prefixrQ   )r   rD   rO   rP   r   r   r!   r   h   s   


z&NVILAForConditionalGeneration.__init__F	input_ids	positionsforward_batchget_embeddingc                 C   s:   t j||| jtj| jtj| ji||d}t|tsJ |S )N)rV   rX   language_modeldata_embedding_funcsrY   rW   )	mm_utilsgeneral_mm_embed_routinerQ   r   IMAGEget_image_featureVIDEO
isinstancer
   )r   rV   rW   rX   rY   outputr   r   r!   rB   z   s   z%NVILAForConditionalGeneration.forwardmm_inputc                 C   s$  t tjdd |D pd }tjdd |D dd}| j|j| jj| jj	ddd	}|j
d us2J |j
d
 }t||d ur?|nd g|jd  dg dd\}}dd t||D }tdd |D }| |}t |jdd |D dd}dd t||D }tdd |D }t|d}|S )Nc                 s   s     | ]}t |d r|jV  qdS )block_sizesN)hasattrrd   .0r-   r   r   r!   	<genexpr>   s    

zBNVILAForConditionalGeneration.get_image_feature.<locals>.<genexpr>c                 S   s   g | ]}t |jqS r   )torchtensorfeaturerf   r   r   r!   
<listcomp>       zCNVILAForConditionalGeneration.get_image_feature.<locals>.<listcomp>r   dim)devicedtypeT)output_hidden_statesr   )i  i  i@  )rd   resize_output_to_scale_idxscalesc                 S   $   g | ]\}}t ||d  |d qS r   r0   )split_chessboardrg   r-   
block_sizer   r   r!   rl          c                 S      g | ]}t |d qS )zb c h w -> b (h w) ceinops	rearrangerf   r   r   r!   rl      rm   c                 S   s   g | ]
}|d  |d  qS rw   r   )rg   rz   r   r   r!   rl      s    c                 S   rv   rw   )merge_chessboardry   r   r   r!   rl      r{   c                 S   r|   )z1 c h w -> (h w) cr}   rf   r   r   r!   rl      rm   zn p d -> (n p) d)list	itertoolschainfrom_iterableri   catrR   torp   rq   hidden_statesmerge_features_for_dynamic_s2r4   ziprS   splitstackr~   r   )r   rc   rd   pixel_valuesvision_tower_outputvision_featuresvision_features_listr   r   r!   r_      sX   


z/NVILAForConditionalGeneration.get_image_featureweightsc                 C   sh   t |  }|D ])\}}|dr!| j|tdd  |fg q|| }t|dtj}||| qd S )Nzllm.weight_loader)	r(   named_parameters
startswithrQ   load_weightslengetattrweight_utilsdefault_weight_loader)r   r   params_dictnameloaded_weightparamr   r   r   r!   r      s   
 z*NVILAForConditionalGeneration.load_weights	mm_inputsc                 C   s   t  }|||S rL   )r   pad_input_tokens)r   rV   r   patternr   r   r!   pad_input_ids   s   z+NVILAForConditionalGeneration.pad_input_ids)NrN   )F)r"   r#   r$   r   r   r)   r   r   r   boolr
   rB   r   r   r_   r   tupler   r*   r   r   r+   r   r   r   r!   rM   g   sB    
=rM   c                    s   j d } dkrj d }tjdt|t|d||  dks)J ||   tj fddt|D dd	}|S )
z
    x: b * n * c or b * h * w * c
    out: b * c * h * w
    Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square.
    r   r1   r0   zb (h w) c -> b c h w)hwc                    s2   g | ] t j fd dtD ddqS )c                    s4   g | ]} |    | d     qS )r0   r   )rg   j)binum_split_wr-   r   r!   rl      s    &z/merge_chessboard.<locals>.<listcomp>.<listcomp>r   rn   )ri   r   range)rg   r   r   r-   )r   r!   rl      s    z$merge_chessboard.<locals>.<listcomp>rs   rn   )	r4   ro   r~   r   r5   r6   ri   r   r   )r-   num_split_hr   BNx_merger   r   r!   r      s   


r   c             
      s  g }g }d}|D ]}|d u r=| ||d  }t j|dt|jd d}|dt|dd}|| |d |d7 }qg  |d d D ]&}	|	|d  d }
 t| |||
  |	|d  |	|d  d ||
7 }qE|d |d  } t| |||  |d |d d ||7 } | jd	d  t	j
 fd
dtt D dd}|| |t|d ks|dkr|| q||| |d  || |d  f q|t| ksJ d| dt|  d||fS )Nr   r0   z1 (h w) c -> 1 c h w)r   )r0   r0   r   r/   )r   r   rs   c                    s4   g | ]}t j | tjd d | jqS )area)sizemode)r8   interpolater   ri   float32rq   )rg   r   cur_features_each_scaleoutput_sizer   r!   rl   .  s    z1merge_features_for_dynamic_s2.<locals>.<listcomp>rn   zThe number of blocks (z+) does not match length of image_features (z)!)r~   r   r5   r6   r4   repeatr   appendr   ri   r   r   )image_featuresrd   ru   rt   image_features_each_imagenew_block_sizes	block_cntblock_size_each_imagecur_featuresscalenum_blocks_this_scalenum_blocks_last_scaler   r   r!   r     sr   







r   c                    sf   j \}}}}|| dkr| dksJ || |  tj fddt|D dd}|S )z
    x: b * c * h * w
    out: b * c * h * w
    Deividing x into num_split**2 sub-squares, and concatenate all the sub-squares on the batch dimension
    r   c              
      sR   g | ]%}t D ]}d d d d |  |d   | |d  f qqS )Nr0   )r   )rg   r   r   r   r   r   r-   r   r!   rl   Y  s    6z$split_chessboard.<locals>.<listcomp>rn   )r4   ri   r   r   )r-   r   r   r   CHWx_splitr   r   r!   rx   O  s   rx   )7r   r5   collections.abcr   typingr   r~   ri   torch.nnrE   torch.nn.functional
functionalr8   r    transformers.configuration_utilsr   transformers.modeling_outputsr   -transformers.models.qwen2.configuration_qwen2r   transformers.models.siglipr   r	   sglang.srt.managers.mm_utilssrtmanagersr\   $sglang.srt.model_loader.weight_utilsmodel_loaderr   sglang.srt.utilsrT   "sglang.srt.layers.logits_processorr
   *sglang.srt.layers.quantization.base_configr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.models.qwen2r   rH   r   Moduler,   rC   rM   r   r   rx   
EntryClassr   r   r   r!   <module>   s>     {!L
