o
    پi2                     @   s   d Z ddlmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ G dd de	jZeZdS )zEInference-only LLaVa video model compatible with HuggingFace weights.    )IterableListOptionalTupleN)nn)CLIPVisionModelLlavaConfig)LlavaMultiModalProjector)QuantizationConfig)MultimodalInputsflatten_nested_list)ForwardBatch)default_weight_loader)LlamaForCausalLM)
add_prefixc                	       s   e Zd Z		ddedee deddf fddZd	ee	 d
e
fddZdejdejfddZe d	ejdejdedejfddZdeeeejf  fddZedd Z  ZS )LlavaVidForCausalLMN configquant_configprefixreturnc                    s   t    || _d | _|j| jj_|j| jj_t|| _	t
| jdd| _tj| j| jd| _t||td|d| _t
| jdd| _dt
|d	d
v r[ttj|jjtjd| jj_d S d S )Nmm_spatial_pool_stride   )kernel_sizestridelanguage_model)r   r   
num_frames   unpadmm_patch_merge_typer   )dtype)super__init__r   vision_towermm_hidden_sizevision_confighidden_sizetext_configr	   multi_modal_projectorgetattrr   r   	AvgPool2d	resamplerr   r   r   r   	Parametertorchemptyfloat16modelimage_newline)selfr   r   r   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/llavavid.pyr"   !   s*   

zLlavaVidForCausalLM.__init__	input_idsimage_inputsc                 C   sp   dd |j D }| j}||t| t|  }|| jj}|d | |d |  ||d d   }|g|_|S )Nc                 S      g | ]}|j qS r5   )	pad_value.0itemr5   r5   r6   
<listcomp>=       z5LlavaVidForCausalLM.pad_input_ids.<locals>.<listcomp>   )mm_itemsimage_feature_lenlenindexr   image_token_indeximage_offsets)r2   r7   r8   
pad_valuesnew_image_feature_lenpad_idsoffsetnew_input_idsr5   r5   r6   pad_input_ids<   s   

z!LlavaVidForCausalLM.pad_input_idspixel_valuesc                 C   s   | j |dd}|j| j }| jdv r|d d dd f }n| jdkr%|}n	td| jj | j }}|jd }||||d}|	dd	dd

 }| |d
dd

 }| |}|S )NT)output_hidden_states)defaultpatchr@   fullz$Unexpected select feature strategy: r      r   )r#   hidden_statesvision_feature_layervision_feature_select_strategy
ValueErrorr   num_patches_per_sideshapeviewpermute
contiguousr+   flatten	transposer(   )r2   rM   image_outputsselected_image_featureheightwidthnum_of_framesimage_featuresr5   r5   r6   encode_imagesM   s,   




z!LlavaVidForCausalLM.encode_images	positionsforward_batchc                    s  |j  |j rN|j}|jd| jjd d}| jj	|}g } D ]}|r2|j
r2|t|j
 q"|d q"||j   }|t|k rEt fddt|D }	 fddt|D }
|	d jdkrtj|	dd	 tjtj|	dd	| jjd
}| |}dd |	D }tj||dd}ntjt|	| jjd
}	| |	}g }t|D ]\}}||dd q|}|j  }|j}d}t|D ]u}| sq|| }|| }|
| D ]a}||k rq|| }|j d }|||  }|||  | }z||||< W n6 t!y> } z)t"d|  t"d|j d|j  t"d|d|d|d| W Y d }~nd }~ww |d7 }qq| j||||dS |j# r[| |||S d S )Nr   r@   )minmaxrR   c                    s(   g | ]}| rd d  | j D qS )c                 S   r9   r5   )featurer;   r5   r5   r6   r>      r?   :LlavaVidForCausalLM.forward.<locals>.<listcomp>.<listcomp>)rA   r<   ir8   need_visionr5   r6   r>      s    z/LlavaVidForCausalLM.forward.<locals>.<listcomp>c                    s,   g | ]}| rt d d  | jD qS )c                 S   r9   r5   )offsetsr;   r5   r5   r6   r>      r?   rk   )r   rA   rl   rn   r5   r6   r>      s       )axis)devicec                 S   s   g | ]}|j d  qS )r   )rY   )r<   imager5   r5   r6   r>      s    )dimz RuntimeError in image encoding: zinput_embeds.shape=z, tmp_image_feature.shape=z
start_idx=z, image_offset=z, prefix_len=z
, pad_len=)input_embeds)$	mm_inputsforward_mode	is_extend
batch_sizeclamp_r   
vocab_sizer   r0   embed_tokensrF   appendri   extend_start_loccpunumpynparrayanyr   rangendimconcatenater-   tensorr#   rs   re   split	enumerater]   extend_prefix_lens_cpurY   RuntimeErrorprint	is_decode)r2   r7   rf   rg   bsrv   max_image_offsetimstart_positionsrM   rF   concat_imagesrd   split_sizesnew_image_features	image_idximage_featureextend_start_loc_cpuprefix_lens_cpuptrm   	start_idx
prefix_lenimage_offsettmp_image_featurepad_lenleft_idx	right_idxer5   rn   r6   forwardl   s   





zLlavaVidForCausalLM.forwardweightsc                 C   s  | j j}tj|tjd | _| j  | j j	| _
| j j| _| jj j| _| jj j| _t| j dd| _t| j dd| _t| j dd | _td| j  | jt| j| j | j d  | _| jd	kran| jd
krn|  jd7  _ntd| j ddddddd}t|  }|D ]J\}}d|v sd|v sd|v r| D ]\}}||v r|||}q||v r|| }	n	td| d qt|	dt}
|
|	| q| j ||fg qd S )N)torch_dtyper   flatimage_aspect_ratiosquareimage_grid_pinpointsztarget_frames: r   rP   	cls_patchr@   zUnexpected select feature: zmulti_modal_projector.linear_1zmulti_modal_projector.linear_2r#   z"language_model.model.image_newline)zmodel.mm_projector.0zmodel.mm_projector.2z%model.vision_resampler.mm_projector.0z%model.vision_resampler.mm_projector.2zmodel.vision_tower.vision_towerzmodel.image_newline	projectorr1   z	Warning: z not found in the modelweight_loader)!r   mm_vision_towerr   from_pretrainedr-   r/   cudar#   evalmm_vision_select_layerrU   mm_vision_select_featurerV   
image_size
patch_sizer)   r   r   r   r   r   intr   rB   rW   select_featuredictnamed_parametersitemsreplacer   r   load_weights)r2   r   vision_pathprojector_weightsparams_dictnameloaded_weightweight_name
param_nameparamr   r5   r5   r6   r      sZ   




	
z LlavaVidForCausalLM.load_weightsc                 C   s   | j | j S )N)r   r   )r2   r5   r5   r6   rX     s   z(LlavaVidForCausalLM.num_patches_per_side)Nr   )__name__
__module____qualname__r   r   r
   strr"   r   r   r   rL   r-   Tensorre   no_grad
LongTensorr   r   r   r   r   propertyrX   __classcell__r5   r5   r3   r6   r       s6    p9r   )__doc__typingr   r   r   r   r   r   r-   r   transformersr   r   (transformers.models.llava.modeling_llavar	   *sglang.srt.layers.quantization.base_configr
   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.llamar   sglang.srt.utilsr   Moduler   
EntryClassr5   r5   r5   r6   <module>   s     |