o
    پi                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlmZ ddlmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlm  mZ  dd	l!m"Z" dd
l#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z; G dd dej<Z=G dd de=Z>G dd de=Z?G dd de=Z@G dd de=ZAe>e?e@eAgZBdS )z?Inference-only LLaVa model compatible with HuggingFace weights.    N)	lru_cache)DictIterableListOptionalTupleTypeUnion)nn)CLIPVisionConfigCLIPVisionModelLlavaConfigMistralConfigQwen2ConfigSiglipVisionModel)	AutoModelAutoModelForCausalLM)LlavaMultiModalProjector)QuantizationConfig)general_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatchPPProxyTensors)default_weight_loader)LlamaForCausalLM)MistralForCausalLM)Qwen2ForCausalLM)get_anyres_image_grid_shapeunpad_imageunpad_image_shape)
add_prefixflatten_nested_listloggerc                	   @   s   e Zd Zdee defddZdeej	eej	 f dej	fddZ
e dejd	ej	d
edej	fddZdeeeej	f  fddZedd ZdS )LlavaBaseForCausalLM	input_idsimage_inputsc              	   C   s  t dd |jD }dd |jD }tdd |jD rd}nd}g }g |_t|D ]\}}t|dkrBt| j| j	 d	 d	 }	n| j
}	| j }
}d|v rt|| j| jjj\}}||
 }|| }t|||\}}d
| jjv rtd| jj}|rt|d}t|| || j
  }|dkrt|| }t|| }|	||d  7 }	z	|| jj}W n ty   d}Y nw |d | ||t|  g|	  ||d d   }|| |j|	 q*||_|S )Nc                 S      g | ]}|j qS  image_sizes.0itemr)   r)   K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/llava.py
<listcomp><       z6LlavaBaseForCausalLM.pad_input_ids.<locals>.<listcomp>c                 S   r(   r)   )	pad_valuer,   r)   r)   r/   r0   ?   r1   c                 s   s(    | ]}|j tjkp|j tjkV  qd S N)modalityr   MULTI_IMAGESVIDEOr,   r)   r)   r/   	<genexpr>B   s
    
z5LlavaBaseForCausalLM.pad_input_ids.<locals>.<genexpr>padanyres      
anyres_maxanyres_max_(\d+)   皙?r   )r#   mm_itemsanyimage_pad_len	enumeratelenmathceil
image_size
patch_sizeimage_feature_lennum_patches_per_sider   image_grid_pinpointsvision_towerconfigr!   image_aspect_ratiorematchintgroupsqrtindeximage_token_index
ValueErrorappendimage_offsets)selfr&   r'   r+   
pad_valuesrN   offset_list	image_idximage_snew_image_feature_lenheightwidthnum_patch_widthnum_patch_heighthwnew_hnew_wmatched_anyres_max_num_patchesmax_num_patchestimesoffsetr)   r)   r/   pad_input_ids:   sl   


z"LlavaBaseForCausalLM.pad_input_idspixel_valuesreturnc                 C   sj   | j |dd}|j| j }| jdv r|ddddf }n| jdkr%|}n	td| jj | |}|S )a2  
        encode images by vision tower and multimodal projector
        Args:
            pixel_values: torch.Tensor or List[torch.Tensor]: each tensor for an input image
        Returns:
            torch.Tensor: encoded image features from the input image; if multiple, flattened by seq_len axis
        Toutput_hidden_statesdefaultpatchNr>   fullz$Unexpected select feature strategy: )rL   hidden_statesvision_feature_layervision_feature_select_strategyrV   rM   multi_modal_projector)rY   rl   image_outputsselected_image_featureimage_featuresr)   r)   r/   encode_images~   s   



z"LlavaBaseForCausalLM.encode_images	positionsforward_batchc           2         s   |j  |j r|jd| jjd d | jj|}g }g } D ],}|r0|	dd |j
D  |rH|jrH|tt|jt|j  q!|d q!||j   }|t|k rx|j}	t fddt|	D }
 fddt|	D }|
d jd	krtj|
dd
 tjtj|
dd
| jjd}| |}dd |
D }tj||dd}ntjt|
| jjd}
| |
}| j !drg }| j" }}t#|D ]\}}|| t$j%kr| jj&}n|| t$j'ks|| t$j(krd}|j)d dkr1d|v r1|| t$j%kr1|d }|dd  }|| |j)d ks J d|v r5t*+d|}|r5t,|-d}|dks?d|v r{| j.}zt/|| d | jj0|\}}W n t1yp } zt2d|  d\}}W Y d }~nd }~ww |3||||d}n	|3dd||d}d| j v r|j)d }|4d	dddd5 }|6dd6dd}t7||| d }d|v r|r|j)\}}}t89|| ||d   } | dkr|d  }t:j;j<|t,||  t,||  gddd }tj=|| jjj>d d d d f j?g |j)d d dR  fdd}|6dd@dd}n|4ddddd	5 }|6dd}tj=||fdd}|Ad}ni|| t$j(kr{|j)d }!|3|!||d}|4dddd5 }|j)dd  \}}"t8B|d t8B|"d g}#t:j;j<||#dd}|6d@dd5 }d| j v rtj=|| jjj>d ?|j)d d|j)d fdd}|| q|}|j  }$|jC  }%|jD}&d}'t|	D ]}(|( sq|$|( })|%|( }*|&|( }+t# |( jD ]\}},|, |( j|  |+krq|,|+|* kr n||' | }-|-j)d }.|,|+ }/|)|/ }0|0|. }1|1|)ksJ |/dk r|)}0|-|/ d  }-|1|)|* kr1|-d |)|* |1  }-|)|* }1z	|-||0|1< W q tEyq } z*t2d|  t2d|j)d|-j) t2d |)d!|,d"|+d#|. W Y d }~qd }~ww |'d7 }'q| j||||d$S |jF r| |||S d S )%Nr   r>   )minmaxc                 S   r(   r)   )r4   r,   r)   r)   r/   r0      r1   z0LlavaBaseForCausalLM.forward.<locals>.<listcomp>c                    s(   g | ]}| rd d  | j D qS )c                 S   r(   r)   )featurer,   r)   r)   r/   r0      r1   ;LlavaBaseForCausalLM.forward.<locals>.<listcomp>.<listcomp>)r@   r-   ir'   need_visionr)   r/   r0      s    c                    s,   g | ]}| rt d d  | jD qS )c                 S   r(   r)   r*   r,   r)   r)   r/   r0      r1   r   )r#   r@   r   r   r)   r/   r0      s       )axis)devicec                 S   s   g | ]}|j d  qS )r   )shape)r-   imager)   r)   r/   r0      s    dimspatialr8   r9   r<   r=   zError: )r;   r;   r;   unpad   r?   bilinear)mode)sizer   )NNz RuntimeError in image encoding: zinput_embeds.shape=z, tmp_image_feature.shape=z
start_idx=z, image_offset=z, prefix_len=z
, pad_len=)input_embeds)G	mm_inputsforward_mode	is_extendclamp_rM   
vocab_sizelanguage_modelmodelembed_tokensextendr@   rX   rW   npr   arrayrB   extend_start_loccpunumpyrA   
batch_sizer#   rangendimconcatenatetorchtensorrL   r   r{   splitmm_patch_merge_type
startswithrJ   rC   r   IMAGErN   r5   r6   r   rO   rP   rQ   rR   rG   r   rK   	Exceptionprintviewpermute
contiguousflattenr    rE   rS   r
   
functionalinterpolatecatimage_newlineexpand	transpose	unsqueezerF   extend_seq_lensextend_prefix_lens_cpuRuntimeError	is_decode)2rY   r&   r|   r}   r   modalities_listmax_image_offsetimstart_positionsbsrl   r+   concat_imagesrz   split_sizesnew_image_featuresr_   r`   r\   image_featurerN   base_image_featurerg   rh   vision_tower_image_sizera   rb   eunitcrc   rd   ri   num_of_framesweightscaled_shapeextend_start_loc_cpur   prefix_lens_cpuptr   	start_idxseq_len
prefix_lenimage_offsettmp_image_featurepad_leninput_offsetleft_idx	right_idxr)   r   r/   forward   s  

















	




zLlavaBaseForCausalLM.forwardweightsc                 C   s  | j j}d|v rtj|tjd | _nd|v r'tj|tjd | _d| j _	| j
  | j j| _| j j	| _| jj j| _| jj j| _t| j dd| _t| j dd| _t| j d	d | _t| j| j d
 | _| jdkso| jdkrpn| jdkr}|  jd7  _ntd| j ddddd}t|  }|D ]<\}}d|v sd|v sd|v r| D ]\}}||v r|||}q|| }	t|	dt}
|
|	| q| j||fg qd S )Ncliptorch_dtypesigliprs   r   flatrN   squarerK   r;   rr   	cls_patchr>   Unexpected select feature: zmulti_modal_projector.linear_1zmulti_modal_projector.linear_2rL   z"language_model.model.image_newline)zmodel.mm_projector.0zmodel.mm_projector.2zmodel.vision_tower.vision_towerzmodel.image_newline	projectorr   weight_loader)rM   mm_vision_towerr   from_pretrainedr   float16cudarL   r   mm_vision_select_featureevalmm_vision_select_layerru   rv   rG   rH   getattrr   rN   rK   rQ   rI   rV   select_featuredictnamed_parametersitemsreplacer   r   load_weights)rY   r   vision_pathprojector_weightsparams_dictnameloaded_weightweight_name
param_nameparamr   r)   r)   r/   r     s\   





z!LlavaBaseForCausalLM.load_weightsc                 C   s   | j | j S r3   )rG   rH   rY   r)   r)   r/   rJ     s   z)LlavaBaseForCausalLM.num_patches_per_sideN)__name__
__module____qualname__r   rQ   r   rk   r	   r   Tensorr{   no_grad
LongTensorr   r   r   r   strr   propertyrJ   r)   r)   r)   r/   r%   9   s,    D
  #9r%   c                	       8   e Zd Z		d	dedee deddf fddZ  ZS )
LlavaLlamaForCausalLMN rM   quant_configprefixrm   c                    s   t    || _d | _|j| jj_|j| jj_t|| _	t
||td|d| _dt|ddv rAttj|jjtjd| jj_d S d S )Nr   r	  r
  r   r   r  dtype)super__init__rM   rL   mm_hidden_sizevision_confighidden_sizetext_configr   rw   r   r"   r   r   r
   	Parameterr   emptyr   r   r   rY   rM   r	  r
  	__class__r)   r/   r    s    

zLlavaLlamaForCausalLM.__init__Nr  	r   r   r   r   r   r   r  r  __classcell__r)   r)   r  r/   r        r  c                	       r  )
LlavaQwenForCausalLMNr  rM   r	  r
  rm   c                       t    || _d | _t| jdd d u rt| jj| j_t| jdd d u r-t| jj	| j_
|j| jj_|j| jj
_t| jdd d u rFd| j_t| jdd d u rSd| j_t|| _t||td|d| _d	t|d
dv r}ttj|j
jtjd| jj_d S d S )Nr  r  projector_hidden_actgelurU   i^P r   r  r   r   r  r  )r  r  rM   rL   r   r   r   r  r   _name_or_pathr  r  r  r  rU   r   rw   r   r"   r   r
   r  r   r  r   r   r   r  r  r)   r/   r    0   

zLlavaQwenForCausalLM.__init__r  r  r)   r)   r  r/   r    r  r  c                	       r  )
LlavaMistralForCausalLMNr  rM   r	  r
  rm   c                    r  )Nr  r  r  r   rU   i }  r   r  r   r   r  r  )r  r  rM   rL   r   r   r   r  r   r!  r  r  r  r  rU   r   rw   r   r"   r   r
   r  r   r  r   r   r   r  r  r)   r/   r  6  r"  z LlavaMistralForCausalLM.__init__r  r  r)   r)   r  r/   r#  5  r  r#  c                       s   e Zd ZdZeZedd Zdee	 de
f fddZefdee fd	d
Zedee deeef fddZ		d"dedee deddf fddZdee dejfddZ		d#dejdejdededee f
ddZdee eejf  fd d!Z!  Z"S )$LlavaForConditionalGenerationa  
    An adaptor class to enable support for multiple mmlm such as mistral-community/pixtral-12b
    It follows the structure of (vision_tower, multi_modal_projector, language_model)

    Once a model config is loaded, text_config and vision_config will be extracted, and
    LlavaForConditionalGeneration will load the language_model and vision_tower models
    according to config.
    c                 C   s   | j S r3   r   r   r)   r)   r/   r  f  s   z#LlavaForConditionalGeneration.dtyper&   r'   c                    s(   t | jdr| j||S t ||S )Nrk   )hasattrrL   rk   r  )rY   r&   r'   r  r)   r/   rk   j  s   z+LlavaForConditionalGeneration.pad_input_idsauto_model_typec                 C   s   |j j}| |}|| }rPt|tr)|d }td|j d| d|j  z
tj	j
|d W S  tyO } zt|j d| d| d| d}~ww t|j d	| d
)a  
        Get the SGLang model implementation class according to config.

        Args:
            config: The config object of the model.
            auto_model_type: The type of the auto model.

        Returns:
            The SGLang model implementation class.
        r   z	Multiple z$ models found for submodule config `z`, defaulting to [0]: z found a corresponding model `z` for config class `z5`, but failed to load it from SGLang ModelRegistry. 
Nz5 cannot find a corresponding model for config class ``)r  r   %_config_cls_name_to_arch_name_mappingget
isinstancetupler$   warning
sgl_modelsregistryModelRegistryresolve_model_clsr   rV   )rY   rM   r&  config_cls_namearch_name_mappingarchr   r)   r)   r/   _get_sgl_model_clsp  s&   

z0LlavaForConditionalGeneration._get_sgl_model_clsrm   c                 C   s^   i }|j  D ]%}|j |d }|d ur,t|tr&tdd |D ||j< q|j||j< q|S )Nc                 s   s    | ]}|j V  qd S r3   )r   )r-   r3  r)   r)   r/   r7     s    
zVLlavaForConditionalGeneration._config_cls_name_to_arch_name_mapping.<locals>.<genexpr>)_model_mappingkeysr)  r*  r+  r   )rY   r&  mapping
config_clsarchsr)   r)   r/   r(    s   
zCLlavaForConditionalGeneration._config_cls_name_to_arch_name_mappingNr  rM   r	  r
  c                    s  t    t|dsJ t|dsJ || _| jj| _| jj| _t| jd| _t| jds2| j| j_t| jds=| j| j_t| jdsI| jj| j_t| jdsSd| j_	t| jdsoddgd	d	gd
d
gddgddgddgg| j_
t| jdsyd| j_t| jdsd| j_t| jdsd| j_t| jdd| _t| jdd| _| jj| _| jj| _| jj| _| jj	| _	| jj
| _
t| j| j d | _| || _| | jt}| | jt}|| j|td|d| _|| j|td|d| _dt| jddv r	ttj| jj | jd| jj!_"d S d S )Nr  r  r   r   rN   r9   rK   `      i  i   i   i   r   r   rU   
   r  r   ru   r   rv   rs   r;   r   r  rL   r   r  r  )#r  r  r%  rM   r  r  r   r   r   rN   rK   r   rU   r  ru   rv   rG   rH   rQ   rI   MULTIMODAL_PROJECTOR_TYPErw   r4  r   r   r"   r   rL   r
   r  r   r  r  r   r   )rY   rM   r	  r
  language_model_clsvision_model_clsr  r)   r/   r    sv   









z&LlavaForConditionalGeneration.__init__r   c           	      C   s   g }|D ]B}|j |j}}| j||dd}|j| j }| jdv r+|ddddf }n| jdkr3|}ntd| j || |	d qt
j|dd	}|S )
a&  Extract features from image inputs.

        Args:
            items: List of MultimodalDataItem objects containing image data
                Note that an item can be either "image" or "multi-images"

        Returns:
            torch.Tensor: features from image inputs, concatenated
        Trn   rp   Nr>   rs   r   r   r   )r   r+   rL   rt   ru   rv   rV   rW   rw   squeezer   r   )	rY   r   featuresr.   rl   r+   rx   ry   retr)   r)   r/   get_image_feature  s*   



z/LlavaForConditionalGeneration.get_image_featureFr|   r}   get_embeddingpp_proxy_tensorsc              
   C   s&   t |||| jtj| jid ||d}|S )N)r&   r}   rD  r   data_embedding_funcsplaceholder_tokensr|   rE  )r   r   r   r   rC  )rY   r&   r|   r}   rD  rE  rt   r)   r)   r/   r     s   z%LlavaForConditionalGeneration.forwardr   c                 C   s   | j dks
| j dkrn| j dkr|  jd7  _ntd| j  t|  }|D ]4\}}dD ]}||rL|t|d d }t| |||fg  nq.|| }t|d	t	}||| q(dS )
a(  Load weights for LlavaForConditionalGeneration.

        Unlike the base class implementation, this one doesn't need to handle
        weight name remapping as the weights are already properly structured with
        'language_model' and 'vision_tower' prefixes in the safetensors files.
        rr   rs   r   r>   r   )r   rL   .Nr   )
rv   rI   rV   r   r   r   rD   r   r   r   )rY   r   r   r   r   partr   r   r)   r)   r/   r   %  s*   





z*LlavaForConditionalGeneration.load_weightsr  )FN)#r   r   r   __doc__r   r=  r  r  r   rQ   r   rk   r   r   r4  r   r   r  r(  r   r   r   r  r   r   r  rC  r   boolr   r   r   r   r   r  r)   r)   r  r/   r$  Z  sN    	

M(
$r$  )CrJ  rE   rO   	functoolsr   typingr   r   r   r   r   r   r	   r   r   r   r
   transformersr   r   r   r   r   r   &transformers.models.auto.modeling_autor   r   (transformers.models.llava.modeling_llavar   sglang.srt.modelssrtmodelsr-  *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.llamar   sglang.srt.models.mistralr   sglang.srt.models.qwen2r   sglang.srt.multimodal.mm_utilsr   r    r!   sglang.srt.utilsr"   r#   r$   Moduler%   r  r  r#  r$  
EntryClassr)   r)   r)   r/   <module>   sH   $    @%% p