o
    
۾iRW                     @   s  U d dl Z d dlmZmZmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lm Z m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZCmDZD ddlEmFZF ddlGmHZHmIZImJZJ ddlKmLZL ddlMmNZNmOZOmPZPmQZQ ddlRmSZSmTZTmUZU G dd deCZVG dd deCZWeVeWB ZXe
eYd < G d!d" d"e	ZZG d#d$ d$ed%d&Z[G d'd( d(eZ\G d)d* d*ej]Z^G d+d, d,e<Z_ed-e_d.Z`G d/d0 d0e,e` ZaG d1d2 d2e;e` Zbd3e=d4e_fd5d6Zcdd7d8e`d9e:e` d:e0dB d4e;fd;d<Zddd=d>d?eZd@e*dB dAeedB dBefd4eFeLB f
dCdDZge.jhedeceadEG dFdG dGej]eIeJZidS )H    N)IterableMappingSequence)	AnnotatedFinalLiteralProtocol	TypeAliasTypeVar)BatchFeatureCLIPVisionConfigPretrainedConfigSiglipVisionConfig)LlavaConfig)
ImageInputget_image_sizeto_numpy_array)LlavaProcessor)ProcessingKwargsUnpack)PreTokenizedInput	TextInput)
VllmConfig)
get_act_fn)ColumnParallelLinearRowParallelLinear)QuantizationConfig)LlavaDummyInputsBuilder)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalFieldConfigMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderget_layer_indexinit_vllm_registered_modelmaybe_prefix)VisionEncoderInfoget_num_selected_vision_tokensget_vision_encoder_infoc                   @   s>   e Zd ZU dZdZed ed< eej	e
ddddf ed< dS )	TarsierImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypebn   hwN__name__
__module____qualname____doc__r>   r   __annotations__r   torchTensorr.    rK   rK   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/tarsier.pyr<   ?   s   
  r<   c                   @   s<   e Zd ZU dZdZed ed< eej	e
dddf ed< dS )	TarsierImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    image_embedsr>   r?   ifshsdataNrC   rK   rK   rK   rL   rM   L   s   
 rM   TarsierImageInputsc                   @   s   e Zd ZU ee ed< ee ed< ee ed< ee ed< eeee B  ed< ee ed< ee ed< ee ed< d	Z	e
ed
< dS )TarsierHfConfigvision_configtext_configimage_token_indexvision_feature_select_strategyvision_feature_layerprojector_hidden_actimage_newline_idximage_new_idxTmultimodal_projector_biasN)rD   rE   rF   r   r   rH   intstrlistr\   boolrK   rK   rK   rL   rS   \   s   
 rS   c                   @   s   e Zd Zddii dZdS )TarsierProcessorKwargspaddingF)text_kwargsimages_kwargsN)rD   rE   rF   	_defaultsrK   rK   rK   rL   ra   h   s
    
ra   F)totalc                	   @   sH   e Zd Z				ddedeeB ee B ee B dee de	fddZ
dS )	TarsierProcessorNimagestextkwargsreturnc                 K   sP  |d u r|d u rt d| jtfd| jji|}|d ur)| j|fi |d }ni }t|tr4|g}nt|tsDt|d tsDt d|}|	dd ur|d }	t
t|	d \}
}|
| j || j d  | j d }| jdkru|d8 }g }|D ]}|| j| j| }|| qy|d	 d
d }| j|fi |d	 }ti |||dS )Nz7You have to specify at least one of `images` or `text`.tokenizer_init_kwargsrd   r   zAInvalid input text. Please provide a string, or a list of stringsr=   r/   defaultrc   return_tensors)rQ   tensor_type)
ValueError_merge_kwargsra   	tokenizerinit_kwargsimage_processor
isinstancer^   r_   getr   r   
patch_sizenum_additional_image_tokensrW   replaceimage_tokenappendpopr   )selfrh   ri   audiovideosrj   output_kwargsimage_inputsprompt_stringsr=   heightwidthnum_image_tokenssamplern   text_inputsrK   rK   rL   __call__r   sZ   

zTarsierProcessor.__call__)NNNN)rD   rE   rF   r   r   r   r_   r   ra   r   r   rK   rK   rK   rL   rg   q   s(    	
rg   c                       sV   e Zd Z		ddedededededB def fd	d
Zdej	dej	fddZ
  ZS )TarsierMultiModalProjectorN vision_hidden_sizetext_hidden_sizerY   r\   quant_configprefixc                    sL   t    t||||| dd| _t|| _t||||| dd| _d S )Nz	.linear_1)biasr   r   z	.linear_2)super__init__r   linear_1r   actr   linear_2)r}   r   r   rY   r\   r   r   	__class__rK   rL   r      s    
	
z#TarsierMultiModalProjector.__init__image_featuresrk   c                 C   s*   |  |\}}| |}| |\}}|S N)r   r   r   )r}   r   hidden_states_rK   rK   rL   forward   s   
z"TarsierMultiModalProjector.forward)Nr   )rD   rE   rF   r]   r^   r`   r   r   rI   rJ   r   __classcell__rK   rK   r   rL   r      s"    r   c                   @   s   e Zd ZdefddZdefddZdedefddZ	de
eed	B f fd
dZdededefddZdefddZdefddZdefddZdefddZd	S )TarsierProcessingInfork   c                 C   s   | j tS r   )ctxget_hf_configHfLlavaConfigr}   rK   rK   rL   r         z#TarsierProcessingInfo.get_hf_configc                 C   s   t |  S r   )r;   r   r   rK   rK   rL   r;      r   z-TarsierProcessingInfo.get_vision_encoder_inforj   c                 K   s,   |   }|d|  | jjtfi |S )Nrw   )r;   
setdefaultget_patch_sizer   get_hf_processorrg   )r}   rj   vision_inforK   rK   rL   r      s   z&TarsierProcessingInfo.get_hf_processorNc                 C   s   dd iS )NimagerK   r   rK   rK   rL   get_supported_mm_limits      z-TarsierProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc          
      C   s   |   }|  }t|j||d|j}|dkr2|  }t|j|j|jd|j}|dkr0td|}t	t
|}|| d }	|	S )Nr   r   r   z4Could not determine a valid number of image patches.r/   )r   r;   r:   get_num_image_tokensrW   !get_image_size_with_most_featuresr   r   rp   r]   mathsqrt)
r}   r   r   	hf_configvision_encoder_infonum_projected_patchesdefault_sizenum_projected_patches_defaultnum_height_patchestotal_image_tokens_for_llmrK   rK   rL   r      s0   z*TarsierProcessingInfo.get_num_image_tokensc                 C   s    |   }|  }}t||dS )N)r   r   )r;   r   r$   )r}   r   r   r   rK   rK   rL   r     s   z7TarsierProcessingInfo.get_image_size_with_most_featuresc                 C   s   |   \}}| j||dS )Nr   )r   r   )r}   target_widthtarget_heightrK   rK   rL   get_max_image_tokens  s
   z*TarsierProcessingInfo.get_max_image_tokensc                 C   
   |   jS r   )r   rZ   r   rK   rK   rL   get_image_newline_idx     
z+TarsierProcessingInfo.get_image_newline_idxc                 C   r   r   )r   r[   r   rK   rK   rL   get_image_new_idx  r   z'TarsierProcessingInfo.get_image_new_idx)rD   rE   rF   rS   r   r9   r;   objectrg   r   r   r^   r]   r   r   r$   r   r   r   r   rK   rK   rK   rL   r      s     
r   
_I_Tarsier)boundc                   @   s   e Zd ZdS )TarsierDummyInputsBuilderN)rD   rE   rF   rK   rK   rK   rL   r     s    r   c                	   @   sX   e Zd Zdedeeef deeef fddZde	deeef de
dee fdd	Zd
S )TarsierMultiModalProcessor	hf_inputshf_processor_mm_kwargsrk   c                 C   s   t tdtddS )Nr   )r=   rN   )dictr    batched)r}   r   r   rK   rK   rL   _get_mm_fields_config  s   z0TarsierMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s8   j  }|j dtf fdd}td g|dgS )Nitem_idxc                    sh    dttf}t|tr || }tt|}|| d }n|| }j	j
|j|jd} g| S )Nr   r/   r   )	get_itemsr"   r#   ru   get_feature_sizer]   r   r   r   infor   r   r   )r   rh   r   r   num_final_image_tokens
image_sizeimage_token_idr   r}   rK   rL   get_replacement0  s   



zGTarsierMultiModalProcessor._get_prompt_updates.<locals>.get_replacementr   )modalitytargetreplacement)r   r   rV   r]   r*   )r}   r   r   r   r   r   rK   r   rL   _get_prompt_updates'  s   
z.TarsierMultiModalProcessor._get_prompt_updatesN)rD   rE   rF   r   r   r^   r   r    r   r%   r!   r   r+   r   rK   rK   rK   rL   r     s"    




r   r   rk   c                 C   s   t | S r   )r   )r   rK   rK   rL   _build_tarsier_hf_infoL  r   r   cacher   dummy_inputsr   c                C   s$   t | trt| ||dS tt| )Nr   )ru   r   r   NotImplementedErrorr>   )r   r   r   rK   rK   rL   _build_tarsier_hf_processorP  s   
r   r   )require_post_normr   r   r   r   r   c                   s   | j }| j}|j t|trt| }nt|ttfr't fdd|D }n
t	dt
| dt|tr?t|||||dS t|trMt|||||dS dt
| }t|)Nc                 3   s    | ]}t | V  qd S r   )r6   ).0idxbase_num_hidden_layersrK   rL   	<genexpr>p  s    

z0init_vision_tower_for_tarsier.<locals>.<genexpr>zvision_layer_feature type:  is not supported)r   num_hidden_layers_overrider   r   z'Unsupported vision config for Tarsier: )rT   rX   num_hidden_layersru   r]   r6   r_   tuplemax	TypeErrorr>   r   r0   r   r4   r   )r   r   r   r   rT   feature_layersnum_hidden_layers_to_initmsgrK   r   rL   init_vision_tower_for_tarsier_  s@   



r   )r   r   c                       s  e Zd Zg dddgdZededededB fd	d
Zdddededdf fddZ	de
dedB fddZdeeB dejeej B dejeejdf B fddZdejdejfddZdedejeejdf B fddZdedejeejdf B fd d!Zde
defd"d#Z		d0d$ejdB d%ejd&edB d'ejdB de
dejeB fd(d)Zd*ejdejdB fd+d,Zd-eeeejf  dee fd.d/Z  Z S )1TarsierForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   irk   Nc                 C   s   | drdS td)Nr   z<image>z Only image modality is supported)
startswithrp   )clsr   r   rK   rK   rL   get_placeholder_str  s   
z3TarsierForConditionalGeneration.get_placeholder_strr   )r   vllm_configr   c             
      s$  t    |jj}|j}|| _| |dM t||dt|dd| _	t
|dd}t|jj|jj|j||t|dd| _| jd	tj|jgtjd
dd | jdtj|jgtjd
dd W d    n1 sew   Y  | | t||jt|dd| _W d    n1 sw   Y  | jj| _d S )Nr   Fvision_tower)r   r   r   r\   Tmulti_modal_projector)r   r   rY   r\   r   r   image_newline_idx_tensor)dtype)
persistentimage_new_idx_tensorlanguage_model)r   r   r   )r   r   model_configr   r   config_mark_tower_modelr   r8   r   getattrr   rT   hidden_sizerU   rY   r   register_bufferrI   tensorrZ   longr[   _mark_language_modelr7   r  make_empty_intermediate_tensors)r}   r   r   r  r   projector_biasr   rK   rL   r     sP   

	z(TarsierForConditionalGeneration.__init__rj   c                 K   s\   | dd }| dd }|d u r|d u rd S |d ur td|dS |d ur*td|dS td)Nr=   rN   )r>   r=   )r>   rQ   z This line should be unreachable.)r|   r<   rM   AssertionError)r}   rj   r=   rN   rK   rK   rL   _parse_and_validate_image_input  s   z?TarsierForConditionalGeneration._parse_and_validate_image_inputr   r=   .c                 C   s   ||| j jdS )N)feature_select_strategy)r  rW   )r}   r   r=   rK   rK   rL   _image_pixels_to_features  s   z9TarsierForConditionalGeneration._image_pixels_to_featuresprojected_image_featuresc                 C   s$  |j \}}}tt|}|| }|j}| jjj}|| j	|
d}	|| j	|
d}
z
|||||}W n( tya } ztd|j  d| d| d| d| d| d| d|d}~ww |	||d	|f}tj||gd
d}|| }||||}|
|d	|f}tj||gd	d}|S )z@
        Implements Tarsier's `add_split_tokens` logic.
        r   z3Cannot reshape projected_image_features with shape z to (z, z[). Ensure num_projected_patches is compatible with a grid structure. num_projected_patches=z, derived num_height_patches=. Nr/      )dim)shaper]   r   r   devicer  modelembed_tokensr   tosqueezer   viewRuntimeErrorexpandrI   cat)r}   r  
num_imagesr   	embed_dimr   num_width_patchesr  embedding_layerimage_newline_embimage_new_embcurrent_image_features_grideimage_newline_expandedfeatures_with_newlinesnew_num_patches_after_newlinefeatures_with_newlines_flatimage_new_expandedfinal_image_featuresrK   rK   rL   _add_tarsier_split_tokens  sl   

	
z9TarsierForConditionalGeneration._add_tarsier_split_tokensinputsc                 C   sN   |d }|  | j|}t|tjr| |}| |}|S tdt| d)Nr=   z _image_pixels_to_features type: r   )	r  r   ru   rI   rJ   r   r-  r   r>   )r}   r.  r=   image_features_selectedprojected_featuresfinal_featuresrK   rK   rL   _process_image_pixels"  s   

z5TarsierForConditionalGeneration._process_image_pixelsimage_inputc                 C   sH   |d dkr|d }t |tjr| |S tdt| d| |S )Nr>   rN   rQ   z*Incorrect type of image_embeds. Got type: r  )ru   rI   rJ   r-  rp   r>   r2  )r}   r3  r0  rK   rK   rL   _process_image_input4  s   

z4TarsierForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S )NrK   )r  r4  )r}   rj   r3  rK   rK   rL   embed_multimodalD  s   
z0TarsierForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r6  r7  r8  r9  )r  r  )r}   r6  r7  r8  r9  rj   r   rK   rK   rL   r   J  s   z'TarsierForConditionalGeneration.forwardr   c                 C   s   | j |S r   )r  compute_logits)r}   r   rK   rK   rL   r:  ]  s   z.TarsierForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S r   )r5   load_weights)r}   r;  loaderrK   rK   rL   r<  c  s   
z,TarsierForConditionalGeneration.load_weights)NN)!rD   rE   rF   packed_modules_mappingclassmethodr^   r]   r   r   r   r   rR   r  r0   r4   rI   rJ   r_   r   r  r-  r<   r2  r4  r1   r5  r,   r   r:  r   setr<  r   rK   rK   r   rL   r     sp     0


1




,r   )jr   collections.abcr   r   r   typingr   r   r   r   r	   r
   rI   torch.nnnntransformersr   r   r   r   r   r   transformers.image_utilsr   r   r   transformers.models.llavar   transformers.processing_utilsr   r   $transformers.tokenization_utils_baser   r   vllm.configr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.llavar   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr    r!   vllm.multimodal.parser"   r#   r$   r%   vllm.multimodal.processingr&   r'   r(   r)   r*   r+   vllm.sequencer,   vllm.utils.tensor_schemar-   r.   clipr0   
interfacesr1   r2   r3   siglipr4   utilsr5   r6   r7   r8   visionr9   r:   r;   r<   rM   rR   rH   rS   ra   rg   Moduler   r   r   r   r   r   r   r`   r^   r   register_processorr   rK   rK   rK   rL   <module>   s   
  	>#C0

.