o
    -iQ                     @   s  U d Z ddlmZmZmZ ddlmZmZmZ ddl	Z	ddl
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ee9Z:G dd de+Z;e;Z<ee=d< G dd dej>Z?G dd dej>Z@G dd de$ZAG dd  d e"eA ZBG d!d" d"e#eA ZCejDeCeAeBd#G d$d% d%ej>e/e.e0ZEdS )&zInference-only BAGEL model compatible with HuggingFace weights.

BAGEL is a unified multimodal model for image understanding and generation.
For vLLM, we focus on the image understanding (vision-to-text) capabilities.
    )IterableMappingSequence)AnyLiteral	TypeAliasN)
VllmConfig)BaseDummyOptions)init_logger)
get_act_fn)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacement)IntermediateTensors)BagelProcessor)TensorSchema   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderStageMissingLayerWeightsMapperinit_vllm_registered_modelmaybe_prefixc                   @   s(   e Zd ZU dZed ed< ejed< dS )BagelImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypeN)__name__
__module____qualname____doc__r   __annotations__torchTensor r0   r0   ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/bagel.pyr&   <   s   
 r&   BagelImageInputsc                       s\   e Zd ZdZ			ddedededed	edB d
ef fddZdej	dej	fddZ
  ZS )BagelVisionMLPz"MLP connector for vision features.gelu_pytorch_tanhN in_featureshidden_featuresout_features	act_layerquant_configprefixc                    sL   t    t||d|| dd| _t|| _t||d|| dd| _d S )NTz.fc1)biasr:   r;   z.fc2)super__init__r   fc1r   actr   fc2)selfr6   r7   r8   r9   r:   r;   	__class__r0   r1   r>   O   s    
	
zBagelVisionMLP.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r?   r@   rA   )rB   rE   _r0   r0   r1   forwardi   s   
zBagelVisionMLP.forward)r4   Nr5   )r)   r*   r+   r,   intstrr   r>   r.   r/   rI   __classcell__r0   r0   rC   r1   r3   L   s&    r3   c                       sz   e Zd ZdZdedef fddZededefdd	Zedefd
dZedefddZ	de
jde
jfddZ  ZS )PositionEmbeddingzA2D position embedding for vision tokens using sin-cos embeddings.max_num_patch_per_sidehidden_sizec                    s@   t    || _|| _| ||}| jdt| dd d S )N	pos_embedF)
persistent)	r=   r>   rN   rO   _get_2d_sincos_pos_embedregister_bufferr.   
from_numpyfloat)rB   rN   rO   rP   rC   r0   r1   r>   s   s   

zPositionEmbedding.__init__	embed_dim	grid_sizec                 C   sd   ddl }|j||jd}|j||jd}|||}|j|dd}|dd||g}t| |}|S )z(Generate 2D sin-cos position embeddings.r   Ndtypeaxis   r   )numpyarangefloat32meshgridstackreshaperM   "_get_2d_sincos_pos_embed_from_grid)rV   rW   npgrid_hgrid_wgridrP   r0   r0   r1   rR      s   z*PositionEmbedding._get_2d_sincos_pos_embedc                 C   sV   ddl }| d dksJ t| d |d }t| d |d }|j||gdd}|S )z2Generate 2D sin-cos position embeddings from grid.r   Nr\   r   rZ   )r]   rM   "_get_1d_sincos_pos_embed_from_gridconcatenate)rV   rg   rd   emb_hemb_wembr0   r0   r1   rc      s   z4PositionEmbedding._get_2d_sincos_pos_embed_from_gridc                 C   s   ddl }| d dksJ |j| d |jd}|| d  }dd|  }|d}|d	||}||}||}|j||gd
d}|S )z(Generate 1D sin-cos position embeddings.r   Nr\   rX   g       @g      ?i'  zm,d->mdr   rZ   )r]   r^   float64rb   einsumsincosri   )rV   posrd   omegaoutemb_sinemb_cosrl   r0   r0   r1   rh      s   


z4PositionEmbedding._get_1d_sincos_pos_embed_from_gridposition_idsrF   c                 C   s   | | jj}| j| S )z
        Args:
            position_ids: Flattened position IDs, shape (N,) where each ID
                         corresponds to a position in the flattened grid
        Returns:
            Position embeddings of shape (N, hidden_size)
        )torP   device)rB   rw   r0   r0   r1   rI      s   	
zPositionEmbedding.forward)r)   r*   r+   r,   rJ   r>   staticmethodrR   rc   rh   r.   r/   rI   rL   r0   r0   rC   r1   rM   p   s    rM   c                   @   sx   e Zd ZdZdedefddZdeee	dB f fddZ
d	e	d
eee	f deee	f fddZde	de	de	fddZdS )BagelProcessingInfoz'Processing information for BAGEL model.kwargsrF   c                 K   sH   ddl m} || jjj| jjj| jjjd}|  }td||d|S )Nr   )cached_get_image_processor)revisiontrust_remote_code)image_processor	tokenizerr0   )	!vllm.transformers_utils.processorr}   ctxmodel_configmodelr~   r   get_tokenizerr   )rB   r|   r}   r   r   r0   r0   r1   get_hf_processor   s   z$BagelProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nimager0   )rB   r0   r0   r1   get_supported_mm_limits   s   z+BagelProcessingInfo.get_supported_mm_limitsseq_len	mm_countsc                 C   s   |   }|jd }d|iS )Nr\   r   )get_hf_configvit_max_num_patch_per_side)rB   r   r   	hf_configmax_num_patchesr0   r0   r1   get_mm_max_tokens_per_item   s   
z.BagelProcessingInfo.get_mm_max_tokens_per_itemimage_widthimage_heightc                C   s,   |   }|j}|j}|| }|| }|| S rG   )r   
vit_config
patch_size)rB   r   r   r   r   r   num_patches_hnum_patches_wr0   r0   r1   get_num_image_tokens   s   z(BagelProcessingInfo.get_num_image_tokens)r)   r*   r+   r,   objectr   r   r   rK   rJ   r   r   r   r0   r0   r0   r1   r{      s$    


r{   c                	   @   s\   e Zd ZdZdeeef defddZ	ddedeeef deeef dB de	fd	d
Z
dS )BagelDummyInputsBuilderz-Build dummy inputs for BAGEL model profiling.r   rF   c                 C   s   | dd}d| S )Nr   r   <|image_pad|>)get)rB   r   
num_imagesr0   r0   r1   get_dummy_text   s   z&BagelDummyInputsBuilder.get_dummy_textNr   
mm_optionsc           	      C   sJ   | dd}| j }|j}|j}|r| dnd }d| j||||diS )Nr   r   )widthheightr   	overrides)r   infor   r   
image_size_get_dummy_images)	rB   r   r   r   r   r   r   r   image_overridesr0   r0   r1   get_dummy_mm_data   s   
z)BagelDummyInputsBuilder.get_dummy_mm_datarG   )r)   r*   r+   r,   r   rK   rJ   r   r	   r   r   r0   r0   r0   r1   r      s    	
r   c                
   @   s   e Zd ZdZdededeeef deeef def
ddZ	dedeee
f d	edee fd
dZde
deeef deeef fddZdS )BagelMultiModalProcessorz%Multimodal processor for BAGEL model.prompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsrF   c                 C   s   dS )NFr0   )rB   r   r   r   r   r0   r0   r1   _hf_processor_applies_updates  s   z6BagelMultiModalProcessor._hf_processor_applies_updatesout_mm_kwargsc                    sX   | j   | j  }| ddu rtddtf fdd}tdg|dgS )	z=Replace image placeholders with the correct number of tokens.r   Nz=Image token '<|image_pad|>' not found in tokenizer vocabularyitem_idxc                    s    j d }g| S )Nr\   )r   )r   
num_tokensr   image_token_idr0   r1   get_replacement_bagel-  s   

zKBagelMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_bagelr   )modalitytargetreplacement)r   r   r   	get_vocabr   
ValueErrorrJ   r   )rB   r   r   r   r   r   r0   r   r1   _get_prompt_updates  s   

z,BagelMultiModalProcessor._get_prompt_updates	hf_inputsc                 C   s   dt diS )Nr'   r   )r   batched)rB   r   r   r0   r0   r1   _get_mm_fields_config;  s   
z.BagelMultiModalProcessor._get_mm_fields_configN)r)   r*   r+   r,   rK   r   r   r   boolr   r   r   r   r   r   r   r   r0   r0   r0   r1   r     s:    


	



r   )r   dummy_inputsc                       s   e Zd ZdZeddddddZeded	ed
edB fddZ	ddde
def fddZded
edB fddZded
eejdf fddZded
efddZ		d)dejdejdedB d ejdB ded
ejeB fd!d"Zd#ejd
ejdB fd$d%Zd&eeeejf  d
ee fd'd(Z  ZS )*BagelForConditionalGenerationz
    BAGEL: A unified multimodal model for image understanding and generation.

    For vLLM, we focus on the image understanding (vision-to-text) capabilities.
    The image generation part is not supported in vLLM.
    language_model.
vit_model.
connector.vit_pos_embed.)r   r   r   r   )orig_to_new_prefixr   irF   Nc                 C   s   | drdS td)Nr   r   z Only image modality is supported)
startswithr   )clsr   r   r0   r0   r1   get_placeholder_str^  s   
z1BagelForConditionalGeneration.get_placeholder_strr5   )r;   vllm_configr;   c          	   
      s  t    |jj}|j}|jj}t|jdkr"tdt|j d|| _	|| _| 
| t||jt|ddgd| _W d    n1 sFw   Y  |jr|j}|jdkr^td d	|_t|d
sktd d|_| |d2 t||t|dd| _|jj}|jj}t||||j|t|dd| _t|j|d| _W d    n1 sw   Y  nt d| _t d| _t d| _| jj!| _!d S )NBagelConfigzExpected BagelConfig, got z0. Make sure the model config is properly loaded.language_modelQwen2ForCausalLM)r   r   r;   architectures   zZOverriding vit_config.num_hidden_layers from 27 to 26 to match the Bagel model checkpoint.   vision_use_headz_Setting vit_config.vision_use_head to False as it is not present in the Bagel model checkpoint.Fr   	vit_model)configr:   r;   	connector)r6   r7   r8   r9   r:   r;   )rN   rO   image_tower)"r=   r>   r   r   r:   multimodal_configr(   r)   r   r   _mark_language_modelr$   
llm_configr%   r   
visual_undr   num_hidden_layersloggerwarninghasattrr   _mark_tower_modelr    r   rO   r3   connector_actr   rM   r   vit_pos_embedr"   make_empty_intermediate_tensors)	rB   r   r;   r   r:   r   r   vit_hidden_sizellm_hidden_sizerC   r0   r1   r>   e  sr   

	






z&BagelForConditionalGeneration.__init__r|   c                 K   s$   | dd }|d u rd S td|dS )Nr'   )r(   r'   )popr&   )rB   r|   r'   r0   r0   r1   _parse_and_validate_image_input  s   z=BagelForConditionalGeneration._parse_and_validate_image_inputimage_input.c                 C   s   |d }|j dkr|j\}}}}}||| |||}| |}| |}	|	j\}}
}| jjj}| jjj}|| }t	j
||	jd}t	j
||	jd}|dddf | jj |  }|d|d }| |}|||
|}||	j}|	| }	t|	S )z:Process image inputs through vision encoder and connector.r'      )ry   Nr   rm   )ndimshaperb   r   r   r   r   r   r   r.   r^   ry   r   flatten	unsqueezeexpandr   rx   tuple)rB   r   r'   
batch_sizer   channelsr   r   vision_featuresvision_embedsnum_patchesrO   r   r   num_patches_per_sideh_coordsw_coordsrw   
pos_embedsr0   r0   r1   _process_image_input  s.   





z2BagelForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|du rg S | |S )z%Get multimodal embeddings from input.Nr0   )r   r   )rB   r|   r   r0   r0   r1   embed_multimodal  s   
z.BagelForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |durd}| j j||||d}|S )an  Run forward pass for BAGEL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a batch.
            positions: Flattened (concatenated) position ids corresponding to a batch.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.
        N)r   r   r   r   )r   r   )rB   r   r   r   r   r|   hidden_statesr0   r0   r1   rI     s   z%BagelForConditionalGeneration.forwardr   c                 C   s   | j |S rG   )r   compute_logits)rB   r   r0   r0   r1   r     s   z,BagelForConditionalGeneration.compute_logitsweightsc                    s   g d}ddg}g }|D ]Z\ }t  fdd|D rqt  fdd|D r(qd v r_|jdkr_|jd	 }|jd
 }| jjj}| jjj}	||	| | kr_|||||	}|d	dd
d	 }|
 |f qt| dgd}
|
j|| jdS )zLoad weights from checkpoint.)moe_genlatent_pos_embedllm2vaevae2llmtime_embedderzdecoder.zencoder.c                 3   s    | ]}| v V  qd S rG   r0   ).0skipnamer0   r1   	<genexpr>4  s    z=BagelForConditionalGeneration.load_weights.<locals>.<genexpr>c                 3   s    | ]}  |V  qd S rG   )r   )r  r;   r  r0   r1   r  6  s    zpatch_embedding.weightr\   r   r      zvit_pos_embed.pos_embed)skip_prefixes)mapper)anyr   r   r   r   r   num_channelsrb   permute
contiguousappendr!   load_weightshf_to_vllm_mapper)rB   r   generation_keywordsvae_prefixesfiltered_weightstensorout_channelsr6   r   in_channelsloaderr0   r  r1   r    s.   	



z*BagelForConditionalGeneration.load_weights)NN)r)   r*   r+   r,   r#   r  classmethodrK   rJ   r   r   r>   r   r2   r   r   r.   r/   r   r   r   r   rI   r   r   setr  rL   r0   r0   rC   r1   r   E  sX    	P

2

,r   )Fr,   collections.abcr   r   r   typingr   r   r   r.   torch.nnnnvllm.configr   vllm.config.multimodalr	   vllm.loggerr
   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   vllm.multimodal.processingr   r   r   r   vllm.sequencer   (vllm.transformers_utils.processors.bagelr   vllm.utils.tensor_schemar   
interfacesr   r   r   r   siglipr    utilsr!   r"   r#   r$   r%   r)   r   r&   r2   r-   Moduler3   rM   r{   r   r   register_processorr   r0   r0   r0   r1   <module>   sJ   $N2 5


