o
    ib                     @   s*  U d Z ddlZddlmZmZmZ ddlmZmZm	Z	 ddl
Z
ddl
mZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z7 ddl8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@ G dd de3ZAG dd de3ZBeAeBB ZCe	eDd< G dd de,ZEG d d! d!e*eE ZFG d"d# d#e+eE ZGG d$d% d%ejHZIG d&d' d'ejHZJG d(d) d)ejHZKe jLeGeEeFd*G d+d, d,ejHe;e:ZMdS )-zBInference-only Idefics3 model compatible with HuggingFace weights.    N)IterableMappingSequence)	AnnotatedLiteral	TypeAlias)nn)BatchFeatureIdefics3ConfigIdefics3ImageProcessorIdefics3Processor)
VllmConfig)BaseDummyOptions)ReplicatedLinear)LogitsProcessor)QuantizationConfig)ParallelLMHead)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )Idefics2VisionTransformer)MultiModalEmbeddingsSupportsLoRASupportsMultiModal)
LlamaModel)AutoWeightsLoadermaybe_prefixc                   @   sj   e Zd ZU dZed ed< eeje	ddddf ed< eeje	dddf ed< eeje	d	f ed
< dS )Idefics3ImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - bnp: Batch size * number of images * number of patches
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypebnp   hwpixel_attention_maskbnnum_patchesN
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr#    r>   r>   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/idefics3.pyr,   E   s   
 	r,   c                   @   s8   e Zd ZU dZed ed< eeje	dddf ed< dS )	Idefics3ImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    image_embedsr.   r4   fr1   dataNr6   r>   r>   r>   r?   r@   U   s   
 r@   ImageInputsc                   @   sD  e Zd ZdedefddZdeeedB f fddZ	dddd	d
edededB dededB de
eef fddZdededede
eef fddZdedededB de
eef fddZdedededB defddZdedB de
eeef fddZdedededB defddZdedededB defdd Zdefd!d"ZdS )#Idefics3ProcessingInfokwargsreturnc                 K   s   | j jtfi |S N)ctxget_hf_processorr   )selfrF   r>   r>   r?   rJ   e   s   z'Idefics3ProcessingInfo.get_hf_processorNc                 C   s   dd iS )Nimager>   rK   r>   r>   r?   get_supported_mm_limitsh   s   z.Idefics3ProcessingInfo.get_supported_mm_limitsr$   )max_lenmin_lenmax_sizeheightwidthrO   rP   rQ   c                C   s   |d u r	t ||n|}|| }|d urt||}||kr%|}t|| }n|}t|| }||d 7 }||d 7 }t ||}t ||}||fS N   )maxminint)rK   rR   rS   rO   rP   rQ   aspect_ratior>   r>   r?   _resize_output_sizek   s   



z*Idefics3ProcessingInfo._resize_output_sizeimage_widthimage_heightresolution_max_sidec          	      C   sN   |   }|j}|jd }||krtd||}}| j|||d\}}||fS )Nlongest_edgez<`resolution_max_side` cannot be larger than `max_image_size`)rR   rS   rO   )rJ   image_processorsize
ValueErrorrZ   )	rK   r[   r\   r]   hf_processorr_   max_image_sizerR   rS   r>   r>   r?   _get_resize_output_image_size   s   


z4Idefics3ProcessingInfo._get_resize_output_image_size	processorc                C   s   |d u r|   }|j}|jd }|jd }|| dksJ d| j|||d\}}||ks1||krCt|| }	t|| }
|
|	fS d }	}
|
|	fS )Nr^   r   z`longest_edge` in image_processor's `size` must be divisible by `longest_edge` in `max_image_size`, this may be caused by incorrect mm_kwargs override.)r[   r\   r]   )rJ   r_   rc   r`   rd   mathceil)rK   r[   r\   re   r_   rc   r`   resized_heightresized_widthgrid_hgrid_wr>   r>   r?   _get_image_feature_grid_size   s&   


z3Idefics3ProcessingInfo._get_image_feature_grid_sizec                C   s    | j |||d\}}|| d S )Nr[   r\   re   r$   )rl   )rK   r[   r\   re   rk   rj   r>   r>   r?   get_num_patches   s   
z&Idefics3ProcessingInfo.get_num_patchesc                 C   s,   |d u r|   }|j}|j}|j}|||fS rH   )rJ   image_tokenfake_image_tokenglobal_image_tag)rK   re   ro   rp   global_image_tokenr>   r>   r?   _get_image_token   s   
z'Idefics3ProcessingInfo._get_image_tokenc                C   s   |d u r|   }| |\}}}|j}d}|| }	|| |	 }
|| |	 }| j|||d\}}|dkr;|dkr;|
| S tt  }t|D ]$}t|D ]}|j|d |d d}|| ||d krg|d qJqDd	g |d|
|S )Nz<row_{n_h}_col_{n_w}>rm   r   r$   )n_hn_w
 )
rJ   rs   image_seq_lenrl   liststrrangeformatappendjoin)rK   r[   r\   re   ro   rp   global_img_tokenrx   grid_placeholderp_imgglobal_img_placeholdertile_img_placeholderrk   rj   tiles_placeholderijplaceholder_per_tiler>   r>   r?   get_image_repl   sJ   




z%Idefics3ProcessingInfo.get_image_replc                C   s*   |d u r|   }| j|||d}||j S )Nrm   )rJ   rn   rx   )rK   r[   r\   re   r5   r>   r>   r?   get_num_image_tokens  s   
z+Idefics3ProcessingInfo.get_num_image_tokensc                 C   s&   |   }|j}t|jd |jd dS )Nr^   )rS   rR   )rJ   r_   r   r`   )rK   re   r_   r>   r>   r?   !get_image_size_with_most_features   s   z8Idefics3ProcessingInfo.get_image_size_with_most_features)r7   r8   r9   objectr   rJ   r   rz   rX   rN   tuplerZ   rd   rl   rn   rs   r   r   r   r   r>   r>   r>   r?   rE   d   s    

#



 


.
rE   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Idefics3DummyInputsBuilder	mm_countsrG   c                 C   s0   | dd}| j }| j|\}}}|| S )NrL   r   )getinforJ   rs   )rK   r   
num_imagesre   ro   _r>   r>   r?   get_dummy_text+  s   
z)Idefics3DummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sN   | dd}| j }|j}|jd }|r| dnd }d| j||||diS )NrL   r   r^   )rS   rR   r   	overrides)r   r   rJ   r_   rc   _get_dummy_images)	rK   r   r   r   r   rb   r_   r^   image_overridesr>   r>   r?   get_dummy_mm_data3  s   

z,Idefics3DummyInputsBuilder.get_dummy_mm_datarH   )
r7   r8   r9   r   rz   rX   r   r   r   r   r>   r>   r>   r?   r   *  s    
r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )Idefics3MultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrG   c                    s   | dg  }sj |}|}tt|gdddS ddi|}t ||||}jj	d|idd	}|
dtfd
dttD }	jjdi |  fdd|	D }
t|
|d< |d d |d d |S )Nimages)	input_idspt)tensor_typeinput_data_formatchannels_lastrL   F)validatec                    s   g | ]}  |qS r>   )get_image_size).0r   )parsed_imagesr>   r?   
<listcomp>b  s    
zBIdefics3MultiModalProcessor._call_hf_processor.<locals>.<listcomp>c                    s"   g | ]}j j|j|j d qS )rm   )r   rn   rS   rR   )r   r`   )rb   rK   r>   r?   r   g  s    r5   r-   r   r3   r>   )r   r   get_tokenizerencode_apply_hf_processor_tokens_onlyr	   dictsuper_call_hf_processorparse_mm_data	get_itemsr   r{   lenrJ   r<   tensorsqueeze_)rK   r   r   r   r   r   
prompt_idsprocessed_outputsmm_itemsimage_sizesr5   	__class__)rb   r   rK   r?   r   K  s0   


z.Idefics3MultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s>   | dtd}ttd|td|tdtddS )Nr5   r   rL   )r-   r3   rA   r5   )r   r<   emptyr   r   flat_from_sizesbatched)rK   r   r   r5   r>   r>   r?   _get_mm_fields_configw  s   
z1Idefics3MultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc                    sP   j jdi | j  \}}dtdtf fdd}td|dgS )Nitem_idxrG   c                    s:    dt}|| }jj|j|j d}tj|dS )NrL   rm   )
embed_text)	r   r   r   r   r   rS   rR   r    select_text)r   r   
image_size
image_replrb   ro   r   rK   r>   r?   get_replacement_idefics3  s   
zQIdefics3MultiModalProcessor._get_prompt_updates.<locals>.get_replacement_idefics3rL   )modalitytargetreplacementr>   )r   rJ   rs   rX   r    r   )rK   r   r   r   r   r   r>   r   r?   _get_prompt_updates  s   z/Idefics3MultiModalProcessor._get_prompt_updates)r7   r8   r9   rz   r   r   r	   r   r   r   r   r   r   r   r   __classcell__r>   r>   r   r?   r   J  s8    


,



r   c                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )Idefics3SimpleMLPNrw   configquant_configprefixc                    sB   t    |jj|jd  }|jj}t||d|t|dd| _d S )NrU   Fproj)biasr   r   )	r   __init__vision_confighidden_sizescale_factortext_configr   r+   r   )rK   r   r   r   
input_sizeoutput_sizer   r>   r?   r     s   
zIdefics3SimpleMLP.__init__xrG   c                 C   s   |  |\}}|S rH   )r   )rK   r   outr   r>   r>   r?   forward     zIdefics3SimpleMLP.forwardNrw   )r7   r8   r9   r
   r   rz   r   r<   r=   r   r   r>   r>   r   r?   r     s    r   c                       sf   e Zd Z		ddededB def fddZdd	ejd
e	dejfddZ
dejdejfddZ  ZS )Idefics3ConnectorNrw   r   r   r   c                    s,   t    |j| _t||t|dd| _d S )Nmodality_projectionr   )r   r   r   r   r+   r   )rK   r   r   r   r   r>   r?   r     s   
zIdefics3Connector.__init__rU   r   r   rG   c                 C   s   |  \}}}t|d  }}|||||}|||t|| || }|dddd}||t|| t|| ||d  }|dddd}||t||d  ||d  }|S )Ng      ?r   rU   r$   r0   )r`   rX   viewpermutereshape)rK   r   r   bszseq	embed_dimrR   rS   r>   r>   r?   pixel_shuffle  s   


"zIdefics3Connector.pixel_shuffleimage_hidden_statesc                 C   s   |  || j}| |}|S rH   )r   r   r   )rK   r   r>   r>   r?   r     s   
zIdefics3Connector.forwardr   )rU   )r7   r8   r9   r
   r   rz   r   r<   r=   rX   r   r   r   r>   r>   r   r?   r     s    r   c                       s   e Zd Zdddedef fddZdejdejd	ejfd
dZdejd	ejfddZ			ddejdB dejde
dB dejdB d	eje
B f
ddZ  ZS )Idefics3Modelrw   r   vllm_configr   c                   s   t    |jj}|j}|| _| jjj| _t|j	|t
|dd| _t||t
|dd| _t||jt
|dd| _t|j	j|j	j d |jd  | _| jj| _d S )Nvision_modelr   r   	connectorr   
text_modelr   r   rU   )r   r   model_config	hf_configr   r   r   
vocab_sizeIdefics3VisionTransformerr   r+   r   r   r   r)   with_hf_configr   rX   r   
patch_sizer   rx   image_token_id)rK   r   r   r   r   r   r>   r?   r     s2   

zIdefics3Model.__init__r-   r3   rG   c           	      C   s   |j | jjjjjd}|jdd   }|dkjdd|k}|| 	 }|| 	 }| j
jj}|jd||d}|jd||d}|jddd	k }| j||d
}|S )N)dtyper$   g        ))dim)	dimensionr`   steprU   )r   r   r   )r-   patch_attention_mask)tor   
embeddingspatch_embeddingweightr   shapenumelsum
contiguousr   r   r   unfoldbool)	rK   r-   r3   nb_values_per_imagereal_images_indsr   patches_subgridr   r   r>   r>   r?   image_pixels_to_features  s0   
z&Idefics3Model.image_pixels_to_featuresr   c                 C   s   | j |S rH   )r   embed_input_ids)rK   r   r>   r>   r?   r  *  s   zIdefics3Model.embed_input_idsN	positionsintermediate_tensorsinputs_embedsc                 C   s   | j ||||d}|S N)r  )r   )rK   r   r  r  r  hidden_statesr>   r>   r?   r   -  s   zIdefics3Model.forwardNN)r7   r8   r9   r   rz   r   r<   r=   r  r  r!   r   r   r>   r>   r   r?   r     s.    
'r   )r   dummy_inputsc                       sb  e Zd Zg dddgdZededededB fd	d
Zdddedef fddZ	de
dedB fddZdedejfddZdedejeej B fddZde
defddZ		d0dejdB dejdedB dejdB de
dejeB fd d!Zd"ejdejfd#d$Zd%eeeejf  dee fd&d'Zdefd(d)Zd*edefd+d,Zd-edefd.d/Z  Z S )1 Idefics3ForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   r   rG   Nc                 C   s   | drdS td)NrL   z<image>z Only image modality is supported)
startswithra   )clsr   r   r>   r>   r?   get_placeholder_strO  s   
z4Idefics3ForConditionalGeneration.get_placeholder_strrw   r   r   r   c                   s   t    |jj}|j}|jj}|| _|| _| j|tdt	t
fid t|t|dd| _W d    n1 s7w   Y  | jj| _t|jj|jj|t|dd| _| jjjr^| jjjj| j_t|jj| _d S )NrL   )language_targetstower_targetsmodelr   lm_headr   )r   r   r   r   r   multimodal_configr   _mark_composite_modelr)   r   r   r   r+   r$  r   r   r   r   r   r%  tie_word_embeddingsr   embed_tokensr  r   logits_processor)rK   r   r   r   r   r&  r   r>   r?   r   V  s4   





z)Idefics3ForConditionalGeneration.__init__rF   c                 K   s   | dd }| dd }|d u r|d u rd S |d ur td|dS |d urA| d}| d}| jjj }}td|||||ddS td)	Nr-   rA   )r.   rC   r3   r5   )r1   r2   )r.   r-   r3   r5   resolve_bindingsz This line should be unreachable.)popr@   r   r   r   r,   AssertionError)rK   rF   r-   rA   r3   r5   
expected_h
expected_wr>   r>   r?   _parse_and_validate_image_inputv  s*   

z@Idefics3ForConditionalGeneration._parse_and_validate_image_inputinputsc                 C   s    |d }|d }| j j||dS )Nr-   r3   )r3   )r$  r  )rK   r1  r-   r3   r>   r>   r?   _process_image_pixels  s   z6Idefics3ForConditionalGeneration._process_image_pixelsimage_inputc                 C   sJ   |d dkr
|d S |  |}| j|}|d }dd || D S )Nr.   rA   rC   r5   c                 S   s   g | ]}| d dqS )r   r$   )flatten)r   er>   r>   r?   r     s    zIIdefics3ForConditionalGeneration._process_image_input.<locals>.<listcomp>)r2  r$  r   splittolist)rK   r3  image_featuresr5   r>   r>   r?   _process_image_input  s   
z5Idefics3ForConditionalGeneration._process_image_inputc                 K   s&   | j di |}|d u rg S | |S )Nr>   )r0  r9  )rK   rF   r3  r>   r>   r?   embed_multimodal  s   
z1Idefics3ForConditionalGeneration.embed_multimodalr   r  r  r  c                 K   s$   |d urd }| j j||||d}|S r  )r$  r   )rK   r   r  r  r  rF   r  r>   r>   r?   r     s   z(Idefics3ForConditionalGeneration.forwardr  c                 C   s   |  | j|}|S rH   )r*  r%  )rK   r  logitsr>   r>   r?   compute_logits  r   z/Idefics3ForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S rH   )r*   load_weights)rK   r=  loaderr>   r>   r?   r>    s   
z-Idefics3ForConditionalGeneration.load_weightsc                 C   s   t jddddS )z<
        Get the module prefix in multimodal models
        zmodel.text_modelzmodel.connectorzmodel.vision_model)language_modelr   tower_model)r   from_string_fieldrM   r>   r>   r?   get_mm_mapping  s
   z/Idefics3ForConditionalGeneration.get_mm_mappingnum_image_tokensc                 C   s   | j }|j}||d  S rT   r   r   )rK   rD  r   r   r>   r>   r?   get_num_mm_encoder_tokens     z:Idefics3ForConditionalGeneration.get_num_mm_encoder_tokensnum_vision_tokensc                 C   s   | j }|j}||d  S rT   rE  )rK   rH  r   r   r>   r>   r?   get_num_mm_connector_tokens  rG  z<Idefics3ForConditionalGeneration.get_num_mm_connector_tokensr  )!r7   r8   r9   packed_modules_mappingclassmethodrz   rX   r!  r   r   r   rD   r0  r,   r<   r=   r2  ry   r9  r&   r:  r!   r   r<  r   r   setr>  r   rC  rF  rI  r   r>   r>   r   r?   r  =  sZ     	

$

	r  )Nr:   rf   collections.abcr   r   r   typingr   r   r   r<   r   transformersr	   r
   r   r   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   idefics2_vision_modelr%   r   
interfacesr&   r'   r(   llamar)   utilsr*   r+   r,   r@   rD   r;   rE   r   r   Moduler   r   r   register_processorr  r>   r>   r>   r?   <module>   sP     G _%X