o
    -i2                     @   s  d Z ddlZddlmZmZmZ ddlmZmZ ddl	Z	ddl
mZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7 dZ8dZ9G dd de-Z:G dd de&Z;G dd de$e; Z<G dd de%e; Z=ej>e=e;e<dG dd dej?e1e2Z@dS ) zPyTorch Fuyu model.    N)IterableMappingSequence)	AnnotatedLiteral)BatchFeature
FuyuConfigFuyuImageProcessorFuyuProcessor)
VllmConfig)BaseDummyOptions)ColumnParallelLinear)PersimmonForCausalLM)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapper
flatten_bnmaybe_prefixic ik c                   @   sR   e Zd ZU dZdZed ed< eej	e
ddf ed< eee e
df ed< d	S )
FuyuImagePatchInputsz
    Dimensions:
        - bn: Batch size * number of images
        - bnp: Batch size * number of images * number of patches
        - fn: patch_size_x * patch_size_y * num_channels
    image_patchestypebnpfnimage_patches_flatbnpatches_per_imageN)__name__
__module____qualname____doc__r)   r   __annotations__r   torchTensorr   listint r8   r8   \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/fuyu.pyr'   ;   s   
 r'   c                   @   s   e Zd Zdd ZdefddZdedefddZdee	e
d	B f fd
dZde
de
dee
e
f fddZde
de
de
fddZdefddZd	S )FuyuProcessingInfoc                 C   s   | j tS N)ctxget_hf_configr   selfr8   r8   r9   r=   Q   s   z FuyuProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S r;   )r<   get_hf_processorr
   r?   r@   r8   r8   r9   rA   T   s   z#FuyuProcessingInfo.get_hf_processorreturnc                 K   s   | j di |jS Nr8   )rA   image_processorrB   r8   r8   r9   get_image_processorW   s   z&FuyuProcessingInfo.get_image_processorNc                 C   s   ddiS )Nimager   r8   r>   r8   r8   r9   get_supported_mm_limitsZ   s   z*FuyuProcessingInfo.get_supported_mm_limitsimage_widthimage_heightc                C   s   |   }|jd }|jd }|jd }|jd }||kr ||ks9|| }|| }	t||	}
t||
 }t||
 }t|| }t|| }||fS )Nwidthheight)rF   size
patch_sizeminr7   mathceil)r?   rI   rJ   rE   target_widthtarget_heightpatch_widthpatch_heightheight_scale_factorwidth_scale_factoroptimal_scale_factorncolsnrowsr8   r8   r9   get_image_feature_grid_size]   s   




z.FuyuProcessingInfo.get_image_feature_grid_sizec                C   s   | j ||d\}}|| S )NrI   rJ   )r[   )r?   rI   rJ   rY   rZ   r8   r8   r9   get_num_image_tokensu   s
   
z'FuyuProcessingInfo.get_num_image_tokensc                 C   s    |   }t|jd |jd dS )NrK   rL   )rK   rL   )rF   r   rM   )r?   rE   r8   r8   r9   !get_image_size_with_most_features   s   z4FuyuProcessingInfo.get_image_size_with_most_features)r/   r0   r1   r=   objectrA   r	   rF   r   strr7   rH   tupler[   r]   r   r^   r8   r8   r8   r9   r:   P   s(    


r:   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )FuyuDummyInputsBuilder	mm_countsrC   c                 C   s   dS )N r8   )r?   rc   r8   r8   r9   get_dummy_text   s   z%FuyuDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | j  \}}|dd}|r|dnd }d| j||||diS )NrG   r   )rK   rL   
num_images	overrides)infor^   get_get_dummy_images)r?   rf   rc   rg   rR   rS   rh   image_overridesr8   r8   r9   get_dummy_mm_data   s   z(FuyuDummyInputsBuilder.get_dummy_mm_datar;   )
r/   r0   r1   r   r`   r7   re   r   r   rn   r8   r8   r8   r9   rb      s    
rb   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZdee	 dee	 fd	d
Z
dedeeef deeef fddZdedeeef dedee fddZ  ZS )FuyuMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrC   c                    sv   |s| j  |}| |}tt|gdddS t j||||d}|d }t||d< t	
dd |D |d< |S )	N)	input_idspt)tensor_type)rp   rq   rr   rs   r(   c                 S   s   g | ]}t |qS r8   )len).0pr8   r8   r9   
<listcomp>   s    z>FuyuMultiModalProcessor._call_hf_processor.<locals>.<listcomp>r.   )rj   get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictsuper_call_hf_processorr%   r4   tensor)r?   rp   rq   rr   rs   
prompt_idsprocessed_outputsr(   	__class__r8   r9   r      s    
z*FuyuMultiModalProcessor._call_hf_processorprompt_tokensc                 C   s4   | j  }| }|d }|d |kr|| |S )Nz<0x04>)rj   r{   	get_vocabappend)r?   r   	tokenizervocabboa_token_idr8   r8   r9   r}      s   

z7FuyuMultiModalProcessor._apply_hf_processor_tokens_only	hf_inputshf_processor_mm_kwargsc                 C   s,   | dtd}ttd|tddS )Nr.   r   rG   )r(   r.   )rk   r4   emptyr~   r   flat_from_sizesbatched)r?   r   r   r.   r8   r8   r9   _get_mm_fields_config   s   z-FuyuMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    sd   j  }|j t tsJ j  }|j}t|tsJ dtf fdd}td|g|dgS )Nitem_idxc                    sV    dt}|| }jj|j|jd\}}tg| tg | }t	j
| g tdS )NrG   r\   )embed_token_id)	get_itemsr   get_image_sizerj   r[   rK   rL   _IMAGE_TOKEN_ID_NEWLINE_TOKEN_IDr   select_token_id)r   images
image_sizerY   rZ   image_tokensbos_token_idr   r?   r8   r9   get_replacement_fuyu   s   

zIFuyuMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_fuyurG   )modalitytargetreplacement)rj   r=   r   
isinstancer7   r{   r   )r?   r   r   r   	hf_configr   eot_token_idr   r8   r   r9   _get_prompt_updates   s   

z+FuyuMultiModalProcessor._get_prompt_updates)r/   r0   r1   r`   r   r_   r   r   r6   r7   r}   r   r   r   r   r   r   r   __classcell__r8   r8   r   r9   ro      sB    







ro   )rj   dummy_inputsc                       s  e Zd ZedddddZedededed	B fd
dZddde	def fddZ
deded	B fddZdedefddZdedefddZ				d&dejdejded	B dejd	B def
ddZd ejdejd	B fd!d"Zd#eeeejf  dee fd$d%Z  ZS )'FuyuForCausalLMzvision_embed_tokens.zlanguage_model.model.zlanguage_model.lm_head.)zmodel.vision_embed_tokens.zmodel.language_model.zlm_head.)orig_to_new_prefixr   irC   Nc                 C   s   | drd S td)NrG   z Only image modality is supported)
startswith
ValueError)clsr   r   r8   r8   r9   get_placeholder_str  s   
z#FuyuForCausalLM.get_placeholder_strrd   )prefixvllm_configr   c                   s   t    |jj}|j}|jj}|| _|| _|jj| _t	| _
|jd |j | _| |d t| j|j|dd| _W d    n1 sCw   Y  | | t||jt|dd| _W d    n1 sfw   Y  | jj| _d S )N   rG   T)quant_configgather_outputlanguage_model)r   r   )r   __init__model_configr   r   multimodal_configconfigtext_config
vocab_sizer   image_token_idrN   num_channelsimage_feature_size_mark_tower_modelr   hidden_sizevision_embed_tokens_mark_language_modelr   with_hf_configr&   r   make_empty_intermediate_tensors)r?   r   r   r   r   r   r   r8   r9   r     s2   




zFuyuForCausalLM.__init__r@   c                 K   s:   | dd }| dd }|d u rd S td||d| jidS )Nr(   r.   r+   )r)   r,   r.   resolve_bindings)popr'   r   )r?   r@   r(   r.   r8   r8   r9   _parse_and_validate_image_input4  s   z/FuyuForCausalLM._parse_and_validate_image_inputimage_inputc                 C   s0   |d }|d }|  |\}}|j| ddS )Nr,   r.   r   )dim)r   splittolist)r?   r   r,   r.   vision_embeddings_flat_r8   r8   r9   _process_image_inputD  s   z$FuyuForCausalLM._process_image_inputc                 K   s&   | j di |}|d u rg S | |S rD   )r   r   )r?   r@   r   r8   r8   r9   embed_multimodalN  s   
z FuyuForCausalLM.embed_multimodalrt   	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)rt   r   r   r   )r   )r?   rt   r   r   r   r@   hidden_statesr8   r8   r9   forwardU  s   zFuyuForCausalLM.forwardr   c                 C   s   | j |S r;   )r   compute_logits)r?   r   r8   r8   r9   r   h  s   zFuyuForCausalLM.compute_logitsweightsc                 C   s   t | }||S r;   )r#   load_weights)r?   r   loaderr8   r8   r9   r   n  s   
zFuyuForCausalLM.load_weights)NN)r/   r0   r1   r$   hf_to_vllm_mapperclassmethodr`   r7   r   r   r   r_   r'   r   r    r   r   r4   r5   r   r   r   r   ra   setr   r   r8   r8   r   r9   r     sP    




,r   )Ar2   rP   collections.abcr   r   r   typingr   r   r4   torch.nnnntransformersr   r   r	   r
   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   $vllm.model_executor.models.persimmonr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar   r   
interfacesr    r!   r"   utilsr#   r$   r%   r&   r   r   r'   r:   rb   ro   register_processorModuler   r8   r8   r8   r9   <module>   s>    9_