o
    
۾i?                     @   s  d dl mZmZmZ d dlmZmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z!m"Z"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z; G dd de.Z<G dd dej=Z>G dd de'Z?G dd de%e? Z@G dd  d e&e? ZAd!ed"eBfd#d$ZCejDeAe?e@d%G d&d' d'ej=e2e3ZEdS )(    )IterableMappingSequence)	AnnotatedLiteralN)nn)BatchFeatureGotOcr2ImageProcessor)ACT2FN)get_size_dict)AyaVisionConfig)AyaVisionProcessor)get_optimal_tiled_canvas)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderWeightsMapperget_layer_indexinit_vllm_registered_modelmaybe_prefixc                   @   sP   e Zd ZU dZed ed< eeje	ddddf ed< eeje	df ed	< d
S )AyaVisionImagePixelInputsa  
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - c: Number of channels
        - h: Height of each image patch
        - w: Width of each image patch
        - bn: Batch size * number of images
    pixel_valuestypenp   hwbnnum_patchesN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr     r;   r;   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/aya_vision.pyr+   1   s   
 
r+   c                       sN   e Zd Zdef fddZdejdejfddZdejdejfdd	Z  Z	S )
AyaVisionMultiModalProjectorconfigc                    s   t    || _|j| _t|d|jj| _tj	|j
j|jd  |jd| _tj|j
j|jd  | jdd| _td | _tj| jd |jjdd| _d S )Nalignment_intermediate_size   )epsT)biassilu)super__init__r>   downsample_factorgetattrtext_confighidden_sizer?   r   	LayerNormvision_configadapter_layer_norm_eps	layernormLinearlinear_1r
   actlinear_2)selfr>   	__class__r;   r<   rE   J   s*   


z%AyaVisionMultiModalProjector.__init__image_featuresreturnc                 C   sL   |  |}| |}| |}|jddd\}}| || }| |}|S )Nr@   )dim)pixel_shufflerM   rO   chunkrP   rQ   )rR   rU   hidden_statesxgater;   r;   r<   forwardd   s   



z$AyaVisionMultiModalProjector.forwardc                 C   s   |j \}}}t|d  }}||j d ||d}|j d }|||t|| j t|| j }|dddd}||t|| j t|| j d}|dddd}|S )Ng      ?r   rW   r@   r!   r/   )shapeintreshaperF   permute)rR   rU   
batch_size
seq_length_heightwidthchannelsr;   r;   r<   rY   p   s*   
z*AyaVisionMultiModalProjector.pixel_shuffle)
r4   r5   r6   r   rE   r9   r:   r^   rY   __classcell__r;   r;   rS   r<   r=   I   s    r=   c                   @   s   e Zd ZdefddZdedefddZdedefddZ	de
eed	B f fd
dZdefddZdedededededefddZd	S )AyaVisionProcessingInforV   c                 C   s   | j tS N)ctxget_hf_configr   rR   r;   r;   r<   rm      s   z%AyaVisionProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rk   )rl   get_hf_processorr   rR   ro   r;   r;   r<   rp      s   z(AyaVisionProcessingInfo.get_hf_processorc                 K   s   | j di |jS Nr;   )rp   image_processorrq   r;   r;   r<   get_image_processor   s   z+AyaVisionProcessingInfo.get_image_processorNc                 C   s   dd iS )Nimager;   rn   r;   r;   r<   get_supported_mm_limits   s   z/AyaVisionProcessingInfo.get_supported_mm_limitsc                 C   s6   |   }|jd }|jd }|j}t|| || dS )Nrf   rg   )rf   rg   )rt   sizemax_patchesr   )rR   rs   rf   rg   rx   r;   r;   r<   !get_image_size_with_most_features   s
   

z9AyaVisionProcessingInfo.get_image_size_with_most_featuresimage_widthimage_heightrw   min_patchesrx   c          	      C   sJ   t |dd}t||f|d |d f||\}}|| }|dkr!|S |d S )z
        Calculate the number of patches needed for a given image based on size
        constraints.  This method replicates and adjusts the logic from:
        transformers/models/got_ocr2/image_processing_got_ocr2
        F)default_to_squarerf   rg   r!   )r   r   )	rR   rz   r{   rw   r|   rx   num_columnsnum_rows
num_blocksr;   r;   r<   get_num_patches   s   z'AyaVisionProcessingInfo.get_num_patches)r4   r5   r6   r   rm   objectr   rp   r	   rt   r   strr`   rv   r   ry   dictr   r;   r;   r;   r<   rj      s&    rj   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )AyaVisionDummyInputsBuilder	mm_countsrV   c                 C   s$   | dd}| j }|j}|| S )Nru   r   )getinforp   image_token)rR   r   
num_images	processorr   r;   r;   r<   get_dummy_text   s   
z*AyaVisionDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j }|r| dnd }d| j|j|j||diS )Nru   r   )rg   rf   r   	overrides)r   r   ry   _get_dummy_imagesrg   rf   )rR   r   r   r   r   
image_sizeimage_overridesr;   r;   r<   get_dummy_mm_data   s   
z-AyaVisionDummyInputsBuilder.get_dummy_mm_datark   )
r4   r5   r6   r   r   r`   r   r   r   r   r;   r;   r;   r<   r      s    
r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS )AyaVisionMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrV   c                    s   t  ||||}jjd	i |}|j |d }d urLjjd|idd}|dtfddt	t
D }	 fdd|	D }
t|
|d< |S )
Nimagesru   F)validatec                    s   g | ]}  |qS r;   )get_image_size).0i)parsed_imagesr;   r<   
<listcomp>   s    
zCAyaVisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>c              	      s,   g | ]}j j|j|j j j jd qS )rz   r{   rw   r|   rx   )r   r   rg   rf   rw   r|   rx   )r   r   )rs   rR   r;   r<   r      s    r3   r;   )rD   _call_hf_processorr   rp   rs   r   parse_mm_data	get_itemsr   rangelenr9   tensor)rR   r   r   r   r   processed_outputshf_processorr   mm_itemsimage_sizesr3   rS   )rs   r   rR   r<   r      s&   


z/AyaVisionMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s4   | dtd}ttd|tdtddS )Nr3   r   ru   )r,   r3   image_embeds)r   r9   emptyr   r   flat_from_sizesbatched)rR   r   r   r3   r;   r;   r<   _get_mm_fields_config   s   
z2AyaVisionMultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc                    sN   j jdi |  j} j jdtf fdd}td||dgS )Nitem_idxc                    sN    dt}|| }jj|j|jjjj	d} j
|d}t|S )Nru   r   )r3   )r   r   r   r   r   rg   rf   rw   r|   rx   _prompt_split_imager   select_text)r   r   r   r3   replr   rs   img_patch_tokenr   rR   r;   r<   get_replacement  s   
zIAyaVisionMultiModalProcessor._get_prompt_updates.<locals>.get_replacementru   )modalitytargetreplacementr;   )r   rp   r   r   rs   r`   r   )rR   r   r   r   r   r   r;   r   r<   _get_prompt_updates  s   z0AyaVisionMultiModalProcessor._get_prompt_updates)r4   r5   r6   r   r   r   r   r   r   r   r   r   r   r   r   ri   r;   r;   rS   r<   r      s8    


&



r   	hf_configrV   c                    sZ   | j }| jj t|trt| S t|ttfr#t fdd|D S t	dt
| d)Nc                 3   s    | ]}t | V  qd S rk   )r(   )r   idxnum_hidden_layersr;   r<   	<genexpr>0  s    z)_get_num_hidden_layers.<locals>.<genexpr>zvision_layer_feature type: z is not supported)vision_feature_layerrK   r   
isinstancer`   r(   listtuplemax	TypeErrorr-   )r   feature_layersr;   r   r<   _get_num_hidden_layers(  s   

r   )r   dummy_inputsc                       sP  e Zd ZeddddddZededed	ed
B fddZddde	def fddZ
edd Zdeeeejf  d	ee fddZdedejd	ejeejdf B fddZded	eej fddZd ed	ed
B fd!d"Zd ed	efd#d$Z	
	
d.d%ejd
B d&ejd'ed
B d(ejd
B d ed	ejeB fd)d*Zd+ejd	ejd
B fd,d-Z  ZS )/!AyaVisionForConditionalGenerationzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   r   rV   Nc                 C   s   | drdS td)Nru   z<image>z Only image modality is supported)
startswith
ValueError)clsr   r   r;   r;   r<   get_placeholder_strF  s   
z5AyaVisionForConditionalGeneration.get_placeholder_str )prefixvllm_configr   c                   s   t    |jj}|j}|jj}t|}|| _|| _|| _| |d t	|j
||t|dd| _t|| _W d    n1 s@w   Y  | | t||jt|ddgd| _W d    d S 1 sdw   Y  d S )Nru   vision_model)num_hidden_layers_overrider   modelCohere2ForCausalLM)r   r   r   architectures)rD   rE   model_configr   quant_configmultimodal_configr   r>   _mark_tower_modelr%   rK   r*   vision_towerr=   multi_modal_projector_mark_language_modelr)   rH   language_model)rR   r   r   r>   r   r   r   rS   r;   r<   rE   M  s2   
	
"z*AyaVisionForConditionalGeneration.__init__c                 C   s   t |  jS rk   )next
parametersdtypern   r;   r;   r<   r   i  s   z'AyaVisionForConditionalGeneration.dtypeweightsc                 C   s   t | }|j|| jdS )N)mapper)r&   load_weightshf_to_vllm_mapper)rR   r   loaderr;   r;   r<   r   m  s   z.AyaVisionForConditionalGeneration.load_weightsr   r,   .c                 C   s   ||j |jd| jjdS )N)r   )feature_select_strategy)tor   r>   vision_feature_select_strategy)rR   r   r,   r;   r;   r<   _image_pixels_to_featuresq  s   z;AyaVisionForConditionalGeneration._image_pixels_to_featuresimage_inputc                 K   sB   |d }|d }| j | j|d}| |}dd || D S )Nr,   r3   )r,   c                 S   s   g | ]}| d dqS )r   r@   )flatten)r   er;   r;   r<   r     s    zJAyaVisionForConditionalGeneration._process_image_input.<locals>.<listcomp>)r   r   r   splittolist)rR   r   ro   r,   r3   rU   r   r;   r;   r<   _process_image_input{  s   
z6AyaVisionForConditionalGeneration._process_image_inputro   c                 K   sb   | dd }| dd }| dd }|d u sJ d|d u r d S td||| jjj| jjjddS )Nr,   r3   r   z)Aya Vision does not support image_embeds.)r0   r1   )r-   r,   r3   resolve_bindings)popr+   r>   rK   r   )rR   ro   r,   r3   r   r;   r;   r<   _parse_and_validate_image_input  s   zAAyaVisionForConditionalGeneration._parse_and_validate_image_inputc                 K   s.   | j di |}|d u rg S | j|fi |S rr   )r   r   )rR   ro   r   r;   r;   r<   embed_multimodal  s   z2AyaVisionForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r   r   r   r   )r   r   )rR   r   r   r   r   ro   r[   r;   r;   r<   r^     s   z)AyaVisionForConditionalGeneration.forwardr[   c                 C   s   | j |S rk   )r   compute_logits)rR   r[   r;   r;   r<   r     s   z0AyaVisionForConditionalGeneration.compute_logits)NN) r4   r5   r6   r'   r   classmethodr   r`   r   r   rE   propertyr   r   r   r9   r:   setr   r%   r   r+   r   r   r   r   r"   r   r   r^   r   ri   r;   r;   rS   r<   r   6  sh    

$




r   )Fcollections.abcr   r   r   typingr   r   r9   r   transformersr   r	   transformers.activationsr
   #transformers.image_processing_utilsr   transformers.models.aya_visionr   4transformers.models.aya_vision.processing_aya_visionr   6transformers.models.got_ocr2.image_processing_got_ocr2r   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar   r    
interfacesr"   r#   r$   siglipr%   utilsr&   r'   r(   r)   r*   r+   Moduler=   rj   r   r   r`   r   register_processorr   r;   r;   r;   r<   <module>   sB    	?-U