o
    
۾i,B                     @   s  d Z ddlmZmZmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@ G dd de4ZAG dd de	jBZCG dd de-ZDG d d! d!e+eD ZEG d"d# d#e,eD ZFe!jGeFeDeEd$G d%d& d&e	jBe8e9ZHdS )'zJCommand-A-Vision (Cohere2Vision) multimodal model implementation for vLLM.    )IterableMappingSequence)	AnnotatedLiteralN)nn)BatchFeaturePretrainedConfig)Cohere2VisionConfig)get_optimal_tiled_canvas)Cohere2VisionProcessor)
VllmConfig)BaseDummyOptions)
MulAndSilu)MergedColumnParallelLinearRowParallelLinear)QuantizationConfig)	AWQConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefixc                   @   sP   e Zd ZU dZed ed< eeje	ddddf ed< eeje	df ed	< d
S )Cohere2VisionImagePixelInputsa  
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - c: Number of channels
        - h: Height of each image patch
        - w: Width of each image patch
        - bn: Batch size * number of images
    pixel_valuestypenp   hwbnnum_patchesN)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr#    r=   r=   ]/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/cohere2_vision.pyr-   9   s   
 
r-   c                       sJ   e Zd ZdZddedef fddZdd Zd	ej	d
ej	fddZ
  ZS ) Cohere2VisionMultiModalProjectorzMultimodal projector that maps vision features to text embedding space.

    Uses pixel shuffle downsampling followed by SwiGLU activation.
     configprefixc                    s~   t    |j| _|jj|jd  }|jd | _t|| jgd dd| dd| _t	 | _
t| j|jjdd| dd| _d S )N   TFz	.linear_1)biasreturn_biasrB   z	.linear_2)super__init__downsample_factorvision_confighidden_sizealignment_intermediate_sizeintermediate_sizer   linear_1r   actr   text_configlinear_2)selfrA   rB   	input_dim	__class__r=   r>   rG   W   s&   

z)Cohere2VisionMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)pixel_shufflerM   rN   rP   )rQ   image_featureshidden_statesr=   r=   r>   forwardt   s
   



z(Cohere2VisionMultiModalProjector.forwardrW   returnc                 C   s   t |jd d  }}||jd ||d}| \}}}}d| j }	t ||	 }
t ||	 }|||
| j|| j|}|dddddd	 }|||
|d}|S )
zApply pixel shuffle downsampling to reduce spatial dimensions.

        Args:
            image_features: Input tensor of shape [B, S, D] where S = H*W

        Returns:
            Downsampled tensor with increased channel dimension
        r$   g      ?r   g      ?r1   rC         )intshapereshapesizerH   permute
contiguous)rQ   rW   heightwidthxnr2   r3   cscale_factornhnwr=   r=   r>   rV   {   s   	
z.Cohere2VisionMultiModalProjector.pixel_shuffle)r@   )r6   r7   r8   r9   r
   strrG   rY   r;   r<   rV   __classcell__r=   r=   rS   r>   r?   Q   s
    r?   c                   @   s   e Zd ZdefddZdedefddZdefddZde	e
ed	B f fd
dZdefddZdededed	B defddZd	S )Cohere2VisionProcessingInforZ   c                 C   s   | j tS rU   )ctxget_hf_configr
   rQ   r=   r=   r>   rp      s   z)Cohere2VisionProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S rU   )ro   get_hf_processorr   rQ   rr   r=   r=   r>   rs      s   z,Cohere2VisionProcessingInfo.get_hf_processorc                 K   s   | j di |jS Nr=   )rs   image_processorrt   r=   r=   r>   get_image_processor   s   z/Cohere2VisionProcessingInfo.get_image_processorNc                 C   s   dd iS )Nimager=   rq   r=   r=   r>   get_supported_mm_limits   s   z3Cohere2VisionProcessingInfo.get_supported_mm_limitsc                 C   s2   |   }|jd }|jd }|j}t|| |dS )Nrd   re   )rd   re   )rw   ra   max_patchesr   )rQ   rv   rd   re   rz   r=   r=   r>   !get_image_size_with_most_features   s
   

z=Cohere2VisionProcessingInfo.get_image_size_with_most_featuresimage_widthimage_height	processorc                C   st   |du r|   }|j}|j}|j}|j}|j}|sdS t||f|d |d f||\}	}
|	|
 }|dkr8|d7 }|S )z
        Calculate the number of image patches for a given image.
        Uses the HF processor to determine the actual number of patches.
        Nr$   rd   re   )rs   rv   min_patchesrz   ra   crop_to_patchesr   )rQ   r|   r}   r~   rv   r   rz   
patch_sizer   num_columnsnum_rowsr5   r=   r=   r>   get_num_patches   s&   	z+Cohere2VisionProcessingInfo.get_num_patches)r6   r7   r8   r
   rp   objectr   rs   rw   r   rl   r^   ry   r   r{   r   r=   r=   r=   r>   rn      s    rn   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Cohere2VisionDummyInputsBuilder	mm_countsrZ   c                 C   s$   | dd}| j }|j}|| S )Nrx   r   )getinfors   image_token)rQ   r   
num_imagesr~   r   r=   r=   r>   get_dummy_text   s   
z.Cohere2VisionDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sB   | dd}| j }|r| dnd }d| j|j|j||diS )Nrx   r   )re   rd   r   	overrides)r   r   r{   _get_dummy_imagesre   rd   )rQ   r   r   r   r   
image_sizeimage_overridesr=   r=   r>   get_dummy_mm_data   s   
z1Cohere2VisionDummyInputsBuilder.get_dummy_mm_datarU   )
r6   r7   r8   r   rl   r^   r   r   r   r   r=   r=   r=   r>   r      s    
r   c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deeef fd
dZ	de
d	eeef dedee fddZ  ZS ) Cohere2VisionMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrZ   c           	         s   t  ||||}d|vrE|d }d urEjjdi | jjd|idd}|dt fddtt	D }t
||d< |S )	Nr5   imagesrx   F)validatec                    s.   g | ]}j j|j|j d qS )r|   r}   r~   )r   r   get_image_sizere   rd   ).0ihf_processorparsed_imagesrQ   r=   r>   
<listcomp>  s    

zGCohere2VisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>r=   )rF   _call_hf_processorr   r   rs   parse_mm_data	get_itemsr   rangelenr;   tensor)	rQ   r   r   r   r   processed_outputsr   mm_itemsr5   rS   r   r>   r      s    	
z3Cohere2VisionMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s4   | dtd}ttd|tdtddS )Nr5   r   rx   )r.   r5   image_embeds)r   r;   emptydictr   flat_from_sizesbatched)rQ   r   r   r5   r=   r=   r>   _get_mm_fields_config  s   
z6Cohere2VisionMultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc              	      sh   j jdi |jtjd jj jdtf fdd}td|dgS )NrC   item_idxc                    sX    dt}|| }jj|j|jd}  }  ||   }t|S )Nrx   r   )	r   r   r   r   r   re   rd   r    select_text)r   r   r   r5   patch_tokensrepl	boi_token	eoi_tokenr   r   img_line_break_tokenimg_tokens_per_tiler   rQ   r=   r>   get_replacement1  s   
zMCohere2VisionMultiModalProcessor._get_prompt_updates.<locals>.get_replacementrx   )modalitytargetreplacementr=   )	r   rs   r   r^   r   r   r   r   r   )rQ   r   r   r   r   r=   r   r>   _get_prompt_updates$  s    z4Cohere2VisionMultiModalProcessor._get_prompt_updates)r6   r7   r8   rl   r   r   r   r   r   r   r   r   r   r   r   rm   r=   r=   rS   r>   r      s8    


%



r   )r   dummy_inputsc                       s  e Zd ZeddddddZddd	ed
ef fddZedd Z	de
eeejf  dee fddZdedeej fddZdededB fddZdedefddZdedefddZ		d)d ejdB d!ejd"edB d#ejdB dedejeB fd$d%Zd&ejdejdB fd'd(Z  ZS )*%Cohere2VisionForConditionalGenerationzvision_tower.zmulti_modal_projector.zlanguage_model.model.zlanguage_model.lm_head.)zmodel.vision_tower.zmodel.multi_modal_projector.zmodel.language_model.zlm_head.)orig_to_new_prefixr@   rB   vllm_configrB   c                   s   t    |jj}|j}|jj}|| _|| _|| _| || | |d t	|j
|t|dd| _t|t|dd| _W d    n1 sFw   Y  | | t||jt|d|jjd| _W d    d S 1 skw   Y  d S )Nrx   vision_towerr   multi_modal_projectorlanguage_model)r   	hf_configrB   architectures)rF   rG   model_configr   quant_configmultimodal_configrA   _patch_quant_config_mark_tower_modelr(   rI   r,   r   r?   r   _mark_language_modelr+   rO   r   r   )rQ   r   rB   rA   r   r   rS   r=   r>   rG   W  s4   




"z.Cohere2VisionForConditionalGeneration.__init__c                 C   s   t |  jS rU   )next
parametersdtyperq   r=   r=   r>   r   s  s   z+Cohere2VisionForConditionalGeneration.dtypeweightsrZ   c                 C   s   t | }|j|| jdS )N)mapper)r)   load_weightshf_to_vllm_mapper)rQ   r   loaderr=   r=   r>   r   w  s   z2Cohere2VisionForConditionalGeneration.load_weightsimage_inputc                 K   s<   |d }|d }|  |}| |}dd || D S )a  Process image pixels through vision tower and projector.

        Args:
            image_input: Validated image input containing pixel values and
                         patch counts

        Returns:
            List of flattened image embeddings, one per image
        r.   r5   c                 S   s   g | ]}| d dqS )r   rC   )flatten)r   er=   r=   r>   r     s    zNCohere2VisionForConditionalGeneration._process_image_input.<locals>.<listcomp>)r   r   splittolist)rQ   r   rr   r.   r5   rW   r   r=   r=   r>   _process_image_input{  s
   

z:Cohere2VisionForConditionalGeneration._process_image_inputrr   Nc                 K   sb   | dd }| dd }| dd }|d u sJ d|d u r d S td||| jjj| jjjddS )Nr.   r5   r   z,Cohere2Vision does not support image_embeds.)r2   r3   )r/   r.   r5   resolve_bindings)popr-   rA   rI   r   )rQ   rr   r.   r5   r   r=   r=   r>   _parse_and_validate_image_input  s   zECohere2VisionForConditionalGeneration._parse_and_validate_image_inputrA   r   c                 C   sF   t |tr|j}t|dd }|js|d ur!|jd d S d S d S d S )Nquantization_configr   )
isinstancer   rO   getattrmodules_to_not_convertappend)rQ   rA   r   rO   llm_quant_configr=   r=   r>   r     s   
z9Cohere2VisionForConditionalGeneration._patch_quant_configc                 K   s.   | j di |}|d u rg S | j|fi |S ru   )r   r   )rQ   rr   r   r=   r=   r>   embed_multimodal  s   z6Cohere2VisionForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r   r   r   r   )r   model)rQ   r   r   r   r   rr   rX   r=   r=   r>   rY     s   z-Cohere2VisionForConditionalGeneration.forwardrX   c                 C   s   | j |S rU   )r   compute_logits)rQ   rX   r=   r=   r>   r     s   z4Cohere2VisionForConditionalGeneration.compute_logits)NN)r6   r7   r8   r*   r   r   rl   rG   propertyr   r   tupler;   r<   setr   r-   listr   r   r   r	   r   r   r%   r   r!   rY   r   rm   r=   r=   rS   r>   r   H  s`    	
$



r   )Ir9   collections.abcr   r   r   typingr   r   r;   r   transformersr   r	   "transformers.models.cohere2_visionr
   Gtransformers.models.cohere2_vision.image_processing_cohere2_vision_fastr   <transformers.models.cohere2_vision.processing_cohere2_visionr   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.quantization.awqr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   vllm.utils.tensor_schemar"   r#   
interfacesr%   r&   r'   siglipr(   utilsr)   r*   r+   r,   r-   Moduler?   rn   r   r   register_processorr   r=   r=   r=   r>   <module>   sN    ?
@
 X