o
    ߥi                     @   s  d Z ddlmZ ddlmZmZ ddlZddlZ	ddl
Z
ddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% e% Z&dgZ'ej(e#j)ej*dG dd deZ+dS )z% Generative Multimodal Model Wrapper.    N)AnyDict)Image)
transforms)Models)
TorchModel)MODELS)	GEMMModel)
OutputKeys)	LoadImage)	ModelFileTasks)
get_loggerGEMMForMultiModalEmbedding)module_namec                       sT   e Zd ZdZd fdd	Zdd Zdd Zd	eee	f d
eee	f fddZ
  ZS )r   z Generative multi-modal model for multi-modal embedding
    Inputs could be image or text or both of them.
    Outputs could be features of input image or text,
    image caption could also be produced when image is available.
    r   c              	      s   t  j|||d| t|d| _td|tj}| j	| | j
  || _| jdkrHtj rH| jd| j td| j nd| _td ttd	td	t td
dg| _d S )N)	model_dir	device_id)r   z{}/{}r   cuda:{}zUse GPU: {}zUse CPU for inference   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)super__init__r	   
gemm_modeltorchloadformatr   TORCH_MODEL_BIN_FILEload_state_dictevalr   cudais_availabletologgerinfoTComposeResize
CenterCropToTensor	Normalizeimg_preprocessor)selfr   r   argskwargspretrained_params	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/gemm/gemm_model.pyr   %   s2   

z#GEMMForMultiModalEmbedding.__init__c                 C   sD   |d u rd S t |}| |d }| jdkr |d| j}|S )N)N.r   r   )r   convert_to_imgr*   r   r!   r   )r+   	input_img
img_tensorr1   r1   r2   parse_image<   s   

z&GEMMForMultiModalEmbedding.parse_imagec                 C   sj   |d u s
t |dkrd S t|tr| j|}n	tdt| | jdkr/|d	| j}|
ddS )Nr   ztext should be str, but got r      r   )len
isinstancestrr   tokenize	TypeErrortyper   r!   r   view)r+   text_strtext_ids_tensorr1   r1   r2   
parse_textE   s   

z%GEMMForMultiModalEmbedding.parse_textinputreturnc           
   	   C   s   | d| dd }| d| dd }| dd }| |}| |}|du p+|dk}| |||}tj| dd tj| d	d tj| d
d i}	|	S )Nimageimgtexttxt
captioningT image_featuretext_featurecaption)getr6   rA   r   r
   IMG_EMBEDDINGTEXT_EMBEDDINGCAPTION)
r+   rB   image_input
text_inputcaptioning_inputrD   rF   rH   outoutputr1   r1   r2   forwardQ   s   

z"GEMMForMultiModalEmbedding.forward)r   )__name__
__module____qualname____doc__r   r6   rA   r   r:   r   rV   __classcell__r1   r1   r/   r2   r      s    	*),rZ   os.pathpathosptypingr   r   jsonnumpynpr   torch.nnnntorch.nn.functional
functionalFPILr   torchvisionr   r$   modelscope.metainfor   modelscope.models.baser   modelscope.models.builderr   ,modelscope.models.multi_modal.gemm.gemm_baser	   modelscope.outputsr
   modelscope.preprocessorsr   modelscope.utils.constantr   r   modelscope.utils.loggerr   r"   __all__register_module generative_multi_modal_embeddinggemmr   r1   r1   r1   r2   <module>   s0   