o
    ߥiR                     @   s   d Z ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ e ZdgZejeje	jdG dd deZdS )z% Generative Multimodal Model Wrapper.    )AnyDictN)
transforms)Models)
TorchModel)MODELS)	RLEGModel)
OutputKeys)	LoadImage)	ModelFileTasks)
get_loggerRLEGForMultiModalEmbedding)module_namec                       sT   e Zd ZdZd fdd	Zdd Zdd Zd	eee	f d
eee	f fddZ
  ZS )r   z Generative multi-modal model for multi-modal embedding.
    The model is trained by representation learning with embedding generation.
    Inputs could be image or text or both of them.
    Outputs could be features of input image or text,
    r   c                    s   t  j|||d| t|d| _td|tj}| j	| | j
  || _| jdkrHtj rH| jd| j td| j nd| _td ttd	t td
dg| _d S )N)	model_dir	device_id)r   z{}/{}r   cuda:{}zUse GPU: {}zUse CPU for inference)   r   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)super__init__r   modeltorchloadformatr   TORCH_MODEL_BIN_FILEload_state_dictevalr   cudais_availabletologgerinfoTComposeResizeToTensor	Normalizeimg_preprocessor)selfr   r   argskwargspretrained_params	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/rleg/rleg.pyr      s0   

z#RLEGForMultiModalEmbedding.__init__c                 C   sD   |d u rd S t |}| |d }| jdkr |d| j}|S )N)N.r   r   )r
   convert_to_imgr(   r   r    r   )r)   	input_img
img_tensorr/   r/   r0   parse_image5   s   

z&RLEGForMultiModalEmbedding.parse_imagec                 C   sj   |d u s
t |dkrd S t|tr| j|}n	tdt| | jdkr/|d	| j}|
ddS )Nr   ztext should be str, but got r      r   )len
isinstancestrr   tokenize	TypeErrortyper   r    r   view)r)   text_strtext_ids_tensorr/   r/   r0   
parse_text>   s   

z%RLEGForMultiModalEmbedding.parse_textinputreturnc              	   C   sz   | d| dd }| d| dd }| |}| |}| ||}tj| dd tj| dd tj| dd i}|S )Nimageimgtexttxtimage_featuretext_featurecaption)getr4   r?   r   r	   IMG_EMBEDDINGTEXT_EMBEDDINGCAPTION)r)   r@   image_input
text_inputrB   rD   outoutputr/   r/   r0   forwardJ   s   

z"RLEGForMultiModalEmbedding.forward)r   )__name__
__module____qualname____doc__r   r4   r?   r   r8   r   rQ   __classcell__r/   r/   r-   r0   r      s    	*)rU   typingr   r   r   torchvisionr   r#   modelscope.metainfor   modelscope.models.baser   modelscope.models.builderr   (modelscope.models.multi_modal.rleg.modelr   modelscope.outputsr	   modelscope.preprocessorsr
   modelscope.utils.constantr   r   modelscope.utils.loggerr   r!   __all__register_module generative_multi_modal_embeddingrlegr   r/   r/   r/   r0   <module>   s$   