o
    ߥi=                     @   s  d dl mZmZ d dlZd dlZd dlZd dlmZ d dl	m  m
Z d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z! d dl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( e# Z)dgZ*ej+e!j,ej-dG dd deZ.dS )    )AnyDictN)Image)BertWordPieceTokenizer)Compose	NormalizeResizeToTensor)Models)
TorchModel)MODELS)
OutputKeys)	LoadImage)	ModelFileTasks)
get_logger   )TEAMBertWrapperCLIPVisionWrapper
CrossLayerTEAMForMultiModalSimilarity)module_namec                       sj   e Zd Zd fdd	Zdd Zdeeef deeef fdd	Zd
eeef deeef fddZ	  Z
S )r   r   c                    s   t  j|||d| td|ddd}d |j_t }t||d|tj	d| _
| j
  || _| jdkrPtj rP| j
d	| j td
| j nd| _td td|tjdd| _| jjdd tdd}ttdtjdt |g| _d S )N)	model_dir	device_idz{}/text_config.jsoni   i   )config_jsonfeat_dim	token_dimz{}/{})
pretrainedr   cuda:{}zUse GPU: {}zUse CPU for inferenceF)	lowercase   )
max_length)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)   r$   )interpolation)super__init__r   formatbertclsr   r   r   TORCH_MODEL_BIN_FILEmodelevalr   torchcudais_availabletologgerinfor   
VOCAB_FILEtext_tokenizerenable_truncationr   r   r   r   BICUBICr	   img_preprocessor)selfr   r   argskwargs
text_modelimage_modelnorm_op	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/team/team_model.pyr'      sJ   

z$TEAMForMultiModalSimilarity.__init__c                 C   sz   | j |}d}td|f }td|f}|j|j}}t||ddt|f< t||ddt|f< ||fS )Nr"   r   r   )	r5   encoder.   zeroslongidsattention_masktensorlen)r9   text_strtokens
max_tokenstext_ids_tensortext_mask_tensortext_ids	text_maskrA   rA   rB   tokenize_textC   s   z)TEAMForMultiModalSimilarity.tokenize_textinputreturnc                 C   s  t   d|v r@|d d ur@|d }t|}| |d }| jdkr-|d| j}| j	d d |\}}}}|
  }nd\}}d|v r|d d ur|d }t|tr_| |\}}	n	tdt| | jdkr|d| j}|	d| j}	| j	||	d \}
}}}|

  }
nd\}}	|d ur|	d ur|d ur| j||	|d  }nd }tj|tj|
tj|i}|W  d    S 1 sw   Y  d S )Nimg)N.r   r   )NNtextztext should be str, but got )r.   no_gradr   convert_to_imgr8   r   r1   r(   r,   get_featurecpunumpy
isinstancestrrQ   	TypeErrortypeget_cross_scoreitemr   IMG_EMBEDDINGTEXT_EMBEDDINGSCORES)r9   rR   	input_img
img_tensor_image_featureimage_tensorsrJ   rM   rN   text_featuretext_tensorsscoreoutputrA   rA   rB   forwardM   sf   





$z#TEAMForMultiModalSimilarity.forwardinputsc                 C   s   |S )NrA   )r9   rn   rA   rA   rB   postprocess~   s   z'TEAMForMultiModalSimilarity.postprocess)r   )__name__
__module____qualname__r'   rQ   r   r\   r   rm   ro   __classcell__rA   rA   r?   rB   r      s
    %"
*1)/typingr   r   cv2rZ   npr.   torch.nnnntorch.nn.functional
functionalFPILr   
tokenizersr   torchvision.transformsr   r   r   r	   modelscope.metainfor
   modelscope.models.baser   modelscope.models.builderr   modelscope.outputsr   modelscope.preprocessorsr   modelscope.utils.constantr   r   modelscope.utils.loggerr   utilsr   r   r   r   r2   __all__register_modulemulti_modal_similarityteamr   rA   rA   rA   rB   <module>   s*   