o
    ߥi-                     @   s0  d dl mZ d dlmZmZ d dlZd dlZd dlm	  m
Z d dlm  mZ d dlm	Z	 d dlmZmZ d dlmZ G dd de	jZG d	d
 d
e	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZdS )    )OrderedDict)TupleUnionN)nn)
BertConfigBertForMaskedLM)compatible_position_idsc                       s(   e Zd ZdZdejf fddZ  ZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.xc                    s$   |j }t |tj}||S N)dtypesuperforwardtypetorchfloat32)selfr
   	orig_typeret	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/team/utils.pyr      s   
zLayerNorm.forward)__name__
__module____qualname____doc__r   Tensorr   __classcell__r   r   r   r   r	      s    r	   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr
   c                 C   s   |t d|  S )NgZd;?)r   sigmoidr   r
   r   r   r   r   !   s   zQuickGELU.forwardN)r   r   r   r   r   r   r   r   r   r   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fc   geluc_proj)r   __init__r   MultiheadAttentionattnr	   ln_1
Sequentialr   Linearr   mlpln_2r%   )r   r#   r$   r%   r   r   r   r*   '   s   



zResidualAttentionBlock.__init__r
   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )Nr   deviceF)need_weightsr%   r   )r%   tor   r3   r,   r!   r   r   r   	attention6   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r   )r6   r-   r0   r1   r!   r   r   r   r   =   s   zResidualAttentionBlock.forwardr   )
r   r   r   intr   r   r*   r6   r   r   r   r   r   r   r"   %   s    r"   c                	       sF   e Zd Z		ddedededejf fddZd	ejfd
dZ  ZS )TransformerNFwidthlayersheadsr%   c                    sB   t    || _| _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r   )r"   ).0_r%   r;   r9   r   r   
<listcomp>O   s    
z(Transformer.__init__.<locals>.<listcomp>)	r   r*   use_gcr9   r:   r   r.   range	resblocks)r   r9   r:   r;   r%   r@   r   r>   r   r*   E   s   
zTransformer.__init__r
   c                 C   s,   | j r| jD ]}t||}q|S | |S r   )r@   rB   
checkpoint)r   r
   
each_blockr   r   r   r   T   s
   

zTransformer.forward)NF)	r   r   r   r7   r   r   r*   r   r   r   r   r   r   r8   C   s    r8   c                       sJ   e Zd Z	ddedededededef fdd	Zd
ejfddZ  ZS )VisionTransformerFinput_resolution
patch_sizer9   r:   r;   
output_dimc           	         s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t||||d| _t|| _t|t	|| | _d S )N   F)in_channelsout_channelskernel_sizestridebiasg            )r@   )r   r*   rF   rH   r   Conv2dconv1	Parameterr   randnclass_embeddingpositional_embeddingr	   ln_prer8   transformerln_postproj)	r   rF   rG   r9   r:   r;   rH   r@   scaler   r   r   r*   _   s&   




zVisionTransformer.__init__r
   c                 C   s   |  |}||jd |jd d}|ddd}| j|jtj|jd d|jd |j|j	d }tj
||gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urt|| j }|S Nr   rP   rO   r2   dim)rR   reshapeshapepermuterU   r5   r   r   zerosr3   catrV   rW   rX   rY   rZ   )r   r
   rU   r   r   r   r   |   s(   
"




zVisionTransformer.forward)F)	r   r   r   r7   r*   r   r   r   r   r   r   r   r   rE   ]   s     	rE   c                       $   e Zd Z fddZdd Z  ZS )CLIPVisionWrapperc                    s$   t    tddddddd| _d S )N                  )rF   rG   r9   r:   r;   rH   )r   r*   rE   vision_transformer)r   r   r   r   r*      s   
zCLIPVisionWrapper.__init__c                 C   s  | j |}||jd |jd d}|ddd}| j j|jtj	|jd d|jd |j|j
d }tj||gdd}|| j j|j }| j |}|ddd}| j |}|ddd}| }| j |d d dd d f }| j jd ur|| j j }||fS r\   )rm   rR   r`   ra   rb   rU   r5   r   r   rc   r3   rd   rV   rW   rX   clonerY   rZ   )r   r
   rU   x_tensorr   r   r   r      s*   "
zCLIPVisionWrapper.forwardr   r   r   r*   r   r   r   r   r   r   rf      s    
rf   c                       re   )BertWrapperc                    sH   t t|   t|}t|j| _tjd|dd| _	td|| _
d S )Nrl   F)rN   )r   rq   r*   r   from_json_filer   bertr   r/   	projectorprojector_token_embeds)r   config_jsonfeat_dim	token_dimbert_configr   r   r   r*      s
   
zBertWrapper.__init__c                 C   sT   ||d}| j di |ddi}|d }|d d dd d f }| || |fS )N)	input_idsattention_maskreturn_dictFr   r   )rs   rt   ru   )r   rz   r{   trans_featuresoutput_statesoutput_tokens
cls_tokensr   r   r   r      s   zBertWrapper.forwardrp   r   r   r   r   rq      s    rq   c                       s0   e Zd Zddejdf fdd	Zdd Z  ZS )MlpNg        c                    sN   t    |p|}|p|}t||| _| | _t||| _t|| _d S r   )	r   r*   r   r/   fc1actfc2Dropoutdrop)r   in_featureshidden_featuresout_features	act_layerr   r   r   r   r*      s   
zMlp.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   r!   r   r   r   r      s   




zMlp.forward)r   r   r   r   GELUr*   r   r   r   r   r   r   r      s    r   c                       re   )
CrossLayerc                    s   t t|   t|| _t|| _t|| _tj|dd| _	tj|dd| _
t||| dd| _td| _td| _td| _d S )Nrk   )	embed_dim	num_headsg?)r   r   r   )r   r   r*   r   r	   norm1norm2norm3r+   	self_attn
cross_attnr   ffnr   dropout1dropout2dropout3)r   rw   	mlp_ratior   r   r   r*      s$   zCrossLayer.__init__c              	   C   s   |  |}| j|| ddd|| ddd|ddd|dkdd ddd}|| | }| |}| || ddd|ddd|dddd ddd}|| | }| |}|| | 	| }|S )NrP   r   rO   )key_padding_mask)
r   r   rb   r   r   r   r   r   r   r   )r   text_tensors
text_masksimage_tensorsretrieved_tensorsretrieved_tensors_resr   r   r   r      s@   




zCrossLayer.forwardrp   r   r   r   r   r      s    r   c                       s.   e Zd Z fddZdddZdd Z  ZS )	TEAMc                    st   t t|   || _|| _ttdddg| _t	dd| _
t	dd| _t|d}t|d | j|dd d S )	Nri   rO   )rw   r   rl   cpuz'text_model.bert.embeddings.position_idsT)strict)r   r   r*   
text_modelimage_modelr   
ModuleListr   cross_modelr/   image_tensor_fctext_tensor_fcr   loadr   load_state_dict)r   r   r   
pretrainedparamsr   r   r   r*     s   zTEAM.__init__Nc                 C   sn   |d ur|  ||\}}tj|ddd}nd\}}|d ur-| |\}}tj|ddd}nd\}}||||fS )N       @rP   pr_   )NN)r   F	normalizer   )r   	text_data	text_mask
img_tensortext_featurer   image_featurer   r   r   r   get_feature,  s   zTEAM.get_featurec                 C   s   t |}g }| |}||j}| jD ]=}|||||}| |}	t jtj	|	dddtj	|ddd dd}
t j|
| ddt j
t j|dddd }|| q|S )Nr   rO   r   r^   rP   g      ?)min)r   
zeros_liker   r   r   r   r   sumr   r   clampappend)r   r   r   r   r   pair_score_listtext_tensors_projtext_mask_floateach_cross_modelretrieved_tensors_proj
pair_scorepair_score_reducedr   r   r   get_cross_score;  s2   



zTEAM.get_cross_score)NNN)r   r   r   r*   r   r   r   r   r   r   r   r     s    
r   )collectionsr   typingr   r   numpynpr   torch.nn.functionalr   
functionalr   torch.utils.checkpointutilsrC   transformersr   r   -modelscope.utils.compatible_with_transformersr   r	   Moduler   r"   r8   rE   rf   rq   r   r   r   r   r   r   r   <module>   s&   	7%0