o
    ߥiT                     @   s2  d Z ddlZddlmZ ddlmZmZ ddlZddlZ	ddl
Z
ddlm  mZ ddl
mZ ddlmZ ddlmZmZ G dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )z* Generative Multimodal Model Architecture.    N)OrderedDict)TupleUnion)nn)	LayerNorm)SimpleTokenizerclip_tokenizec                       s6   e Zd ZdZdZd	 fdd	ZdejfddZ  Z	S )

Bottleneckzd ResNet style bottleneck module
    From https://github.com/openai/CLIP/blob/main/clip/model.py
          c                    s  t    tj||ddd| _t|| _tj||dddd| _t|| _|dkr/t	|nt
 | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksb||tj krttdt	|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr   Fbias   )paddingr   Tinplacez-10)strider   1)super__init__r   Conv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dIdentityavgpool	expansionconv3bn3ReLUrelu
downsampler   r	   
Sequentialr   )selfinplanesplanesr   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/gemm/gemm_base.pyr   #   s6   

zBottleneck.__init__xc                 C   st   |}|  | | |}|  | | |}| |}| | |}| jd ur/| |}||7 }|  |}|S N)	r$   r   r   r   r   r   r"   r!   r%   )r'   r.   identityoutr,   r,   r-   forward;   s   



zBottleneck.forwardr   )
__name__
__module____qualname____doc__r    r   torchTensorr2   __classcell__r,   r,   r*   r-   r	      s
    r	   c                   @   s    e Zd ZdZdejfddZdS )	QuickGELUzd A quick version of GELU module
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    r.   c                 C   s   |t d|  S )NgZd;?)r8   sigmoidr'   r.   r,   r,   r-   r2   M   s   zQuickGELU.forwardN)r4   r5   r6   r7   r8   r9   r2   r,   r,   r,   r-   r;   H   s    r;   c                       sT   e Zd ZdZ	ddededejf fddZdejfd	d
ZdejfddZ	  Z
S )ResidualAttentionBlockzz Multihead attention block with residual link
    Adapted from https://github.com/openai/CLIP/blob/main/clip/model.py
    Nd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fcr
   geluc_proj)r   r   r   MultiheadAttentionattnr   ln_1r&   r   Linearr;   mlpln_2rA   )r'   r?   r@   rA   r*   r,   r-   r   V   s   



zResidualAttentionBlock.__init__r.   c                 C   s   | j d ur| j j|j|jdnd | _ | j }|d ur4|jd |jd kr4| j d |jd d |jd f }| j|||d|dd S )Ndtypedevicer   F)need_weightsrA   )rA   torL   rM   shaperF   )r'   r.   rA   r,   r,   r-   	attentiond   s   
"z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r/   )rQ   rG   rI   rJ   r=   r,   r,   r-   r2   m   s   zResidualAttentionBlock.forwardr/   )r4   r5   r6   r7   intr8   r9   r   rQ   r2   r:   r,   r,   r*   r-   r>   Q   s    	r>   c                       sN   e Zd ZdZ		ddedededejdef
 fd	d
ZdejfddZ	  Z
S )Transformerzh Transformer encoder module
    Adapted from https://github.com/openai/CLIP/blob/main/clip/model.py
    NFwidthlayersheadsrA   use_gcc                    sB   t    || _| _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r,   )r>   ).0_rA   rV   rT   r,   r-   
<listcomp>   s    
z(Transformer.__init__.<locals>.<listcomp>)	r   r   rW   rT   rU   r   r&   range	resblocks)r'   rT   rU   rV   rA   rW   r*   rZ   r-   r   x   s   
zTransformer.__init__r.   c                 C   s
   |  |S r/   )r]   r=   r,   r,   r-   r2      s   
zTransformer.forward)NF)r4   r5   r6   r7   rR   r8   r9   boolr   r2   r:   r,   r,   r*   r-   rS   s   s     rS   c                	       s>   e Zd ZdZ	ddedededef fddZd	d
 Z  ZS )AttentionPool2dzn Pool layer with attention module
    Adapted from https://github.com/openai/CLIP/blob/main/clip/model.py
    Nspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r   g      ?)r   r   r   	Parameterr8   randnpositional_embeddingrH   k_projq_projv_projrD   rb   )r'   r`   ra   rb   rc   r*   r,   r-   r      s   

zAttentionPool2d.__init__c              	   C   s>  | |jd |jd |jd |jd  ddd}tj|jddd|gdd}|| jd d d d d f |j }t	j
di d|d	|d
|d|jd d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddddd| jjd| jjddd| jdd\}}|ddd S )Nr   r   rd   r   Tdimkeepdimrl   querykeyvalueembed_dim_to_checkrb   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingrN   r,   )reshaperP   permuter8   catmeanrg   rO   rL   Fmulti_head_attention_forwardrb   ri   weightrh   rj   r   rD   r   
contiguous)r'   r.   rY   r,   r,   r-   r2      sb   
$

	
zAttentionPool2d.forwardr/   )r4   r5   r6   r7   rR   r   r2   r:   r,   r,   r*   r-   r_      s    r_   c                       s8   e Zd ZdZddddddd fdd	
Zd
d Z  ZS )CrossAttentionz Cross attention module with query and context as input
    Adapted from https://github.com/lucidrains/CoCa-pytorch/blob/main/coca_pytorch/coca_pytorch.py
    N@      Fr
   )context_dimdim_headrV   parallel_ffff_multnorm_contextc          
   	      s   t    || _|d | _|| }|d u r|n|}t|| _|r$t|nt | _tj	||dd| _
tj	||d dd| _tj	||dd| _|| }	|rettj	||	d ddt tj	|	|dd| _d S d | _d S )N      Fr   rd   )r   r   rV   scaler   normr   r   context_normrH   to_qto_kvto_outr&   SwiGLUff)
r'   rl   r   r   rV   r   r   r   	inner_dimff_inner_dimr*   r,   r-   r      s.   
	


zCrossAttention.__init__c           	      C   s   |  |}| |}| |}||jd |jd | jddddd }|| j }| 	|j
ddd\}}td||}||jddd	 }|jdd}td
||}|dddd |jd |jd d}| |}| jdurz|| | }|S )z
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        r   r   rs   rd   r   rn   zb h i d, b j d -> b h i jTrk   zb h i j, b j d -> b h i dN)r   r   r   viewrP   rV   r   r   r   r   chunkr8   einsumamaxsoftmaxr   r   r   )	r'   r.   contextqkvsimrF   r1   r,   r,   r-   r2      s2   
	





zCrossAttention.forward)r4   r5   r6   r7   r   r2   r:   r,   r,   r*   r-   r      s    r   c                       s8   e Zd ZdZ		d fdd	ZdddZd	d
 Z  ZS )ModifiedResNeta   Modified ResNet backbone
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       r   c                    s6  t    || _|| _tjd|d ddddd| _t|d | _tj|d |d dddd| _	t|d | _
tj|d |dddd| _t|| _td| _tjdd| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )Nr   rd   r   F)kernel_sizer   r   r   )r   r   r   Tr   r   )r   r
   r       )r   r   rc   input_resolutionr   r   r   r   r   r   r   r!   r"   r   r   r#   r$   	_inplanes_make_layerlayer1layer2layer3layer4r_   attnpool)r'   rU   rc   rV   r   rT   ra   r*   r,   r-   r      s4   


zModifiedResNet.__init__r   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr   )r	   r   r    r\   appendr   r&   )r'   r)   blocksr   rU   rY   r,   r,   r-   r     s
   
zModifiedResNet._make_layerc                    sJ    fdd}||}  |} |} |} |} |}|S )Nc                    sL    j  jf j jf j jffD ]\}} ||| } q | } | S r/   )r   r   r   r   r!   r"   r$   r   )r.   convbnr'   r,   r-   stem)  s   

z$ModifiedResNet.forward.<locals>.stem)r   r   r   r   r   )r'   r.   r   r,   r   r-   r2   '  s   




zModifiedResNet.forward)r   r   r3   )r4   r5   r6   r7   r   r   r2   r:   r,   r,   r*   r-   r      s    
	r   c                       sN   e Zd ZdZdededededededef fd	d
ZdejfddZ	  Z
S )VisualTransformerz^ ViT transformer backbone
    From https://github.com/openai/CLIP/blob/main/clip/model.py
    r   
patch_sizerT   rU   rV   rc   rW   c           	         s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t||||d| _t|| _t|t	|| | _d S )Nr   F)in_channelsout_channelsr   r   r   r   rd   r   )rW   )r   r   r   rc   r   r   r   re   r8   rf   class_embeddingrg   r   ln_prerS   transformerln_postproj)	r'   r   r   rT   rU   rV   rc   rW   r   r*   r,   r-   r   ?  s&   




zVisualTransformer.__init__r.   c                 C   s   |  |}||jd |jd d}|ddd}tj|jd d|jd |j|jd}tj| j	
|j| |gdd}|| j
|j }| |}|ddd}| |}|ddd}| |}| jd urk|| j }|S )Nr   r   rs   rd   rK   rn   )r   r   rP   r   r8   zerosrL   rM   r   r   rO   rg   r   r   r   r   )r'   r.   zr,   r,   r-   r2   S  s    
 




zVisualTransformer.forward)r4   r5   r6   r7   rR   r^   r   r8   r9   r2   r:   r,   r,   r*   r-   r   :  s    r   c                   @   s   e Zd ZdZdededeeeeeef ef dedededed	ed
ededefddZdddZ	e
dd ZdddZdddZdd ZdS )GEVLa   Generative vision-language model
    Support learning from both generative and contrastive loss.
    Given image and text input, it could output the features of
    image and text respectively. Furthermore, caption could also
    be produced when image input is available.
    ra   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersrW   c              	   C   s  t j|  || _|| _|| _t|ttfr'|d d }t	|||||d| _
n|d }t|||||||d| _
t||
|	|  |d| _|| _t ||| _t t| j|| _t|| _t t||| _t jj| j| jjd d t t||| _t tg td | _ t|d	|	| | j| j | j|d| _!t "t|t #||t j#||d
d| _$t tg tt| | _%t t|| _&| jj'| j$d _'| j&| j$d _&t t(| j|| _)t*|dd| _+t|| _,d S )Nr   r   )rU   rc   rV   r   rT   )r   r   rT   rU   rV   rc   rW   )rT   rU   rV   rA   rW   r   )stdg$I$I,@r
   Fr   rs   T)rl   r   )-r   Moduler   r   vis_token_size	tokenizer
isinstancetuplelistr   visualr   rS   build_attention_maskr   r   	Embeddingtoken_embeddingre   r8   emptyrg   r   ln_finalvis_token_projectioninitnormal_rT   text_projectiononesnploglogit_scaledecoderr&   rH   	to_logitsgen_logit_scaler   r   rf   img_queriesr   img_attn_poolimg_attn_pool_norm)r'   ra   r   r   r   r   r   r   r   r   r   rW   r   vision_headsr,   r,   r-   r   m  s   
	




zGEVL.__init__Nr   c                 C   sb   |d u r| j n|}t||}|tttjj |d |dkr/d|d |d |f< |S )Nr   r   )	r   r8   r   fill_tensorfinfofloat16mintriu_)r'   
seq_lengthprefix_lengthmaskr,   r,   r-   r     s   
zGEVL.build_attention_maskc                 C   s   | j jjjS r/   )r   r   r   rL   r   r,   r,   r-   rL     s   z
GEVL.dtypeFc                 C   sd   |  |}|d d dd d f }||jdddd }|r0|d d dd d d f | j }||fS |S )Nr   rs   rd   Trl   prm   r   )r   r   r   )r'   imagereturn_tokensimage_outputsimage_featuresimage_tokensr,   r,   r-   encode_image  s   
 zGEVL.encode_imagec                 C   s   |  |}|| jd |jd d d f  }|ddd}| |}|ddd}| |}|t|jd |jdddf | j	 }||j
dddd }|rS|}||fS |S )	Nr   r   rd   rs   rn   .Tr   )r   rg   rP   r   r   r   r8   arangeargmaxr   r   )r'   textr   r.   text_featurestext_tokensr,   r,   r-   encode_text  s$   
 

zGEVL.encode_textc                 C   s  | j |dd\}}| j|jd dd}| ||}| |}| jjd }| jjd }|j|jd dt	j
d| }|}	g }
t| jD ]g}| j|dd\}}t	j||gdd	}	| |	ddd
 }| |dd df }tj| j | dd}t	j|dt	|  dd	}t||kst|dkr n|
| t	j||dd gdd	}qBt	j|
dd	dd}g }|D ]}g }|D ]	}|t| q| j|}| }|| q||d fS )NT)r   r   rs   z<|startoftext|>z<|endoftext|>r   )rL   )axisrd   .rn   g       @)r   r   expandrP   r   r   r   encodernew_onesr8   longr\   r   r   r   r   r   r   r   r   r   r   expr   	rand_likerR   r   decodestrip)r'   r   r   r   r   img_token_features	sot_token	eot_token
text_inputinput_tokenspred_tokenstext_idxr   r   out_embs
gen_logitsprobspredpred_text_tokens	text_list
out_tokenstokensr.   out_textr,   r,   r-   image_to_text  sT   



zGEVL.image_to_textNr   )F)r4   r5   r6   r7   rR   r   r   r^   r   r   propertyrL   r   r   r  r,   r,   r,   r-   r   e  s:    


I	


r   c                       sB   e Zd ZdZ fddZdd Zdd Ze dd
dZ	  Z
S )	GEMMModelz Generative multi-modal model, wrapper of GEVL module.
    It takes image or text or both of them as input, and output
    features of input or caption when image input is available.
    c                    s   t    td|ddd}t| }W d    n1 s!w   Y  t| d }|| }t	j
|d}t|| _tg || jR  | _d S )Nz{}/encoder_config.jsonrzutf-8)encodingr   zbpe_vocab_16e6.txt.gz)r   r   openformatjsonloadsreadr   keysospathjoinr   r   r   model)r'   	model_dirfmodel_config
model_nameconfig_argsbpe_pathr*   r,   r-   r     s   


zGEMMModel.__init__c                 C   s   t | j|gd }|S r  )r   r   )r'   text_strtext_tensorr,   r,   r-   tokenize  s   zGEMMModel.tokenizec                 C   s   |   }|S r/   )cpunumpy)r'   featr1   r,   r,   r-   
parse_feat  s   zGEMMModel.parse_featNTc                 C   sv   d\}}}|r|d ur| j |\}}| |}n|d ur&| | j |}|d ur3| | j |}|||d}|S )N)NNN)image_featuretext_featurecaption)r(  r  r5  r   r   )r'   r   r   
captioningimg_featurer7  r8  r1   r,   r,   r-   r2     s   
zGEMMModel.forward)NNT)r4   r5   r6   r7   r   r1  r5  r8   no_gradr2   r:   r,   r,   r*   r-   r    s    r  )r7   r%  collectionsr   typingr   r   r!  r3  r   r8   torch.nn.functionalr   
functionalr   torch.nnr   ,modelscope.models.multi_modal.gemm.tokenizerr   r   r   r	   r;   r>   rS   r_   r   r   r   r   r  r,   r,   r,   r-   <module>   s,   
,	"0<C+ !