#!/usr/bin/env python3
# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Code modified from
# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py ;
# https://github.com/facebookresearch/deit/blob/main/models.py
# and https://github.com/facebookresearch/vissl/blob/main/vissl/models/trunks/vision_transformer.py

from functools import partial
from typing import Callable, List, Optional

import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.layers import DropPath, trunc_normal_


class Attention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
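
# Illustrative usage (a sketch, not part of the original module): Attention is
# batch-first and expects (B, N, C) input with C divisible by num_heads, e.g.
#
#   attn = Attention(dim=768, num_heads=12, qkv_bias=True)
#   y = attn(torch.randn(2, 197, 768))  # -> shape (2, 197, 768)
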
class Mlp(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class MultiheadAttention(nn.MultiheadAttention):
    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        return super().forward(x, x, x, need_weights=False, attn_mask=attn_mask)[0]


class ViTAttention(Attention):
    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        assert attn_mask is None
        return super().forward(x)


class BlockWithMasking(nn.Module):
    def __init__(
        self,
        dim: int,
        attn_target: Callable,
        mlp_ratio: int = 4,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        ffn_dropout_rate: float = 0.0,
        drop_path: float = 0.0,
        layer_scale_type: Optional[str] = None,
        layer_scale_init_value: float = 1e-4,
    ):
        super().__init__()

        assert not isinstance(
            attn_target, nn.Module
        ), "attn_target should be a Callable. Otherwise attn_target is shared across blocks!"
        self.attn = attn_target()
        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm_1 = norm_layer(dim)
        mlp_hidden_dim = int(mlp_ratio * dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=ffn_dropout_rate,
        )
        self.norm_2 = norm_layer(dim)
        self.layer_scale_type = layer_scale_type
        if self.layer_scale_type is not None:
            assert self.layer_scale_type in [
                "per_channel",
                "scalar",
            ], f"Found Layer scale type {self.layer_scale_type}"
            if self.layer_scale_type == "per_channel":
                # one gamma value per channel
                gamma_shape = [1, 1, dim]
            elif self.layer_scale_type == "scalar":
                # single gamma value for all channels
                gamma_shape = [1, 1, 1]
            # two gammas: for each part of the fwd in the encoder
            self.layer_scale_gamma1 = nn.Parameter(
                torch.ones(size=gamma_shape) * layer_scale_init_value,
                requires_grad=True,
            )
            self.layer_scale_gamma2 = nn.Parameter(
                torch.ones(size=gamma_shape) * layer_scale_init_value,
                requires_grad=True,
            )

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor):
        if self.layer_scale_type is None:
            x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask))
            x = x + self.drop_path(self.mlp(self.norm_2(x)))
        else:
            x = (
                x
                + self.drop_path(self.attn(self.norm_1(x), attn_mask))
                * self.layer_scale_gamma1
            )
            x = x + self.drop_path(self.mlp(self.norm_2(x))) * self.layer_scale_gamma2
        return x
_LAYER_NORM = partial(nn.LayerNorm, eps=1e-6)


class SimpleTransformer(nn.Module):
    def __init__(
        self,
        attn_target: Callable,
        embed_dim: int,
        num_blocks: int,
        block: Callable = BlockWithMasking,
        pre_transformer_layer: Optional[Callable] = None,
        post_transformer_layer: Optional[Callable] = None,
        drop_path_rate: float = 0.0,
        drop_path_type: str = "progressive",
        norm_layer: Callable = _LAYER_NORM,
        mlp_ratio: int = 4,
        ffn_dropout_rate: float = 0.0,
        layer_scale_type: Optional[str] = None,  # from cait; possible values are None, "per_channel", "scalar"
        layer_scale_init_value: float = 1e-4,  # from cait; float
        weight_init_style: str = "jax",  # possible values jax or pytorch
    ):
        """
        Simple Transformer with the following features
        1. Supports masked attention
        2. Supports DropPath
        3. Supports LayerScale
        4. Supports Dropout in Attention and FFN
        5. Makes few assumptions about the input except that it is a Tensor
        """
        super().__init__()
        self.pre_transformer_layer = pre_transformer_layer
        if drop_path_type == "progressive":
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, num_blocks)]
        elif drop_path_type == "uniform":
            dpr = [drop_path_rate for i in range(num_blocks)]
        else:
            raise ValueError(f"Unknown drop_path_type: {drop_path_type}")

        self.blocks = nn.Sequential(
            *[
                block(
                    dim=embed_dim,
                    attn_target=attn_target,
                    mlp_ratio=mlp_ratio,
                    ffn_dropout_rate=ffn_dropout_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    layer_scale_type=layer_scale_type,
                    layer_scale_init_value=layer_scale_init_value,
                )
                for i in range(num_blocks)
            ]
        )
        self.post_transformer_layer = post_transformer_layer
        self.weight_init_style = weight_init_style
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            if self.weight_init_style == "jax":
                torch.nn.init.xavier_uniform_(m.weight)
            elif self.weight_init_style == "pytorch":
                trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
    def forward(
        self,
        tokens: torch.Tensor,
        attn_mask: torch.Tensor = None,
        use_checkpoint: bool = False,
        checkpoint_every_n: int = 1,
        checkpoint_blk_ids: Optional[List[int]] = None,
    ):
        """
        Inputs
        - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation)
        - attn_mask: mask of shape L x L

        Output
        - x: data of shape N x L x D (or L x N x D depending on the attention implementation)
        """
        if self.pre_transformer_layer:
            tokens = self.pre_transformer_layer(tokens)
        if use_checkpoint and checkpoint_blk_ids is None:
            checkpoint_blk_ids = [
                blk_id
                for blk_id in range(len(self.blocks))
                if blk_id % checkpoint_every_n == 0
            ]
        if checkpoint_blk_ids:
            checkpoint_blk_ids = set(checkpoint_blk_ids)
        for blk_id, blk in enumerate(self.blocks):
            if use_checkpoint and blk_id in checkpoint_blk_ids:
                tokens = checkpoint.checkpoint(
                    blk, tokens, attn_mask, use_reentrant=False
                )
            else:
                tokens = blk(tokens, attn_mask=attn_mask)
        if self.post_transformer_layer:
            tokens = self.post_transformer_layer(tokens)
        return tokens