o
    }oi+z                     @   s  d Z ddlZddlmZ ddlZddlmZ ddlm  m	Z
 ddlm  mZ ddlmZ ddlmZ ejrDddlmZmZmZmZ nddlmZmZmZmZ ddlmZmZmZmZmZmZm Z  dd	 Z!zdd
l"m#Z#m$Z$ e! Z%W n e&y~   dZ%Y nw G dd dej'Z(G dd dej'Z)G dd dej*e)e(Z+G dd de)Z,G dd de)Z-G dd dej'Z.G dd de.Z/G dd de.Z0G dd de(Z1G dd dej'Z2G d d! d!e(Z3G d"d# d#ej'Z4G d$d% d%ej'Z5G d&d' d'e(Z6dS )(z]
Adapted from:
https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/unet.py
    N)abstractmethod)	rearrange)attention_alt)QKVAttentionQKVMaskedAttentionQKVStableAttentionQKVStableMaskedAttention)
DownsampleUpsampleUpsampleLearnableconv_ndlinearnormalizationzero_modulec                  C   sN   t j s	tdt j } t j| }|jdko|jdk}|jdk}|p&|S )NzCUDA is not available         )thcudais_availableRuntimeErrorcurrent_deviceget_device_propertiesmajorminor)
cur_devicedpropsis_sm75is_sm8x_or_later r   v/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/multimodal/modules/imagen/diffusionmodules/blocks.py
check_cuda6   s   


r!   )flash_attn_varlen_funcflash_attn_varlen_kvpacked_funcFc                   @      e Zd ZdZedd ZdS )TextConditionedBlockzH
    Any module where forward() takes text embeddings as arguments.
    c                 C      dS )zj
        Apply the module to `x` given `text_emb` text embedding and 'text_mask' text valid mask.
        Nr   )selfxtext_emb	text_maskr   r   r    forwardO       zTextConditionedBlock.forwardN__name__
__module____qualname____doc__r   r+   r   r   r   r    r%   J       r%   c                   @   r$   )TimestepBlockzT
    Any module where forward() takes timestep embeddings as a second argument.
    c                 C   r&   )zJ
        Apply the module to `x` given `emb` timestep embeddings.
        Nr   r'   r(   embr   r   r    r+   [   r,   zTimestepBlock.forwardNr-   r   r   r   r    r3   V   r2   r3   c                   @   s   e Zd ZdZdd ZdS )ConditionalSequentialz
    A sequential module that accepts timestep embeddings, text embedding and text mask in addition to the input x.
    Depending on the type of block, we either pass timestep embedding or text embeddings as inputs.
    c                 C   sD   | D ]}t |tr|||}qt |tr||||}q||}q|S N)
isinstancer3   r%   )r'   r(   r5   r)   r*   layerr   r   r    r+   h   s   


zConditionalSequential.forwardN)r.   r/   r0   r1   r+   r   r   r   r    r6   b   s    r6   c                       sB   e Zd ZdZ								d fdd	Zdd Zd	d
 Z  ZS )ResBlocka  
    A residual block that can optionally change the number of channels.

    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NF   c                    sz  t    || _|| _|| _|p|| _|| _|| _|| _t	
t|t	 t||| jddd| _|	p3|
| _|r:t}nt}|	rM||d|| _||d|| _n|
r^t|d|| _t|d|| _nt	  | _| _t	
t	 t||rtd| j n| j| _t	
t| jt	 t	j|dtt|| j| jddd| _| j|krt	 | _d S |rt||| jddd| _d S t||| jd| _d S )N      paddingFr;   )p)super__init__channelsemb_channelsdropoutout_channelsuse_convuse_checkpointuse_scale_shift_normnn
Sequentialr   SiLUr   	in_layersupdownr   r
   h_updx_updr	   Identityr   
emb_layersDropoutr   
out_layersskip_connection)r'   rC   rD   rE   rF   rG   rI   dimsrH   updownlearnable_upsamplingupsample_fn	__class__r   r    rB      sT   




zResBlock.__init__c                 C   "   | j rt| j||S | ||S a	  
        Apply the block to a Tensor, conditioned on a timestep embedding.

        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        rH   
checkpoint_forwardr4   r   r   r    r+         zResBlock.forwardc                 C   s   | j r#| jd d | jd }}||}| |}| |}||}n| |}| |}t|jt|jk rE|d }t|jt|jk s7| jrn| jd | jdd  }}t	j
|ddd\}	}
||d|	  |
 }||}n	|| }| |}| || S )N.Nr   r=   r;   dim)rN   rM   rO   rP   rR   lenshaperI   rT   r   chunkrU   )r'   r(   r5   in_restin_convhemb_outout_normout_restscaleshiftr   r   r    ra      s&   






zResBlock._forward)NFFr;   FFFFr.   r/   r0   r1   rB   r+   ra   __classcell__r   r   r[   r    r:   s   s    Ar:   c                       s<   e Zd ZdZ					d fdd	Zdd Zd	d
 Z  ZS )EfficientResBlocka  
    A residual block that can optionally change the number of channels.
    Follow Figure A.27 in Imagen Paper.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NFr;   c                    s   t    |p|}|| _|| _tt|t t|||ddd| _	tt t
||r0d| n|| _tt|t tt|||ddd| _t|||d| _|r^dtd | _d S d| _d S )Nr<   r=   r>   r;   )rA   rB   rI   rH   rJ   rK   r   rL   r   rM   LinearrR   r   rT   shortcutmathsqrtshortcut_scale)r'   rC   rD   rF   rI   rV   rH   skip_connection_scalingr[   r   r    rB      s*   

"zEfficientResBlock.__init__c                 C   r]   r^   r_   r4   r   r   r    r+     rb   zEfficientResBlock.forwardc           	      C   s   |  |}| |}t|jt|jk r"|d }t|jt|jk s| jrK| jd | jdd  }}tj|ddd\}}||d|  | }||}n	|| }| |}|| || j	  S )Nrd   r   r=   r;   re   )
rM   rR   rg   rh   rI   rT   r   ri   rv   ry   )	r'   r(   r5   rl   rm   rn   ro   rp   rq   r   r   r    ra   ,  s   



zEfficientResBlock._forward)NFr;   FFrr   r   r   r[   r    rt      s    &rt   c                       sD   e Zd Z												d fdd		Zedd
dZ  ZS )BlockNTr;   r   Frc   r   c              	      s   t    p||| _|| _t| dg}| fddt|d D 7 }t|| _| jd ur~| jdv sAJ t	 }| jdkrLt
}n!| jdkrYt}| j|d< n| jd	krft}| j|d< nt}| j|d< |f||
||	d
|| _d S d S )NrF   rI   rV   rH   rz   c                    s"   g | ]}t  d qS )r|   )rt   ).0_rV   rD   rF   rz   rH   rI   r   r    
<listcomp>b  s    
z"Block.__init__.<locals>.<listcomp>r=   )r'   crossfusedstackedr'   r   context_dimr   )	num_headsnum_head_channelsrH   stable_attentionflash_attention)rA   rB   attention_typetext_embed_dimrt   rangerJ   
ModuleListblocksdictSelfAttentionBlockCrossAttentionBlockStackedCrossAttentionBlockFusedCrossAttentionBlockattention_layer)r'   rC   rD   rF   rI   num_resblocksr   r   r   r   r   r   rV   rH   rz   r   attention_kwargsattention_fnr[   r   r    rB   >  sV   







zBlock.__init__c                 C   s   d S r7   r   )r'   r(   r5   
text_embedr*   r   r   r    r+     s   zBlock.forward)NTr;   Nr   TFrc   r   r;   FFNN)r.   r/   r0   rB   r   r+   rs   r   r   r[   r    r{   =  s     Pr{   c                       D   e Zd Z														d fdd		Zdd
dZ  ZS )DBlockNTr;   r   Frc   r   c                    sR   t  j|||||||	|
||||||d || _| jr'tj||d|dd| _d S d S )NrF   rI   r   r   r   r   r   r   r   rV   rH   rz      r=   )strider?   )rA   rB   	conv_downrJ   Conv2dconv)r'   rC   rD   rF   rI   r   r   r   r   r   r   r   r   r   rV   rH   rz   r[   r   r    rB     s(   zDBlock.__init__c                 C   sZ   | j r| |}| jD ]}|||}q| jdv r!| |||}|S | jdkr+| |}|S N)r   r   r   r'   )r   r   r   r   r   r'   r(   r5   r   r*   blockr   r   r    r+     s   




zDBlock.forwardNTTr;   r;   Nr   TFrc   r   r;   FFr   r.   r/   r0   rB   r+   rs   r   r   r[   r    r     s"    )r   c                       r   )UBlockNTr;   r   Frc   r   c                    sP   t  j|||||||	|
||||||d || _| jr&t||d|d| _d S d S )Nr   r   r=   )rA   rB   conv_uprJ   ConvTranspose2dr   )r'   rC   rD   rF   rI   r   r   r   r   r   r   r   r   r   rV   rH   rz   r[   r   r    rB     s(   zUBlock.__init__c                 C   sX   | j D ]}|||}q| jdv r| |||}n
| jdkr"| |}| jr*| |}|S r   )r   r   r   r   r   r   r   r   r    r+     s   




zUBlock.forwardr   r   r   r   r   r[   r    r     s"    (r   c                       <   e Zd ZdZ					d fdd	Zdd	 Zd
d Z  ZS )r   za
    An attention block that fuses self-attention and cross-attention
    in a single block.
    r=   rc   FTc                    s   t    || _|dkr|| _n|| dks J d| d| || | _|| _|| _t|| _t|| _t|| _	t
d||d| _t
d||d d| _t
d||d d| _|rets^J d|rdJ dn|rnt| j| _nt| j| _tt
d||d| _d S 	Nrc   r   q,k,v channels ' is not divisible by num_head_channels r=   r;    FlashAttention is not installed./FlashAttention doesn't support the stable form.)rA   rB   rC   r   rH   r   r   normnorm_context	norm_selfr   q
kv_contextkv_selfflash_attn_installedr   	attentionr   r   proj_outr'   rC   r   r   r   rH   r   r   r[   r   r    rB   
  s.   





z!FusedCrossAttentionBlock.__init__c                 C   &   | j rt| j|||S | |||S r7   r_   r'   r(   contextmaskr   r   r    r+   7     z FusedCrossAttentionBlock.forwardc                  C   s  |j ^}}}|||d}| | |}| | |}|jddd\}	}
|	 }	|
 }
t	|d}| 
|}| |}|jddd\}}| }| }tj|	|gdd}tj|
|gdd}| jr|j d }|j d |j d |j d  }}t|d| jd}tj||f|jtjd	}| }tj||gdd}|dd| }|j d }||| jd}|dd| }||| jd}tj||gdd}tjd|d | |tj|jd
}tj|d tj|jd}tj|jdddd|dd < ||7 }t||||||d}t|d|| jd}nCtj|	j d |j d |	j d |jd}||j d d|j d }|d|j d d}tj||gdd}|tj}| ||||\}}| |}|| j||g|R  S )Nrc   r;   r=   re   r   r;   r=   r   b (h d) s -> (b s) h drl   )devicedtypestepr   r   r   r           (b s) h d -> b (h d) sbrl   )r   ) rh   reshaper   r   r   r   ri   
contiguousr   permuter   r   catr   r   r   onesr   bool	transposeviewstackarangeint32zeroscumsumsumr#   repeattor   r   ) r'   r(   r   r   r   cspatialr   r   k_selfv_self	context_nr   	k_context	v_contextk_fullv_full
batch_sizemax_seqlen_qmax_seqlen_k	mask_selfmask_context	mask_fullk_full_unpaddedtotal_kv_full_unpaddedkv_full_unpaddedcu_seqlens_qcu_seqlens_koutrl   r~   r   r   r    ra   =  s\   


 
&
z!FusedCrossAttentionBlock._forwardr=   rc   FTFrr   r   r   r[   r    r     s    	-r   c                       s<   e Zd ZdZ					d fdd	Zdd Zd	d
 Z  ZS )r   a  
    An attention block that allows spatial positions to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    r=   rc   Fc                    s   t    || _|dkr|| _n|| dks J d| d| || | _|| _t|| _td||d d| _|| _	|rIt
sBJ d|rHJ dn|rRt| j| _nt| j| _ttd||d| _d S )	Nrc   r   r   r   r=   r<   r   r   )rA   rB   rC   r   rH   r   r   r   qkvr   r   r   r   r   r   r   )r'   rC   r   r   rH   r   r   r[   r   r    rB     s&   
	

zSelfAttentionBlock.__init__c                 C   s   | j r
t| j|S | |S r7   r_   )r'   r(   r   r   r    r+     s   
zSelfAttentionBlock.forwardc              	   C   s6  |j ^}}}|||d}| | |}| jr|j \}}}| j}|jddd\}}	}
|j d |	j d }}t|d| jd}t|	d| jd}	t|
d| jd}
tj	d|d | |tj
|jd	}tj	d|d | |tj
|	jd	}t||	|
||||d
}t|d|| jd}n| |\}}| |}|| j||g|R  S )Nrc   r<   r=   re   r;   r   r   r   r   r   r   r   )rh   r   r   r   r   r   ri   r   r   r   r   r   r"   r   r   )r'   r(   r   r   r   r   r~   rl   r   kvr   r   r   r   r   r   r    ra     s$     
zSelfAttentionBlock._forward)r=   rc   FFFrr   r   r   r[   r    r     s    
 r   c                       r   )r   z
    An attention block that allows spatial positions to attend to context.
    In our case, context is the token-wise text embeddings.
    r=   rc   FTc                    s   t    || _|dkr|| _n|| dks J d| d| || | _|| _t|| _t|| _|| _t	d||d| _
t	d||d d| _|rVtsOJ d|rUJ dn|r_t| j| _nt| j| _tt	d||d| _d S r   )rA   rB   rC   r   rH   r   r   r   r   r   r   kvr   r   r   r   r   r   r   r[   r   r    rB     s*   




zCrossAttentionBlock.__init__c                 C   r   r7   r_   r   r   r   r    r+      r   zCrossAttentionBlock.forwardc                 C   s  |j ^}}}|||d}| | |}t|d}| |}| |}	|	jddd\}
}|
	 }
|	 }| j
r|j d }|j d |
j d }}t|d| jd}|tj}|
dd| }|j d }||| jd}|dd| }||| jd}tj||gdd}tjd|d | |tj|jd	}tj|d tj|jd
}tj|jdddd|dd < t||||||d}t|d|| jd}n'||j d d|j d }|d|j d d}|tj}| ||
||\}}| |}|| j||g|R  S )Nrc   r   r;   r=   re   r   r   r   r   r   r   r   r   )rh   r   r   r   r   r   r   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r   r   r   )r'   r(   r   r   r   r   r   r   r   r   r   r   r   r   r   
k_unpaddedr   
v_unpaddedkv_unpaddedr   r   r   rl   r~   r   r   r    ra     sF   




zCrossAttentionBlock._forwardr   rr   r   r   r[   r    r     s    	'r   c                       s$   e Zd Z fddZdd Z  ZS )GEGLUc                    s    t    t||d | _d S )Nr;   )rA   rB   rJ   ru   proj)r'   dim_indim_outr[   r   r    rB   4  s   
zGEGLU.__init__c                 C   s&   |  |jddd\}}|t| S )Nr;   rc   re   )r   ri   Fgelu)r'   r(   gater   r   r    r+   8  s   zGEGLU.forwardr   r   r   r[   r    r   3  s    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	FeedForwardr   Fr   c                    sh   t    t|| }|stt||t nt||}t|| _	t|t
|t||| _d S r7   )rA   rB   intrJ   rK   ru   GELUr   r   r   rS   net)r'   rf   multglurE   	inner_dim
project_inr[   r   r    rB   >  s
   
&
"zFeedForward.__init__c                 C   s^   |j ^}}}|||d}| |}t|d}| |}t|d}|| j||g|R  S )Nrc   r   )rh   r   r   r   r   r  )r'   r(   r   r   r   rl   r   r   r    r+   F  s   

zFeedForward.forward)r   Fr   r   r   r   r[   r    r   =  s    r   c                       r   )r   zi
    An attention block that stacks self-attention and cross-attention layers
    in a single block.
    r=   rc   FTc              	      s~   t    td||d| _t|| _|| _t||||||d| _t	|||||||d| _
t|dd| _ttd||d| _d S )Nr;   r=   )rC   r   r   rH   r   r   )rC   r   r   r   rH   r   r   T)rf   r  )rA   rB   r   proj_inr   r   rH   r   self_attention_blockr   cross_attention_blockr   ffr   r   r   r[   r   r    rB   ^  s.   


	
z#StackedCrossAttentionBlock.__init__c                 C   r   r7   r_   r   r   r   r    r+     r   z"StackedCrossAttentionBlock.forwardc                 C   sH   |  |}| |}| |}| |||}| |}| |}|| S r7   )r   r  r  r	  r
  r   )r'   r(   r   r   rl   r   r   r    ra     s   




z#StackedCrossAttentionBlock._forwardr   rr   r   r   r[   r    r   X  s    	%r   )7r1   rw   abcr   torchr   torch.nnrJ   torch.nn.functional
functionalr   torch.utils.checkpointutilsr`   einopsr   ;nemo.collections.multimodal.modules.imagen.diffusionmodulesr   USE_ALTInemo.collections.multimodal.modules.imagen.diffusionmodules.attention_altr   r   r   r   Enemo.collections.multimodal.modules.imagen.diffusionmodules.attentionBnemo.collections.multimodal.modules.imagen.diffusionmodules.layersr	   r
   r   r   r   r   r   r!   
flash_attnr"   r#   r   ImportErrorModuler%   r3   rK   r6   r:   rt   r{   r   r   r   r   r   r   r   r   r   r   r   r    <module>   sF   $
vTV98 L`
