o
    }oi ,                     @   s   d dl Z d dlmZ d dl mZmZ z
d dlmZ dZW n ey+   e	d dZY nw dd
dZ
dd ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZdddZdS )    N)	rearrange)Tensornn	GroupNormTz2Fused optimized group norm has not been installed.F     c                 C   s   t || dd|dS )av  Creates a group normalization layer with specified activation.

    Args:
        in_channels (int): Number of channels in the input.
        num_groups (int, optional): Number of groups for GroupNorm. Defaults to 32.
        act (str, optional): Activation function name. Defaults to "".

    Returns:
        GroupNorm: A normalization layer with optional activation.
    gư>T)
num_groupsnum_channelsepsaffineactr   )in_channelsr	   r    r   Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/vae/blocks.py	Normalize   s   r   c                 C   s   | t |  S )zNonlinearity function used in temporal embedding projection.

    Currently implemented as a SiLU (Swish) function.

    Args:
        x (Tensor): Input tensor.

    Returns:
        Tensor: Output after applying SiLU activation.
    )torchsigmoid)xr   r   r   nonlinearity+   s   r   c                       s*   e Zd ZdZd
 fdd	Zdd	 Z  ZS )ResnetBlocka  A ResNet-style block that can optionally apply a temporal embedding and shortcut projections.

    This block consists of two convolutional layers, normalization, and optional temporal embedding.
    It can adjust channel dimensions between input and output via shortcuts.
    NF        r   c                    s   t    || _|du r|n|}|| _|| _t|dd| _tjj	||dddd| _
|dkr5tj||| _t|dd| _tj|| _tjj	||dddd| _| j| jkrt| jrftjj	||dddd| _dS tjj	||dddd| _dS dS )a  
        Args:
            in_channels (int): Number of input channels.
            out_channels (int, optional): Number of output channels. Defaults to in_channels.
            conv_shortcut (bool, optional): Whether to use a convolutional shortcut. Defaults to False.
            dropout (float, optional): Dropout probability. Defaults to 0.0.
            temb_channels (int, optional): Number of channels in temporal embedding. Defaults to 0.
        Nsilur         kernel_sizestridepaddingr   )super__init__r   out_channelsuse_conv_shortcutr   norm1r   r   Conv2dconv1Linear	temb_projnorm2Dropoutdropoutconv2conv_shortcutnin_shortcut)selfr   r"   r-   r+   temb_channels	__class__r   r   r!   @   s"   
	zResnetBlock.__init__c                 C   s   |}|  |}| |}|dur#|| t|ddddddf  }| |}| |}| |}| j| jkrI| j	rD| 
|}|| S | |}|| S )a&  Forward pass of the ResnetBlock.

        Args:
            x (Tensor): Input feature map of shape (B, C, H, W).
            temb (Tensor): Temporal embedding tensor of shape (B, temb_channels).

        Returns:
            Tensor: Output feature map of shape (B, out_channels, H, W).
        N)r$   r&   r(   r   r)   r+   r,   r   r"   r#   r-   r.   )r/   r   tembhr   r   r   forward\   s   


&




zResnetBlock.forward)NFr   r   __name__
__module____qualname____doc__r!   r5   __classcell__r   r   r1   r   r   9   s    r   c                       (   e Zd ZdZ fddZdd Z  ZS )UpsamplezUpsampling block that increases spatial resolution by a factor of 2.

    Can optionally include a convolution after upsampling.
    c                    s6   t    || _| jrtjj||dddd| _dS dS )z
        Args:
            in_channels (int): Number of input channels.
            with_conv (bool): If True, apply a convolution after upsampling.
        r   r   r   Nr    r!   	with_convr   r   r%   convr/   r   r?   r1   r   r   r!      s
   
zUpsample.__init__c                 C   sX   |j }|tjkr|tj}tjjj|ddd}|tjkr"||}| jr*| 	|}|S )zForward pass of the Upsample block.

        Args:
            x (Tensor): Input feature map (B, C, H, W).

        Returns:
            Tensor: Upsampled feature map (B, C, 2H, 2W).
        g       @nearest)scale_factormode)
dtyper   bfloat16tofloat32r   
functionalinterpolater?   r@   )r/   r   rE   r   r   r   r5      s   




zUpsample.forwardr6   r   r   r1   r   r=   z   s    r=   c                       r<   )
DownsamplezDownsampling block that reduces spatial resolution by a factor of 2.

    Can optionally include a convolution before downsampling.
    c                    s6   t    || _| jrtjj||dddd| _dS dS )z
        Args:
            in_channels (int): Number of input channels.
            with_conv (bool): If True, apply a convolution before downsampling.
        r      r   r   Nr>   rA   r1   r   r   r!      s
   
zDownsample.__init__c                 C   sF   | j rd}tjjj||ddd}| |}|S tjjj|ddd}|S )zForward pass of the Downsample block.

        Args:
            x (Tensor): Input feature map (B, C, H, W).

        Returns:
            Tensor: Downsampled feature map (B, C, H/2, W/2).
        )r   r   r   r   constantr   )rD   valuerL   )r   r   )r?   r   r   rI   padr@   
avg_pool2d)r/   r   rO   r   r   r   r5      s   	
zDownsample.forwardr6   r   r   r1   r   rK      s    rK   c                       sJ   e Zd ZdZdef fddZdedefddZd	edefd
dZ  Z	S )	AttnBlockzSelf-attention block that applies scaled dot-product attention to feature maps.

    Normalizes input, computes queries, keys, and values, then applies attention and a projection.
    r   c                    sj   t    || _t|dd| _tj||dd| _tj||dd| _tj||dd| _	tj||dd| _
dS )W
        Args:
            in_channels (int): Number of input/output channels.
        r   r   r   )r   N)r    r!   r   r   normr   r%   qkvproj_outr/   r   r1   r   r   r!      s   
zAttnBlock.__init__h_returnc           	      C   s   |  |}| |}| |}| |}|j\}}}}t|d }t|d }t|d }tj	|||}t|d||||dS )zCompute the attention over the input feature maps.

        Args:
            h_ (Tensor): Normalized input feature map (B, C, H, W).

        Returns:
            Tensor: Output after applying scaled dot-product attention (B, C, H, W).
        zb c h w -> b 1 (h w) czb 1 (h w) c -> b c h w)r4   wcb)
rS   rT   rU   rV   shaper   
contiguousr   rI   scaled_dot_product_attention)	r/   rY   rT   rU   rV   r]   r\   r4   r[   r   r   r   	attention   s   
	


zAttnBlock.attentionr   c                 C   s   ||  | | S )zForward pass of the AttnBlock.

        Args:
            x (Tensor): Input feature map (B, C, H, W).

        Returns:
            Tensor: Output feature map after self-attention (B, C, H, W).
        )rW   ra   )r/   r   r   r   r   r5      s   	zAttnBlock.forward)
r7   r8   r9   r:   intr!   r   ra   r5   r;   r   r   r1   r   rQ      s
    rQ   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	LinearAttentionzLinear Attention block for efficient attention computations.

    Uses linear attention mechanisms to reduce complexity and memory usage.
       r   c                    sD   t    || _|| }tj||d ddd| _t||d| _dS )z
        Args:
            dim (int): Input channel dimension.
            heads (int, optional): Number of attention heads. Defaults to 4.
            dim_head (int, optional): Dimension per attention head. Defaults to 32.
        r   r   F)biasN)r    r!   headsr   r%   to_qkvto_out)r/   dimrf   dim_head
hidden_dimr1   r   r   r!     s
   
zLinearAttention.__init__c                 C   sv   |j \}}}}| |}t|d| jdd\}}}	|jdd}td||	}
td|
|}t|d| j||d	}| |S )
zForward pass of the LinearAttention block.

        Args:
            x (Tensor): Input feature map (B, C, H, W).

        Returns:
            Tensor: Output feature map after linear attention (B, C, H, W).
        z*b (qkv heads c) h w -> qkv b heads c (h w)r   )rf   qkv)ri   zbhdn,bhen->bhdezbhde,bhdn->bhenz"b heads c (h w) -> b (heads c) h w)rf   r4   r[   )r^   rg   r   rf   softmaxr   einsumrh   )r/   r   r]   r\   r4   r[   rl   rT   rU   rV   contextoutr   r   r   r5     s   	

zLinearAttention.forward)rd   r   r6   r   r   r1   r   rc      s    rc   c                       s    e Zd ZdZ fddZ  ZS )LinAttnBlockzcWrapper class to provide a linear attention block in a form compatible with other attention blocks.c                    s   t  j|d|d dS )rR   r   )ri   rf   rj   N)r    r!   rX   r1   r   r   r!   $  s   zLinAttnBlock.__init__)r7   r8   r9   r:   r!   r;   r   r   r1   r   rr   !  s    rr   vanillac                 C   sX   |dv sJ d| dt d| d|  d |dkrt| S |dkr(t| S t| S )	ao  Factory function to create an attention block.

    Args:
        in_channels (int): Number of input/output channels.
        attn_type (str, optional): Type of attention block to create. Options: "vanilla", "linear", "none".
                                   Defaults to "vanilla".

    Returns:
        nn.Module: An instance of the requested attention block.
    )rs   linearnonez
attn_type z unknownzmaking attention of type 'z' with z in_channelsrs   ru   )printrQ   r   Identityrr   )r   	attn_typer   r   r   	make_attn,  s   
ry   )r   r   )rs   )r   einopsr   r   r   apex.contrib.group_normr   OPT_GROUP_NORM	Exceptionrv   r   r   Moduler   r=   rK   rQ   rc   rr   ry   r   r   r   r   <module>   s&   
A&$7&