o
    }oi                     @   sB   d dl mZ d dlmZ d dlmZ dd ZG dd dejZdS )    )TupleN)
functionalc                 C   s   |   D ]}tj| q| S )z
    Initializes all parameters of the given module to zero.

    Args:
        module (nn.Module): The module whose parameters will be initialized to zero.

    Returns:
        nn.Module: The same module with zero-initialized parameters.
    )
parametersnninitzeros_)modulep r
   l/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/models/flux_controlnet/layers.pyzero_module   s   
r   c                	       sD   e Zd ZdZ		ddededeedf f fdd	Zd
d Z  ZS )ControlNetConditioningEmbeddingu  
    Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
    [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
    training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
    convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
    (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
    model) to encode image-space conditions ... into feature maps ..."
              `      conditioning_embedding_channelsconditioning_channelsblock_out_channels.c              
      s   t    tj||d ddd| _tg | _tt|d D ]'}|| }||d  }| j	tj||ddd | j	tj||dddd qt
tj|d |ddd| _dS )	a  
        Initializes the model with convolutional layers for processing conditioning inputs.

        Args:
            conditioning_embedding_channels (int):
                Number of output channels for the conditioning embedding.
            conditioning_channels (int):
                Number of input channels for the conditioning data. Default is 3.
            block_out_channels (Tuple[int, ...]):
                Tuple specifying the output channels for each block. Default is (16, 32, 96, 256).
        r   r      )kernel_sizepadding   )r   r   strideN)super__init__r   Conv2dconv_in
ModuleListblocksrangelenappendr   conv_out)selfr   r   r   i
channel_inchannel_out	__class__r
   r   r   .   s   

z(ControlNetConditioningEmbedding.__init__c                 C   s@   |  |}t|}| jD ]}||}t|}q| |}|S )a0  
        Passes the conditioning input through the model to produce an embedding.

        Args:
            conditioning (torch.Tensor): Input tensor representing conditioning data.

        Returns:
            torch.Tensor: The resulting embedding tensor after processing through the network.
        )r    Fsilur"   r&   )r'   conditioning	embeddingblockr
   r
   r   forwardO   s   




z'ControlNetConditioningEmbedding.forward)r   r   )	__name__
__module____qualname____doc__intr   r   r2   __classcell__r
   r
   r+   r   r   $   s    
!r   )	typingr   torch.nnr   r   r-   r   Moduler   r
   r
   r
   r   <module>   s
   