o
    ̳i&                     @   sF  d dl mZmZ d dlZd dlm  mZ d dlmZmZ d dl	m
Z
 G dd dejZG dd dejZG d	d
 d
ejZdedejfddZdededejfddZG dd dejZG dd dejZdedededejfddZG dd dejZG dd dejZG dd dejZG d d! d!ejZd"edefd#d$ZdS )%    )ListTupleN)nnTensor)MultiHeadAttentionc                       sr   e Zd ZdZdeeeef dejdejf fddZde	de	fd	d
Z
de	de	fddZde	de	fddZ  ZS )FluxAutoencodera  
    The image autoencoder for Flux diffusion models.

    Args:
        img_shape (Tuple[int, int, int]): The shape of the input image (without the batch dimension).
        encoder (nn.Module): The encoder module.
        decoder (nn.Module): The decoder module.
    	img_shapeencoderdecoderc                    s    t    || _|| _|| _d S N)super__init__
_img_shaper	   r
   )selfr   r	   r
   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/flux/_autoencoder.pyr      s   

zFluxAutoencoder.__init__xreturnc                 C      |  | |S )z
        Args:
            x (Tensor): input image of shape [bsz, ch_in, img resolution, img resolution]

        Returns:
            Tensor: output image of the same shape
        )decodeencoder   r   r   r   r   forward&   s   zFluxAutoencoder.forwardc                 C   s"   |j dd | jksJ | |S )a   
        Encode images into their latent representations.

        Args:
            x (Tensor): input images (shape = [bsz, ch_in, img resolution, img resolution])

        Returns:
            Tensor: latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])
           N)shaper   r	   r   r   r   r   r   0   s   

zFluxAutoencoder.encodezc                 C   s
   |  |S )a  
        Decode latent representations into images.

        Args:
            z (Tensor): latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])

        Returns:
            Tensor: output images (shape = [bsz, ch_in, img resolution, img resolution])
        )r
   )r   r   r   r   r   r   =   s   

zFluxAutoencoder.decode)__name__
__module____qualname____doc__r   intr   Moduler   r   r   r   r   __classcell__r   r   r   r   r      s    	
r   c                       P   e Zd ZdZdededee dededef fdd	Zd
edefddZ	  Z
S )FluxEncodera	  
    The encoder half of the Flux diffusion model's image autoencoder.

    Args:
        ch_in (int): The number of channels of the input image.
        ch_z (int): The number of latent channels (dimension of the latent vector `z`).
        channels (List[int]): The number of output channels for each downsample block.
        n_layers_per_down_block (int): Number of resnet layers per upsample block.
        scale_factor (float): Constant for scaling `z`.
        shift_factor (float): Constant for shifting `z`.
    ch_inch_zchannelsn_layers_per_down_blockscale_factorshift_factorc                    sz   t    || _|| _tj| d dddd| _t fddtt	 D | _
t d | _t d d| | _d S )	Nr      r   kernel_sizestridepaddingc              	      D   g | ]}t |d kr |d  n d   | |t d k dqS )r   r   )n_layersr'   ch_out
downsample)	DownBlocklen.0ir)   r*   r   r   
<listcomp>g       z(FluxEncoder.__init__.<locals>.<listcomp>   )r   r   r+   r,   r   Conv2dconv_in
ModuleListranger7   down	mid_blockmid	end_blockend)r   r'   r(   r)   r*   r+   r,   r   r;   r   r   W   s   
	
zFluxEncoder.__init__r   r   c                 C   sJ   |  |}| jD ]}||}q| |}| |}t|}| j|| j  S )z
        Args:
            x (Tensor): input images (shape = [bsz, ch_in, img resolution, img resolution])

        Returns:
            Tensor: latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])
        )rA   rD   rF   rH   diagonal_gaussianr+   r,   )r   r   hblockr   r   r   r   r   v   s   




zFluxEncoder.forwardr   r   r    r!   r"   r   floatr   r   r   r$   r   r   r   r   r&   J        r&   c                       r%   )FluxDecodera  
    The encoder half of the Flux diffusion model's image autoencoder.

    Args:
        ch_out (int): The number of channels of the output image.
        ch_z (int): The number of latent channels (dimension of the latent vector `z`).
        channels (List[int]): The number of output channels for each upsample block.
        n_layers_per_up_block (int): Number of resnet layers per upsample block.
        scale_factor (float): Constant for scaling `z`.
        shift_factor (float): Constant for shifting `z`.
    r4   r(   r)   n_layers_per_up_blockr+   r,   c                    sv   t    || _|| _tj| d dddd| _t d | _t	 fddt
t D | _t d || _d S )Nr   r-   r   r.   c              	      r2   )r   r   )r3   r'   r4   upsample)UpBlockr7   r8   r)   rP   r   r   r<      r=   z(FluxDecoder.__init__.<locals>.<listcomp>r>   )r   r   r+   r,   r   r@   rA   rE   rF   rB   rC   r7   uprG   rH   )r   r4   r(   r)   rP   r+   r,   r   rS   r   r      s   
	
zFluxDecoder.__init__r   r   c                 C   sF   || j  | j }| |}| |}| jD ]}||}q| |}|S )z
        Args:
            z (Tensor): latent encodings (shape = [bsz, ch_z, latent resolution, latent resolution])

        Returns:
            Tensor: output images (shape = [bsz, ch_in, img resolution, img resolution])
        )r+   r,   rA   rF   rT   rH   )r   r   rJ   rK   r   r   r   r   r      s   




zFluxDecoder.forwardrL   r   r   r   r   rO      rN   rO   chr   c                 C   s"   t t| | dt| t| | dS )Nr'   r4   )r   
SequentialResnetLayer	AttnLayer)rU   r   r   r   rE      s
   

rE   r'   r4   c                 C   s0   t t jd| dddt  t j| |ddddS )N    ư>T
num_groupsnum_channelsepsaffiner-   r   r.   )r   rW   	GroupNormSiLUr@   rV   r   r   r   rG      s
   rG   c                       @   e Zd Zdedededef fddZdedefd	d
Z  ZS )r6   r3   r'   r4   r5   c                    8   t    t|||| _|rt|| _d S t | _d S r   )r   r   resnet_layerslayers
Downsampler   Identityr5   )r   r3   r'   r4   r5   r   r   r   r         
 zDownBlock.__init__r   r   c                 C   r   r   )r5   rf   r   r   r   r   r         zDownBlock.forward	r   r   r    r"   boolr   r   r   r$   r   r   r   r   r6          r6   c                       rc   )rR   r3   r'   r4   rQ   c                    rd   r   )r   r   re   rf   Upsampler   rh   rQ   )r   r3   r'   r4   rQ   r   r   r   r      ri   zUpBlock.__init__r   r   c                 C   r   r   )rQ   rf   r   r   r   r   r      rj   zUpBlock.forwardrk   r   r   r   r   rR      rm   rR   nc                    s   t j fddt| D  S )Nc                    s$   g | ]}t |d kr ndqS )r   rV   )rX   r8   rV   r   r   r<      s    z!resnet_layers.<locals>.<listcomp>)r   rW   rC   )ro   r'   r4   r   rV   r   re      s
   re   c                       4   e Zd Zdef fddZdedefddZ  ZS )rY   dimc                    sd   t    || _tjd|ddd| _t|dd|t||t||t||t||dd	| _d S )NrZ   r[   Tr\   r   F)		embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_proj	is_causal)	r   r   rq   r   ra   normr   Linearattn)r   rq   r   r   r   r      s   




zAttnLayer.__init__r   r   c                 C   sj   |j \}}}}|}| |}td|}|||| |}| ||}|||||}td|}|| S )Nzbchw -> bhwczbhwc -> bchw)r   r{   torcheinsumreshaper}   )r   r   bcrJ   wresidualr   r   r   r     s   
zAttnLayer.forwardr   r   r    r"   r   r   r   r$   r   r   r   r   rY      s    rY   c                       s8   e Zd Zdedef fddZdedefddZ  ZS )	rX   r'   r4   c                    s   t    tjtjd|dddt tj||ddddtjd|dddt tj||ddddg | _||kr>t | _	d S tj||dddd| _	d S )	NrZ   r[   Tr\   r-   r   r.   r   )
r   r   r   rW   ra   rb   r@   mainrh   shortcut)r   r'   r4   r   r   r   r     s   
zResnetLayer.__init__r   r   c                 C   s   |  || | S r   )r   r   r   r   r   r   r   (  s   zResnetLayer.forwardr   r   r   r   r   rX     s    rX   c                       rp   )rg   rU   c                    s$   t    tj||dddd| _d S )Nr-   r?   r   r.   r   r   r   r@   convr   rU   r   r   r   r   -     
zDownsample.__init__r   r   c                 C   s   |  tj|ddddS )N)r   r   r   r   constantr   )modevalue)r   Fpadr   r   r   r   r   1  s   zDownsample.forwardr   r   r   r   r   rg   ,      rg   c                       rp   )rn   rU   c                    s$   t    tj||dddd| _d S )Nr-   r   r.   r   r   r   r   r   r   6  r   zUpsample.__init__r   r   c                 C   s   |  tj|dddS )Ng       @nearest)r+   r   )r   r   interpolater   r   r   r   r   :  s   zUpsample.forwardr   r   r   r   r   rn   5  r   rn   r   c                 C   s4   t j| ddd\}}t d| }||t |  S )Nr?   r   )rq   g      ?)r~   chunkexp
randn_like)r   meanlogvarstdr   r   r   rI   >  s   rI   )typingr   r   r~   torch.nn.functionalr   
functionalr   r   torchtune.modules.attentionr   r#   r   r&   rO   r"   rE   rG   r6   rR   re   rY   rX   rg   rn   rI   r   r   r   r   <module>   s$   9==

	$		