o
    ۷i                     @   s~  d dl Zd dlZd dlmZ d dlm  mZ d dlZddl	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ d
dlmZmZmZ eeZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd dejZ&G dd dejZ'G d d! d!ejZ(G d"d# d#eee
Z)dS )$    N   )ConfigMixinregister_to_config)logging)apply_forward_hook   )get_activation)AutoencoderKLOutput)
ModelMixin   )AutoencoderMixinDecoderOutputDiagonalGaussianDistributionc                       s   e Zd Z						ddededeeeeef B d	eeeeef B d
eeeeef B deeeeef B dededdf fddZdej	dej	fddZ
  ZS )HunyuanVideo15CausalConv3dr   r   r   T	replicatein_channelsout_channelskernel_sizestridepaddingdilationbiaspad_modereturnNc	           	   	      s   t    t|tr|||fn|}|| _|d d |d d |d d |d d |d d df| _tj|||||||d| _d S )Nr   r   r   )r   )	super__init__
isinstanceintr   time_causal_paddingnnConv3dconv)	selfr   r   r   r   r   r   r   r   	__class__ q/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.pyr   #   s   





	z#HunyuanVideo15CausalConv3d.__init__hidden_statesc                 C   s   t j|| j| jd}| |S )N)mode)Fpadr   r   r!   )r"   r'   r%   r%   r&   forward>   s   
z"HunyuanVideo15CausalConv3d.forward)r   r   r   r   Tr   )__name__
__module____qualname__r   tupleboolstrr   torchTensorr+   __classcell__r%   r%   r#   r&   r   "   s6    	
r   c                       s@   e Zd ZdZddededededd	f
 fd
dZdd Z  ZS )HunyuanVideo15RMS_norma  
    A custom RMS normalization layer.

    Args:
        dim (int): The number of dimensions to normalize over.
        channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
            Default is True.
        images (bool, optional): Whether the input represents image data. Default is True.
        bias (bool, optional): Whether to include a learnable bias term. Default is False.
    TFdimchannel_firstimagesr   r   Nc                    sr   t    |s	dnd}|r|g|R n|f}|| _|d | _tt|| _|r4tt	|| _
d S d| _
d S )N)r   r   r   )r   r   g      ?g        )r   r   r7   scaler   	Parameterr2   onesgammazerosr   )r"   r6   r7   r8   r   broadcastable_dimsshaper#   r%   r&   r   O   s   

$zHunyuanVideo15RMS_norm.__init__c                 C   s*   t j|| jrdndd| j | j | j S )Nr   r6   )r)   	normalizer7   r9   r<   r   )r"   xr%   r%   r&   r+   Y   s   *zHunyuanVideo15RMS_norm.forward)TTF)	r,   r-   r.   __doc__r   r0   r   r+   r4   r%   r%   r#   r&   r5   C   s    $
r5   c                       sT   e Zd Zdef fddZeddededefdd	Zd
ejdejfddZ	  Z
S )HunyuanVideo15AttnBlockr   c                    sj   t    || _t|dd| _tj||dd| _tj||dd| _tj||dd| _	tj||dd| _
d S )NFr8   r   r   )r   r   r   r5   normr   r    to_qto_kto_vproj_out)r"   r   r#   r%   r&   r   ^   s   
z HunyuanVideo15AttnBlock.__init__Nn_framen_hw
batch_sizec           	      C   sp   | | }t j||ftd||d}t|D ]}|| }d||d|d | f< q|dur6|d|dd}|S )a  Prepare a causal attention mask for 3D videos.

        Args:
            n_frame (int): Number of frames (temporal length).
            n_hw (int): Product of height and width.
            dtype: Desired mask dtype.
            device: Device for the mask.
            batch_size (int, optional): If set, expands for batch.

        Returns:
            torch.Tensor: Causal attention mask.
        z-inf)dtypedevicer   Nr   r@   )r2   fullfloatrange	unsqueezeexpand)	rM   rN   rP   rQ   rO   seq_lenmaskii_framer%   r%   r&   prepare_causal_attention_maski   s   z5HunyuanVideo15AttnBlock.prepare_causal_attention_maskrC   r   c                 C   s"  |}|  |}| |}| |}| |}|j\}}}}	}
|||||	 |
 dddd }|||||	 |
 dddd }|||||	 |
 dddd }| j	||	|
 |j
|j|d}tjj||||d}|d|||	|
|ddddd}| |}|| S )Nr   r   r   )rO   )	attn_mask   r   )rH   rI   rJ   rK   r?   reshapepermuterU   
contiguousr[   rP   rQ   r   
functionalscaled_dot_product_attentionsqueezerL   )r"   rC   identityquerykeyvaluerO   channelsframesheightwidthattention_maskr%   r%   r&   r+      s    



***&
zHunyuanVideo15AttnBlock.forwardN)r,   r-   r.   r   r   staticmethodr[   r2   r3   r+   r4   r%   r%   r#   r&   rE   ]   s
    rE   c                       J   e Zd Zddededef fddZedd	d
Zdej	fddZ
  ZS )HunyuanVideo15UpsampleTr   r   add_temporal_upsamplec                    sB   t    |r	dnd}t||| dd| _|| _|| | | _d S N   r]   r   rG   )r   r   r   r!   rq   repeats)r"   r   r   rq   factorr#   r%   r&   r      
   
zHunyuanVideo15Upsample.__init__r   r   c              
   C   sr   | j \}}}}}|| | }	||	 }
| |||||
|||} | dddddddd} | ||
|| || || S )	a  
        Convert (b, r1*r2*r3*c, f, h, w) -> (b, c, r1*f, r2*h, r3*w)

        Args:
            tensor: Input tensor of shape (b, r1*r2*r3*c, f, h, w)
            r1: temporal upsampling factor
            r2: height upsampling factor
            r3: width upsampling factor
        r   r]      r      r      r   r?   viewr_   r^   )tensorr1r2r3bpacked_cfhwru   cr%   r%   r&   _dcae_upsample_rearrange   s   z/HunyuanVideo15Upsample._dcae_upsample_rearrangerC   c           	      C   s  | j rdnd}| |}| j r|d d d d d dd d d d f }| j|dddd}|d d d |jd d f }|d d d d dd d d d d f }| j||ddd}tj||gdd}|d d d d d dd d d d f }| j|dddd}|j| jd dd}|d d d d dd d d d d f }| j||ddd}|j| jdd}tj||gdd}|| S | j||ddd}|j| jdd}| j||ddd}|| S )Nr   r   r}   r~   r   rA   rt   r6   )rq   r!   r   r?   r2   catrepeat_interleavert   )	r"   rC   r}   r   h_firsth_nextx_firstx_nextshortcutr%   r%   r&   r+      s*   
&&&&zHunyuanVideo15Upsample.forwardTr   r   r   )r,   r-   r.   r   r0   r   rn   r   r2   r3   r+   r4   r%   r%   r#   r&   rp      s
    rp   c                       ro   )HunyuanVideo15DownsampleTr   r   add_temporal_downsamplec                    sB   t    |r	dnd}t||| dd| _|| _|| | | _d S rr   )r   r   r   r!   r   
group_size)r"   r   r   r   ru   r#   r%   r&   r      rv   z!HunyuanVideo15Downsample.__init__r   r   c              
   C   sz   | j \}}}}}|| || || }	}
}| |||	||
|||} | dddddddd} | ||| | | |	|
|S )	z
        Convert (b, c, r1*f, r2*h, r3*w) -> (b, r1*r2*r3*c, f, h, w)

        This packs spatial/temporal dimensions into channels (opposite of upsample)
        r   r   rw   ry   r   r   r]   rx   rz   )r|   r}   r~   r   r   r   packed_fpacked_hpacked_wr   r   r   r%   r%   r&   _dcae_downsample_rearrange   s
   z3HunyuanVideo15Downsample._dcae_downsample_rearrangerC   c                 C   s  | j rdnd}| |}| j r|d d d d d dd d d d f }| j|dddd}tj||gdd}|d d d d dd d d d d f }| j||ddd}tj||gdd}|d d d d d dd d d d f }| j|dddd}|j\}}}	}
}|||jd | jd |	|
|jdd}|d d d d dd d d d d f }| j||ddd}|j\}}}	}
}|||jd | j|	|
|jdd}tj||gdd}|| S | j||ddd}| j||ddd}|j\}}}	}
}|||jd | j|	|
|jdd}|| S )Nr   r   r   rA   )	r   r!   r   r2   r   r?   r{   r   mean)r"   rC   r}   r   r   r   r   BCTHWr   r   r%   r%   r&   r+      s0   
&&&(&$$z HunyuanVideo15Downsample.forwardr   r   )r,   r-   r.   r   r0   r   rn   r   r2   r3   r+   r4   r%   r%   r#   r&   r      s
    r   c                	       sN   e Zd Z		ddededB deddf fddZd	ejdejfd
dZ  Z	S )HunyuanVideo15ResnetBlockNswishr   r   non_linearityr   c                    s   t    |p|}t|| _t|dd| _t||dd| _t|dd| _t||dd| _	d | _
||kr@tj||dddd| _
d S d S )NFrF   r   rG   r   r   )r   r   r   )r   r   r   nonlinearityr5   norm1r   conv1norm2conv2conv_shortcutr   r    )r"   r   r   r   r#   r%   r&   r     s   

z"HunyuanVideo15ResnetBlock.__init__r'   c                 C   s\   |}|  |}| |}| |}| |}| |}| |}| jd ur*| |}|| S rm   )r   r   r   r   r   r   )r"   r'   residualr%   r%   r&   r+      s   







z!HunyuanVideo15ResnetBlock.forward)Nr   )
r,   r-   r.   r   r1   r   r2   r3   r+   r4   r%   r%   r#   r&   r   
  s    r   c                	       sJ   e Zd Z		ddedededdf fdd	Zd
ejdejfddZ  Z	S )HunyuanVideo15MidBlockr   Tr   
num_layersadd_attentionr   Nc                    s   t    || _t||dg}g }t|D ]}| jr"|t| n|d  |t||d qt|| _	t|| _
d| _d S )Nr   r   F)r   r   r   r   rT   appendrE   r   
ModuleList
attentionsresnetsgradient_checkpointing)r"   r   r   r   r   r   _r#   r%   r&   r   2  s*   


zHunyuanVideo15MidBlock.__init__r'   c                 C   sJ   | j d |}t| j| j dd  D ]\}}|d ur||}||}q|S )Nr   r   )r   zipr   )r"   r'   attnresnetr%   r%   r&   r+   V  s   
zHunyuanVideo15MidBlock.forward)r   T
r,   r-   r.   r   r0   r   r2   r3   r+   r4   r%   r%   r#   r&   r   1  s    $r   c                       sX   e Zd Z			ddededededB ded	df fd
dZdejd	ejfddZ  ZS )HunyuanVideo15DownBlock3Dr   NTr   r   r   downsample_out_channelsr   r   c                    s|   t    g }t|D ]}|dkr|n|}|t||d qt|| _|d ur6tt|||dg| _	nd | _	d| _
d S )Nr   r   )r   r   F)r   r   rT   r   r   r   r   r   r   downsamplersr   )r"   r   r   r   r   r   r   rY   r#   r%   r&   r   b  s,   


z"HunyuanVideo15DownBlock3D.__init__r'   c                 C   s6   | j D ]}||}q| jd ur| jD ]}||}q|S rm   )r   r   )r"   r'   r   downsamplerr%   r%   r&   r+     s   




z!HunyuanVideo15DownBlock3D.forwardr   NT)	r,   r-   r.   r   r   r2   r3   r+   r4   r%   r%   r#   r&   r   a  s$    %r   c                       sX   e Zd Z			ddededededB ded	df fd
dZdejd	ejfddZ  Z	S )HunyuanVideo15UpBlock3Dr   NTr   r   r   upsample_out_channelsrq   r   c           	         s|   t    g }t|D ]}|dkr|n|}|t||d qt|| _|d ur6tt|||dg| _	nd | _	d| _
d S )Nr   r   )r   rq   F)r   r   rT   r   r   r   r   r   rp   
upsamplersr   )	r"   r   r   r   r   rq   r   rY   input_channelsr#   r%   r&   r     s,   


z HunyuanVideo15UpBlock3D.__init__r'   c                 C   s^   t  r| jr| jD ]}| ||}q
n
| jD ]}||}q| jd ur-| jD ]}||}q&|S rm   )r2   is_grad_enabledr   r   _gradient_checkpointing_funcr   )r"   r'   r   	upsamplerr%   r%   r&   r+     s   





zHunyuanVideo15UpBlock3D.forwardr   r   r%   r%   r#   r&   r     s$    &r   c                       sp   e Zd ZdZ							dd	ed
edeedf dededededdf fddZdej	dej	fddZ
  ZS )HunyuanVideo15Encoder3Dz1
    3D vae encoder for HunyuanImageRefiner.
    r   @               r   r   r]      Tr   r   block_out_channels.layers_per_blocktemporal_compression_ratiospatial_compression_ratiodownsample_match_channelr   Nc                    s*  t    || _|| _|d | j | _t||d dd| _d | _t	g | _
|d }tt|D ]?}	|	t|k }
||	 }|
sKt|||d dd}|}n|	t|| k}|r\||	d  n|}t|||||d}|}| j
| q0t|d d| _t|d dd	| _t | _t|d |dd| _d| _d S )
Nr@   r   r   rG   F)r   r   r   r   r   r   r   rF   )r   r   r   r   r   r   conv_in	mid_blockr   r   down_blocksrT   lennplog2r   r   r   r5   norm_outSiLUconv_actconv_outr   )r"   r   r   r   r   r   r   r   input_channelrY   add_spatial_downsampleoutput_channel
down_blockr   r   r#   r%   r&   r     sH   



z HunyuanVideo15Encoder3D.__init__r'   c           	      C   s   |  |}t r | jr | jD ]}| ||}q| | j|}n| jD ]}||}q#| |}|j\}}}}}||d| j	|||j
dd}| |}| |}| |}||7 }|S )Nr@   r   rA   )r   r2   r   r   r   r   r   r?   r{   r   r   r   r   r   )	r"   r'   r   rO   r   framerj   rk   	short_cutr%   r%   r&   r+     s   







zHunyuanVideo15Encoder3D.forward)r   r   r   r   r]   r   Tr,   r-   r.   rD   r   r/   r0   r   r2   r3   r+   r4   r%   r%   r#   r&   r     s6    
	7r   c                       sl   e Zd ZdZ							dd	ed
edeedf dedededef fddZdej	dej	fddZ
  ZS )HunyuanVideo15Decoder3DzR
    Causal decoder for 3D video-like data used for HunyuanImage-1.5 Refiner.
        r   r   r   r   r   r   r   r   r]   Tr   r   r   .r   r   r   upsample_match_channelc                    s8  t    || _|| _|| _|d | j | _t| j|d dd| _t	g | _
t|d d| _|d }tt|D ]E}	||	 }
|	t|k }|	t|k }|sQ|rj|rY||	d  n|
}t| jd ||
||d}|}nt| jd ||
d dd}|
}| j
| q9t|d dd	| _t | _t|d |dd| _d| _d S )
Nr   r   rG   r   r   )r   r   r   r   rq   Fr@   rF   )r   r   r   r   r   repeatr   r   r   r   	up_blocksr   r   rT   r   r   r   r   r   r5   r   r   r   r   r   )r"   r   r   r   r   r   r   r   r   rY   r   add_spatial_upsamplerq   r   up_blockr#   r%   r&   r   $  sH   



z HunyuanVideo15Decoder3D.__init__r'   r   c                 C   s   |  ||j| jdd }t r(| jr(| | j|}| jD ]}| ||}qn| |}| jD ]}||}q0| 	|}| 
|}| |}|S )Nr   r   )r   r   r   r2   r   r   r   r   r   r   r   r   )r"   r'   r   r%   r%   r&   r+   ^  s   






zHunyuanVideo15Decoder3D.forward)r   r   r   r   r   r]   Tr   r%   r%   r#   r&   r     s2    
:r   c                       s  e Zd ZdZdZe											d=d
edededee dededededede	ddf fddZ
					d>dedB dedB dedB dedB de	dB ddfddZdejdejfd d!Ze	d?dejd"edeee B fd#d$Zd%ejdejfd&d'Zed?d%ejd"edeejB fd(d)Zd*ejd+ejd,edejfd-d.Zd*ejd+ejd,edejfd/d0Zd*ejd+ejd,edejfd1d2Zdejdejfd3d4Zd%ejdejfd5d6Z	7		d@d8ejd9ed"ed:ejdB deejB f
d;d<Z  ZS )AAutoencoderKLHunyuanVideo15a=  
    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used for
    HunyuanVideo-1.5.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    Tr   r   r   r   r   r]   fk}Ж?r   r   latent_channelsr   r   r   r   r   r   scaling_factorr   Nc              	      s   t    t||d |||||d| _t||tt|||||	d| _|| _|| _	d| _
d| _d| _d| _| j| | _| j| | _d| _d S )Nr   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   Fr   g      ?)r   r   r   encoderr   listreverseddecoderr   r   use_slicing
use_tilingtile_sample_min_heighttile_sample_min_widthtile_latent_min_heighttile_latent_min_widthtile_overlap_factor)r"   r   r   r   r   r   r   r   r   r   r   r#   r%   r&   r   ~  s8   




z$AutoencoderKLHunyuanVideo15.__init__r   r   r   r   r   c                 C   sF   d| _ |p| j| _|p| j| _|p| j| _|p| j| _|p| j| _dS )a  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.

        Args:
            tile_sample_min_height (`int`, *optional*):
                The minimum height required for a sample to be separated into tiles across the height dimension.
            tile_sample_min_width (`int`, *optional*):
                The minimum width required for a sample to be separated into tiles across the width dimension.
            tile_latent_min_height (`int`, *optional*):
                The minimum height required for a latent to be separated into tiles across the height dimension.
            tile_latent_min_width (`int`, *optional*):
                The minimum width required for a latent to be separated into tiles across the width dimension.
        TN)r   r   r   r   r   r   )r"   r   r   r   r   r   r%   r%   r&   enable_tiling  s   z)AutoencoderKLHunyuanVideo15.enable_tilingrC   c                 C   sB   |j \}}}}}| jr|| jks|| jkr| |S | |}|S rm   )r?   r   r   r   tiled_encoder   )r"   rC   r   rj   rk   r%   r%   r&   _encode  s
   

z#AutoencoderKLHunyuanVideo15._encodereturn_dictc                    s^    j r|jd dkr fdd|dD }t|}n |}t|}|s*|fS t|dS )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   c                       g | ]}  |qS r%   )r   ).0x_slicer"   r%   r&   
<listcomp>      z6AutoencoderKLHunyuanVideo15.encode.<locals>.<listcomp>)latent_dist)r   r?   splitr2   r   r   r   r	   )r"   rC   r   encoded_slicesr   	posteriorr%   r   r&   encode  s   

z"AutoencoderKLHunyuanVideo15.encodezc                 C   sB   |j \}}}}}| jr|| jks|| jkr| |S | |}|S rm   )r?   r   r   r   tiled_decoder   )r"   r  r   rj   rk   decr%   r%   r&   _decode  s
   

z#AutoencoderKLHunyuanVideo15._decodec                    sV    j r|jd dkr fdd|dD }t|}n |}|s&|fS t|dS )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   c                    r   r%   )r  )r   z_slicer   r%   r&   r    r  z6AutoencoderKLHunyuanVideo15.decode.<locals>.<listcomp>)sample)r   r?   r  r2   r   r  r   )r"   r  r   decoded_slicesdecodedr%   r   r&   decode  s   

z"AutoencoderKLHunyuanVideo15.decodear   blend_extentc              	   C   s   t |jd |jd |}t|D ]@}|d d d d d d | | d d f d||   |d d d d d d |d d f ||   |d d d d d d |d d f< q|S )Nr   minr?   rT   )r"   r  r   r  yr%   r%   r&   blend_v     R&z#AutoencoderKLHunyuanVideo15.blend_vc                 C   s   t |jd |jd |}t|D ]@}|d d d d d d d d | | f d||   |d d d d d d d d |f ||   |d d d d d d d d |f< q|S )Nr@   r   r  r"   r  r   r  rC   r%   r%   r&   blend_h&  r  z#AutoencoderKLHunyuanVideo15.blend_hc              	   C   s   t |jd |jd |}t|D ]@}|d d d d | | d d d d f d||   |d d d d |d d d d f ||   |d d d d |d d d d f< q|S )Nr   r  r  r%   r%   r&   blend_t.  r  z#AutoencoderKLHunyuanVideo15.blend_tc                 C   s  |j \}}}}}t| jd| j  }t| jd| j  }t| j| j }t| j| j }| j| }	| j| }
g }td||D ]5}g }td||D ]%}|dddddd||| j ||| j f }| |}|	| qH|	| q>g }t
|D ]O\}}g }t
|D ]:\}}|dkr| ||d  | ||}|dkr| ||d  ||}|	|ddddddd|	d|
f  q|	tj|dd qztj|dd}|S )zEncode a batch of images using a tiled encoder.

        Args:
            x (`torch.Tensor`): Input batch of videos.

        Returns:
            `torch.Tensor`:
                The latent representation of the encoded videos.
        r   r   Nr@   rA   r  )r?   r   r   r   r   r   r   rT   r   r   	enumerater  r  r2   r   )r"   rC   r   rj   rk   overlap_heightoverlap_widthblend_heightblend_widthrow_limit_heightrow_limit_widthrowsrY   rowjtileresult_rows
result_rowmomentsr%   r%   r&   r   6  sD   



.z(AutoencoderKLHunyuanVideo15.tiled_encodec                 C   s  |j \}}}}}t| jd| j  }t| jd| j  }t| j| j }t| j| j }| j| }	| j| }
g }td||D ]5}g }td||D ]%}|dddddd||| j ||| j f }| |}|	| qH|	| q>g }t
|D ]O\}}g }t
|D ]:\}}|dkr| ||d  | ||}|dkr| ||d  ||}|	|ddddddd|	d|
f  q|	tj|dd qztj|dd}|S )a  
        Decode a batch of images using a tiled decoder.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   Nr@   rA   r  )r?   r   r   r   r   r   r   rT   r   r   r  r  r  r2   r   )r"   r  r   rj   rk   r  r  r   r!  r"  r#  r$  rY   r%  r&  r'  r  r(  r)  r
  r%   r%   r&   r	  f  sD   


.z(AutoencoderKLHunyuanVideo15.tiled_decodeFr  sample_posterior	generatorc           	      C   s<   |}|  |j}|r|j|d}n| }| j||d}|S )aa  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        )r,  )r   )r  r  r  r(   r  )	r"   r  r+  r   r,  rC   r  r  r
  r%   r%   r&   r+     s   z#AutoencoderKLHunyuanVideo15.forward)
r   r   r   r   r   r   r]   TTr   )NNNNNr   )FTN)r,   r-   r.   rD    _supports_gradient_checkpointingr   r   r/   r0   rS   r   r   r2   r3   r   r   r	   r   r  r  r   r  r  r  r  r   r	  	Generatorr+   r4   r%   r%   r#   r&   r   s  s    	
:
	

"   08r   )*numpyr   r2   torch.nnr   torch.nn.functionalra   r)   torch.utils.checkpointconfiguration_utilsr   r   utilsr   utils.accelerate_utilsr   activationsr   modeling_outputsr	   modeling_utilsr
   vaer   r   r   
get_loggerr,   loggerModuler   r5   rE   rp   r   r   r   r   r   r   r   r   r%   r%   r%   r&   <module>   s2   
!@85'017VT