o
    ۷i                     @   s,  d dl Z d dlZd dlmZ ddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd deeeZ"dd Z#dS )    N   )ConfigMixinregister_to_config)apply_forward_hook   )	AttentionSpatialNorm)DecoderOutputDiagonalGaussianDistribution)Downsample2D)AutoencoderKLOutput)
ModelMixin)ResnetBlock2D)
Upsample2D   )AutoencoderMixinc                       s   e Zd ZdZ						ddededB d	ed
ededededdf fddZede	j
de	j
fddZde	j
dede	j
fddZ  ZS )AllegroTemporalConvLayera
  
    Temporal convolutional layer that can be used for video (sequence of images) input. Code adapted from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
    N            Fr   in_dimout_dimdropoutnorm_num_groups	up_sampledown_samplestridereturnc                    s  t    |p|}t|d d  }}	d}
|| _|| _|r:tt||t tj	||d||fdd||	fd| _
n;|rZtt||t tj	||d d||fd||	fd| _
ntt||t tj	||d||f|
||	fd| _
tt||t t|tj	||d||f|
||	fd| _tt||t t|tj	||d||f|
||fd| _tt||t tj	||d||f|
||fd| _d S )	Nr   g      ?r   r   )r   r   r   )r   paddingr   r   )super__init__intr   r   nn
Sequential	GroupNormSiLUConv3dconv1Dropoutconv2conv3conv4)selfr   r   r   r   r   r   r   pad_hpad_wpad_t	__class__ j/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_kl_allegro.pyr    '   sP   



 




z!AllegroTemporalConvLayer.__init__hidden_statesc                 C   sT   t j| d d d d ddf | fdd} t j| | d d d d dd f fdd} | S )Nr   r   r   )dim)torchcat)r4   r2   r2   r3   _pad_temporal_dim^   s   ((z*AllegroTemporalConvLayer._pad_temporal_dim
batch_sizec                 C   s   | d|dfddddd}| jr!|d d d d d d df }n| jr2|jdd|jd d d}n|}| js:| jr@| |}n
| |}| |}| jr_| dddddddd	dd}| |}| 	|}| |}| 
|}| |}| |}|| }|ddddddd}|S )
Nr   r6   r   r   r      )r5   output_size)r   r6      )	unflattenpermuter   r   repeat_interleaveshaper'   r9   flattenr)   r*   r+   )r,   r4   r:   identityr2   r2   r3   forwardd   s*   

$





z AllegroTemporalConvLayer.forward)Nr   r   FFr   )__name__
__module____qualname____doc__r!   floatboolr    staticmethodr7   Tensorr9   rD   __classcell__r2   r2   r0   r3   r   !   s8    	7"r   c                       s   e Zd Z												dd
ededededededededededededef fddZdej	dej	fddZ
  ZS )AllegroDownBlock3Dr   r   ư>defaultswishr   T      ?Fin_channelsout_channelsr   
num_layers
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factorspatial_downsampletemporal_downsampledownsample_paddingc                    s   t    g }g }t|D ]&}|dkr|n|}|t||d ||||||
|	d
 |t||d|d qt|| _t|| _	|rMt||d|ddd| _
|| _|rbtt|d||dd	g| _d S d | _d S )
Nr   
rS   rT   temb_channelsepsgroupsr   time_embedding_normnon_linearityr[   pre_norm皙?r   r   Tr   )r   r   r   r   op)use_convrT   r   name)r   r    rangeappendr   r   r"   
ModuleListresnets
temp_convstemp_convs_downadd_temp_downsampler   downsamplers)r,   rS   rT   r   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   rn   ro   ir0   r2   r3   r       sT   
	


zAllegroDownBlock3D.__init__r4   r   c                 C      |j d }|ddddddd}t| j| jD ]\}}||d d}|||d}q| jr4| j||d}| jd urC| jD ]}||}q<|	d|dfddddd}|S 	Nr   r   r   r   r;   )temb)r:   r6   )
rA   r?   rB   ziprn   ro   rq   rp   rr   r>   )r,   r4   r:   resnet	temp_convdownsamplerr2   r2   r3   rD         



zAllegroDownBlock3D.forward)r   r   rO   rP   rQ   r   TrR   TFr   rE   rF   rG   r!   rI   strrJ   r    r7   rL   rD   rM   r2   r2   r0   r3   rN      sP    	
BrN   c                       s   e Zd Z												
ddededededededededededededed
B f fddZdej	dej	fddZ
  ZS )AllegroUpBlock3Dr   r   rO   rP   rQ   r   TrR   FNrS   rT   r   rU   rV   rW   rX   rY   rZ   r[   spatial_upsampletemporal_upsampler`   c                    s   t    g }g }t|D ]&}|dkr|n|}|t|||||||||
|	d
 |t||d|d qt|| _t|| _	|| _
|rPt||d|ddd| _|r`tt|d|dg| _d S d | _d S )	Nr   r_   rf   rg   Tr   )r   r   r   r   )ri   rT   )r   r    rk   rl   r   r   r"   rm   rn   ro   add_temp_upsampletemp_conv_upr   
upsamplers)r,   rS   rT   r   rU   rV   rW   rX   rY   rZ   r[   r   r   r`   rn   ro   rs   input_channelsr0   r2   r3   r       sJ   
	
zAllegroUpBlock3D.__init__r4   r   c                 C   rt   ru   )
rA   r?   rB   rw   rn   ro   r   r   r   r>   )r,   r4   r:   rx   ry   	upsamplerr2   r2   r3   rD     r{   zAllegroUpBlock3D.forward)r   r   rO   rP   rQ   r   TrR   TFNr|   r2   r2   r0   r3   r~      sP    	
=r~   c                       sz   e Zd Z										dd	ed
ededededededededededef fddZdej	dej	fddZ
  ZS )AllegroMidBlock3DConvr   r   rO   rP   rQ   r   TrR   rS   r`   r   rU   rV   rW   rX   rY   rZ   add_attentionattention_head_dimr[   c                    s  t    t||||||||||	d
g}t||d|dg}g }|d u r%|}t|D ]F}|
rN|t||| ||||dkr=|nd |dkrD|nd ddddd n|d  |t||||||||||	d
 |t||d|d q)t|| _	t|| _
t|| _d S )Nr_   rf   rg   rP   spatialT)
headsdim_headrescale_output_factorra   r   spatial_norm_dimresidual_connectionbiasupcast_softmax_from_deprecated_attn_block)r   r    r   r   rk   rl   r   r"   rm   rn   ro   
attentions)r,   rS   r`   r   rU   rV   rW   rX   rY   rZ   r   r   r[   rn   ro   r   _r0   r2   r3   r    0  s   

	zAllegroMidBlock3DConv.__init__r4   r   c                 C   s   |j d }|ddddddd}| jd |d d}| jd ||d}t| j| jdd  | jdd  D ]\}}}||}||d d}|||d}q5|d|dfddddd}|S ru   )rA   r?   rB   rn   ro   rw   r   r>   )r,   r4   r:   attnrx   ry   r2   r2   r3   rD     s   
,zAllegroMidBlock3DConv.forward)
r   r   rO   rP   rQ   r   TTr   rR   r|   r2   r2   r0   r3   r   /  sJ    	
]r   c                       s   e Zd Zddddg dddddf	d	ed
edeedf deedf deedf dedededef fddZdej	dej	fddZ
  ZS )AllegroEncoder3Dr   rN   rN   rN   rN            r   TTFFr   r   siluTrS   rT   down_block_types.block_out_channelstemporal_downsample_blockslayers_per_blockr   act_fndouble_zc
                    sP  t    tj||d dddd| _tj|d |d ddd| _tg | _|d }
t	|D ]1\}}|
}|| }
|t
|d k}|dkrUt|||
| || d	d||d
	}ntd| j| q.t|d d	|dd|d |d d| _tj|d |d	d| _t | _|	rd| n|}tj|d |d ddd| _tj|d |ddd| _d| _d S )Nr   r   r   kernel_sizer   r   r   r   r   r   r   r   )rS   rT   r   r   rN   rO   )	rU   rS   rT   r\   r]   rV   r^   rX   rY   zCInvalid `down_block_type` encountered. Must be `AllegroDownBlock3D`r6   rP   rS   rV   rX   r[   rW   r   rY   r`   num_channels
num_groupsra   r   r   F)r   r    r"   Conv2dconv_inr&   temp_conv_inrm   down_blocks	enumeratelenrN   
ValueErrorrl   r   	mid_blockr$   conv_norm_outr%   conv_acttemp_conv_outconv_outgradient_checkpointing)r,   rS   rT   r   r   r   r   r   r   r   output_channelrs   down_block_typeinput_channelis_final_block
down_blockconv_out_channelsr0   r2   r3   r      sd   


zAllegroEncoder3D.__init__sampler   c                 C   s^  |j d }|ddddddd}| |}|d|dfddddd}|}| |}|| }t rL| jrL| j	D ]}| 
||}q;| 
| j|}n| j	D ]}||}qO| |}|ddddddd}| |}| |}|d|dfddddd}|}| |}|| }|ddddddd}| |}|d|dfddddd}|S Nr   r   r   r   r;   r6   )rA   r?   rB   r   r>   r   r7   is_grad_enabledr   r   _gradient_checkpointing_funcr   r   r   r   r   )r,   r   r:   residualr   r2   r2   r3   rD     s2   










zAllegroEncoder3D.forwardrE   rF   rG   r!   tupler}   rJ   r    r7   rL   rD   rM   r2   r2   r0   r3   r     s<    



Sr   c                       s   e Zd Zdddg dddddd	f	d
ededeedf deedf deedf dedededef fddZdej	dej	fddZ
  ZS )AllegroDecoder3Dr;   r   r~   r~   r~   r~   FTTFr   r   r   r   grouprS   rT   up_block_types.temporal_upsample_blocksr   r   r   r   	norm_typec
                    s  t    tj||d dddd| _tj|d |d ddd| _d | _tg | _	|	dkr/|nd }
t
|d d	|d|	d
kr>dn|	|d ||
d| _tt|}|d }t|D ]6\}}|}|| }|t|d k}|dkrt|d ||| || d	|||
|	d
}ntd| j	| |}qV|	dkrt|d |
| _ntj|d |d	d| _t | _tj|d |d ddd| _tj|d |ddd| _d| _d S )Nr6   r   r   r   r   r   r   r   rO   r   rP   r   r   r~   )
rU   rS   rT   r   r   rV   rX   rY   r`   rW   z?Invalid `UP_block_type` encountered. Must be `AllegroUpBlock3D`r   F)r   r    r"   r   r   r&   r   r   rm   	up_blocksr   listreversedr   r   r~   r   rl   r   r   r$   r%   r   r   r   r   )r,   rS   rT   r   r   r   r   r   r   r   r`   reversed_block_out_channelsr   rs   up_block_typeprev_output_channelr   up_blockr0   r2   r3   r      sf   


zAllegroDecoder3D.__init__r   r   c                 C   s|  |j d }|ddddddd}| |}|d|dfddddd}|}| |}|| }tt| j	 j
}t rV| jrV| | j|}| jD ]}| ||}qLn| |}||}| jD ]}||}qc|ddddddd}| |}| |}|d|dfddddd}|}| |}|| }|ddddddd}| |}|d|dfddddd}|S r   )rA   r?   rB   r   r>   r   nextiterr   
parametersdtyper7   r   r   r   r   tor   r   r   r   )r,   r   r:   r   upscale_dtyper   r2   r2   r3   rD   u  s8   











zAllegroDecoder3D.forwardr   r2   r2   r0   r3   r     s<    



Wr   c                "       s  e Zd ZdZdZe										
							d8dededeedf deedf deedf dee	df dee	df dedededede
dede
de	dd f  fd!d"Zd#ejdejfd$d%Ze	d9d#ejd&e	deee B fd'd(Zd)ejdejfd*d+Zed9d)ejd&e	deejB fd,d-Zd#ejdejfd.d/Zd)ejdejfd0d1Z	2		 d:d3ejd4e	d&e	d5ejd B deejB f
d6d7Z  ZS );AutoencoderKLAllegroa!  
    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used in
    [Allegro](https://github.com/rhymes-ai/Allegro).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        in_channels (int, defaults to `3`):
            Number of channels in the input image.
        out_channels (int, defaults to `3`):
            Number of channels in the output.
        down_block_types (`tuple[str, ...]`, defaults to `("AllegroDownBlock3D", "AllegroDownBlock3D", "AllegroDownBlock3D", "AllegroDownBlock3D")`):
            tuple of strings denoting which types of down blocks to use.
        up_block_types (`tuple[str, ...]`, defaults to `("AllegroUpBlock3D", "AllegroUpBlock3D", "AllegroUpBlock3D", "AllegroUpBlock3D")`):
            tuple of strings denoting which types of up blocks to use.
        block_out_channels (`tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            tuple of integers denoting number of output channels in each block.
        temporal_downsample_blocks (`tuple[bool, ...]`, defaults to `(True, True, False, False)`):
            tuple of booleans denoting which blocks to enable temporal downsampling in.
        latent_channels (`int`, defaults to `4`):
            Number of channels in latents.
        layers_per_block (`int`, defaults to `2`):
            Number of resnet or attention or temporal convolution layers per down/up block.
        act_fn (`str`, defaults to `"silu"`):
            The activation function to use.
        norm_num_groups (`int`, defaults to `32`):
            Number of groups to use in normalization layers.
        temporal_compression_ratio (`int`, defaults to `4`):
            Ratio by which temporal dimension of samples are compressed.
        sample_size (`int`, defaults to `320`):
            Default latent size.
        scaling_factor (`float`, defaults to `0.13235`):
            The component-wise standard deviation of the trained latent space computed using the first batch of the
            training set. This is used to scale the latent space to have unit variance when training the diffusion
            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
            Synthesis with Latent Diffusion Models](https://huggingface.co/papers/2112.10752) paper.
        force_upcast (`bool`, default to `True`):
            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
            can be fine-tuned / trained to a lower range without losing too much precision in which case `force_upcast`
            can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
    Tr   r   r   r   r   r   r;   r   r   r   @  p=
ף?rS   rT   r   .r   r   r   r   latent_channelsr   r   r   temporal_compression_ratiosample_sizescaling_factorforce_upcastr   Nc                    s   t    t||||||	|
|dd	| _t||||||	||
d| _td| d| d| _t||d| _	d| _
d| _dt|d  | _d| _d| _d	| _d
}|||f| _|| j || j || j f| _d S )NT)	rS   rT   r   r   r   r   r   r   r   )rS   rT   r   r   r   r   r   r   r   r   F   x   P      )r   r    r   encoderr   decoderr"   r   
quant_convpost_quant_convuse_slicing
use_tilingr   spatial_compression_ratiotile_overlap_ttile_overlap_htile_overlap_wkernelr   )r,   rS   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   sample_framesr0   r2   r3   r      sH   


zAutoencoderKLAllegro.__init__xc                 C      | j r| |S td)Nz5Encoding without tiling has not been implemented yet.)r   tiled_encodeNotImplementedError)r,   r   r2   r2   r3   _encode     
zAutoencoderKLAllegro._encodereturn_dictc                    s^    j r|jd dkr fdd|dD }t|}n |}t|}|s*|fS t|dS )a  
        Encode a batch of videos into latents.

        Args:
            x (`torch.Tensor`):
                Input batch of videos.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded videos. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   c                       g | ]}  |qS r2   )r   ).0x_slicer,   r2   r3   
<listcomp>8      z/AutoencoderKLAllegro.encode.<locals>.<listcomp>)latent_dist)r   rA   splitr7   r8   r   r
   r   )r,   r   r   encoded_slicesh	posteriorr2   r   r3   encode&  s   

zAutoencoderKLAllegro.encodezc                 C   r   )Nz5Decoding without tiling has not been implemented yet.)r   tiled_decoder   )r,   r  r2   r2   r3   _decodeC  r   zAutoencoderKLAllegro._decodec                    sV    j r|jd dkr fdd|dD }t|}n |}|s&|fS t|dS )a  
        Decode a batch of videos.

        Args:
            z (`torch.Tensor`):
                Input batch of latent vectors.
            return_dict (`bool`, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        r   r   c                    r   r2   )r  )r   z_slicer   r2   r3   r   \  r   z/AutoencoderKLAllegro.decode.<locals>.<listcomp>r   )r   rA   r   r7   r8   r  r	   )r,   r  r   decoded_slicesdecodedr2   r   r3   decodeK  s   

zAutoencoderKLAllegro.decodec                 C   s  d}| j }| jj}|j\}}}}}	t|| jd  | jd  d }
t|| jd  | jd  d }t|	| jd  | jd  d }d}||
| | d| jj	 | jd | | jd | | jd | f}|||| jd | jd | jd f}t
|
D ]}t
|D ]}t
|D ]}|| jd  || jd  | jd  }}|| jd  || jd  | jd  }}|| jd  || jd  | jd  }}|d d d d ||||||f }|||| < || |d ks||
| | d kr?| |}||
| | d kr || |d kr |d || d  ||||  d < n|||| d |d < |||| jd | jd | jd f}|d7 }qqq~||d| jj	 || || |	| f}| jd | | jd | | jd | f}| jd | | jd | | jd | f}|d |d  |d |d  |d |d  f}t
|
D ]}||d  ||d  |d  }}t
|D ]i}||d  ||d  |d  }}t
|D ]P}||d  ||d  |d  }}t||
|d f|||d f|||d f||| | ||  |  d}|d d d d ||||||f  |7  < qːqq|ddddddd}| |}|d|dfddddd}|S Nr   r   r   r   r;   r6   )r   configr   rA   mathfloorr   r   	new_zerosr   rk   r   _prepare_for_blend	unsqueezer?   rB   r   r>   )r,   r   local_batch_sizersrtr:   r   
num_framesheightwidthoutput_num_framesoutput_heightoutput_widthcountoutput_latentvae_batch_inputrs   jkn_startn_endh_starth_endw_startw_end
video_cubelatentoutput_kerneloutput_strideoutput_overlaplatent_meanr2   r2   r3   r   e  s   """

&	(((&
&
 ((""" 2
z!AutoencoderKLAllegro.tiled_encodec                  C   s~  d}| j }| jj}| jd | | jd | | jd | f}| jd | | jd | | jd | f}|j\}}}	}
}|ddddddd}| |}|	d|dfddddd}t
|	|d  |d  d }t
|
|d  |d  d }t
||d  |d  d }d}||| | | jj| jd | jd | jd f}||||d |d |d f}t|D ]}t|D ]}t|D ]}||d  ||d  |d  }}||d  ||d  |d  }}||d  ||d  |d  }}|d d d d ||||||f }|||| < || |d ks$||| | d krl| |}||| | d krP|| |d krP|d || d  ||||  d < n|||| d |d < ||||d |d |d f}|d7 }qqq||| jj|	| |
| || f}| jd | jd  | jd | jd  | jd | jd  f}t|D ]}|| jd  || jd  | jd  }}t|D ]o}|| jd  || jd  | jd  }}t|D ]S}|| jd  || jd  | jd  }}t|||d f|||d f|||d f||| | ||  |  d}|d d d d ||||||f  |7  < qڐqq|ddddd }|S r  )r   r  r   r   r   rA   r?   rB   r   r>   r  r  r  rT   rk   r   r  r  
contiguous) r,   r  r  r  r  latent_kernellatent_strider:   r   r  r  r  r  r  r  r  decoded_videosr  rs   r  r   r!  r"  r#  r$  r%  r&  current_latentcurrent_videovideovideo_overlapout_video_blendr2   r2   r3   r    s   ((

	"""&

$((( 2z!AutoencoderKLAllegro.tiled_decodeFr   sample_posterior	generatorc           	      C   sJ   |}|  |j}|r|j|d}n| }| |j}|s |fS t|dS )a  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
            generator (`torch.Generator`, *optional*):
                PyTorch random number generator.
        )r7  r  )r  r   r   moder  r	   )	r,   r   r6  r   r7  r   r  r  decr2   r2   r3   rD     s   
zAutoencoderKLAllegro.forward)r   r   r   r   r   r   r   r;   r   r   r   r;   r   r   T)T)FTN)rE   rF   rG   rH    _supports_gradient_checkpointingr   r!   r   r}   rJ   rI   r    r7   rL   r   r   r   r
   r  r  r	   r  r   r  	GeneratorrD   rM   r2   r2   r0   r3   r     s    -





I
"S\r   c                 C   s   | \}}}|\}}}	|\}
}}|dkr|dkrN|d d d d d|d d d d f t d| |j| |dd |d d d d d|d d d d f< ||d k r|d d d d | d d d d d f dt d| |j|  |dd |d d d d | d d d d d f< |dkr|d d d d d d d|	d d f t d|	 |j|	 |	d |d d d d d d d|	d d f< ||d k r
|d d d d d d |	 d d d f dt d|	 |j|	  |	d |d d d d d d |	 d d d f< |
dkrA|d d d d d d d d d|f t d| |j|  |d d d d d d d d d|f< |
|d k r~|d d d d d d d d | d f dt d| |j|   |d d d d d d d d | d f< |S )Nr   r   )r7   arangerI   r   devicereshape)n_paramh_paramw_paramr   nn_max	overlap_nr  h_max	overlap_hww_max	overlap_wr2   r2   r3   r  0  sB   


$
&&
($&&(
$&&(r  )$r  r7   torch.nnr"   configuration_utilsr   r   utils.accelerate_utilsr   attention_processorr   r   autoencoders.vaer	   r
   downsamplingr   modeling_outputsr   modeling_utilsr   rx   r   
upsamplingr   vaer   Moduler   rN   r~   r   r   r   r   r  r2   r2   r2   r3   <module>   s2   eWRo    