o
    Gis                     @   s  d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ e%e&Z'G dd dej(Z)G dd deeee	Z*dS )    )AnyN   )ConfigMixinregister_to_config)UNet2DConditionLoadersMixin)logging   )get_activation)	AttentionAttentionMixinFeedForward)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttnAddedKVProcessorAttnProcessorFusedAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)TransformerTemporalModel   )UNetMidBlock3DCrossAttnget_down_blockget_up_block)UNet3DConditionOutputc                       s^   e Zd Z				ddedededed	ed
edB def fddZdejdejfddZ	  Z
S )"I2VGenXLTransformerTemporalEncodergegluFN        dimnum_attention_headsattention_head_dimactivation_fnupcast_attentionff_inner_dimdropoutc              	      sN   t    tj|ddd| _t||||d|dd| _t|||d|dd| _d S )NTh㈵>)elementwise_affineepsF)	query_dimheadsdim_headr$   biasr"   out_bias)r$   r!   final_dropout	inner_dimr+   )	super__init__nn	LayerNormnorm1r
   attn1r   ff)selfr   r   r    r!   r"   r#   r$   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/unets/unet_i2vgen_xl.pyr0   /   s&   

	z+I2VGenXLTransformerTemporalEncoder.__init__hidden_statesreturnc                 C   s^   |  |}| j|d d}|| }|jdkr|d}| |}|| }|jdkr-|d}|S )N)encoder_hidden_states   r   )r3   r4   ndimsqueezer5   )r6   r;   norm_hidden_statesattn_output	ff_outputr9   r9   r:   forwardM   s   





z*I2VGenXLTransformerTemporalEncoder.forward)r   FNr   )__name__
__module____qualname__intstrboolr0   torchTensorrD   __classcell__r9   r9   r7   r:   r   .   s2    r   c                       sd  e Zd ZdZdZe										
		d8dedB dededeedf deedf deedf dededB dedeee B deee B dB f fddZ	d9dedB deddfddZ
d d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Z					,d:d-ejd.ejeB eB d/ejd0ejd1ejdB d2ejdB d3ejdB d4eeef dB d5edeeej B fd6d7Z  ZS );I2VGenXLUNeta	  
    I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and
    returns a sample-shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
        down_block_types (`tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
        up_block_types (`tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
        block_out_channels (`tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers is skipped in post-processing.
        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
        attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim.
        num_attention_heads (`int`, *optional*): The number of attention heads.
    FNr>   CrossAttnDownBlock3DrP   rP   DownBlock3D	UpBlock3DCrossAttnUpBlock3DrT   rT   i@  i     rV   r          @   sample_sizein_channelsout_channelsdown_block_types.up_block_typesblock_out_channelslayers_per_blocknorm_num_groupscross_attention_dimr    r   c                    s  t    |
}t|t|krtd| d| dt|t|kr-td| d| dt|tsEt|t|krEtd| d| dtj|| |d dd	d
| _t	d||d d	|d| _
ttjd|d dd	dt tj|d |d dd	d	dt tj|d |dd	d	d| _t|d|d |dd| _ttjd|d dd	dt tdtj|d |d ddd	dt tj|d |	ddd	d| _|d d }t|d dd| _|d }t||dd| _tt|	|t t||	| | _tt||t t||| _tg | _tg | _t|tr#|ft| }|d }t|D ]-\}}|}|| }|t|d	 k}t|||||| dd||	|| d	dd}| j| q+t|d |ddd	|	|d |dd	| _ d| _!t"t#|}t"t#|}|d }t|D ]M\}}|t|d	 k}|}|| }|t$|d	 t|d	  }|sd}|  j!d	7  _!nd}t%||d	 |||||dd||	|| d|d}| j| |}qtj&|d |dd| _'t(d| _)tj|d |dd	d
| _*d S )Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: r   r   r   )kernel_sizepadding   )r   r    r[   
num_layersra   r>   )re   )stridere   r   gelu)r   r   r#   r    r!   )rW   rW      Tsilu)act_fnr%   F)rg   r[   r\   temb_channelsadd_downsample
resnet_epsresnet_act_fnresnet_groupsrb   r   downsample_paddingdual_cross_attention)	r[   rm   ro   rp   output_scale_factorrb   r   rq   rs   )rg   r[   r\   prev_output_channelrm   add_upsamplero   rp   rq   rb   r   rs   resolution_idx)num_channels
num_groupsr'   )+r/   r0   len
ValueError
isinstancerH   r1   Conv2dconv_inr   transformer_in
SequentialSiLUimage_latents_proj_inr   image_latents_temporal_encoderAdaptiveAvgPool2dimage_latents_context_embeddingr   	time_projr   time_embeddingLinearcontext_embeddingfps_embedding
ModuleListdown_blocks	up_blocks	enumerater   appendr   	mid_blocknum_upsamplerslistreversedminr   	GroupNormconv_norm_outr	   conv_actconv_out)r6   rZ   r[   r\   r]   r^   r_   r`   ra   rb   r    r   time_embed_dimtimestep_input_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsreversed_num_attention_headsup_block_typerv   rw   up_blockr7   r9   r:   r0   |   s   
	


zI2VGenXLUNet.__init__r   
chunk_sizer   r<   c                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r   z-Make sure to set `dim` to either 0 or 1, not r   moduler   r   c                    6   t | dr| j||d |  D ]} ||| qd S Nset_chunk_feed_forward)r   r   hasattrr   childrenr   r   r   childfn_recursive_feed_forwardr9   r:   r   O  
   
zGI2VGenXLUNet.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)r|   rK   r1   ModulerH   r   )r6   r   r   r   r9   r   r:   enable_forward_chunking<  s   z$I2VGenXLUNet.enable_forward_chunkingc                    s<   dt jjdtdtf fdd |  D ]} |d d qd S )Nr   r   r   c                    r   r   r   r   r   r9   r:   r   [  r   zHI2VGenXLUNet.disable_forward_chunking.<locals>.fn_recursive_feed_forwardr   )rK   r1   r   rH   r   )r6   r   r9   r   r:   disable_forward_chunkingZ  s   z%I2VGenXLUNet.disable_forward_chunkingc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S N)r8   r   .0procr9   r9   r:   	<genexpr>j      z:I2VGenXLUNet.set_default_attn_processor.<locals>.<genexpr>c                 s   r   r   )r8   r   r   r9   r9   r:   r   l  r   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allattn_processorsvaluesr   r   r|   nextiterset_attn_processor)r6   	processorr9   r9   r:   set_default_attn_processorf  s   z'I2VGenXLUNet.set_default_attn_processorc                 C   sH   t | jD ]\}}t|d| t|d| t|d| t|d| qdS )aF  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        s1s2b1b2N)r   r   setattr)r6   r   r   r   r   r   upsample_blockr9   r9   r:   enable_freeuv  s   zI2VGenXLUNet.enable_freeuc                 C   sP   h d}t | jD ]\}}|D ]}t||st||ddur$t||d qq	dS )zDisables the FreeU mechanism.>   r   r   r   r   N)r   r   r   getattrr   )r6   
freeu_keysr   r   kr9   r9   r:   disable_freeu  s   zI2VGenXLUNet.disable_freeuc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        > [!WARNING] > This API is 🧪 experimental.
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr   itemsrI   r8   rE   r|   modulesr}   r
   fuse_projectionsr   r   )r6   _attn_processorr   r9   r9   r:   fuse_qkv_projections  s   
z!I2VGenXLUNet.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )un   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is 🧪 experimental.

        N)r   r   )r6   r9   r9   r:   unfuse_qkv_projections  s   
z#I2VGenXLUNet.unfuse_qkv_projectionsTsampletimestepfpsimage_latentsimage_embeddingsr=   timestep_condcross_attention_kwargsreturn_dictc
           &   
      s   |j \}
}}}}d| j  d}d}t fdd|j dd D r(td d}|}t|s`|jjd	k}|jjd
k}t	|t
rK|sD|rGtjntj}n
|sO|rRtjntj}tj|g||jd}nt|j dkro|d |j}||j d }| |}|j| jd}| ||}||j d }| | |j| jd}|| }|j|d|j d | d}||
d| jj}tj||gdd}|ddddddddf }|ddddd|j d |j d  |j d |j d |j d }| |}|j \}}}}|dddd||| |}tj||gdd}| |}| d| jj!| jj}tj||gdd}|j|d|j d | d}|ddddd|j d |j d  |j d |j d |j d }| "|}|dddf |
||||ddddd|
| | ||}| #|}||
||||ddddd}tj||gdd}|ddddd|j d | df|j dd  }| $|}| j%|||ddd }|f} | j&D ]&}!t'|!dr|!j(r|!|||||d\}}"n	|!|||d\}}"| |"7 } q| j)dur| j)|||||d}t*| j+D ]Q\}#}$|#t| j+d k}%| t|$j, d }"| dt|$j,  } |%s9|r9| d j dd }t'|$drP|$j(rP|$|||"||||d}q	|$|||"||d}q	| -|}| .|}| /|}|dddf d|f|j dd  ddddd}|	s|fS t0|dS )a  
        The [`I2VGenXLUNet`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition".
            image_latents (`torch.Tensor`): Image encodings from the VAE.
            image_embeddings (`torch.Tensor`):
                Projection embeddings of the conditioning image computed with a vision encoder.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                tuple.

        Returns:
            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        r   FNc                 3   s    | ]	}|  d kV  qdS )r   Nr9   )r   sdefault_overall_up_factorr9   r:   r     s    z'I2VGenXLUNet.forward.<locals>.<genexpr>z9Forward upsample size to force interpolation output size.Tmpsnpu)dtypedevicer   )r   )r   output_sizer   )r   r   r>   rt   )
num_framesr   r   has_cross_attention)r;   tembr=   r   r   )r;   r   r   )r=   r   r   )r;   r   res_hidden_states_tupler=   upsample_sizer   r   )r;   r   r   r   r   )r   )1shaper   anyloggerinforK   	is_tensorr   typer}   floatfloat32float64int32int64tensorr{   toexpandr   r   r   r   repeat_interleave	new_zerosconfigrb   catpermutereshaper   r   viewr[   r   r   r   r   r   r   r   r   r   r   resnetsr   r   r   r   )&r6   r   r   r   r   r   r=   r   r   r   
batch_sizechannelsr   heightwidthforward_upsample_sizer   	timestepsis_mpsis_npur   t_embfps_embembcontext_embimage_latents_for_context_embdsimage_latents_context_embs_batch_size	_channels_height_width	image_embdown_block_res_samplesdownsample_blockres_samplesr   r   r   r9   r   r:   rD     s   &
 



 




 4





	

6
zI2VGenXLUNet.forward)Nr>   r>   rO   rR   rU   r   rW   rX   rY   N)Nr   )NNNNT)rE   rF   rG   __doc__ _supports_gradient_checkpointingr   rH   tuplerI   r0   r   r   r   r   r   r   r   rK   rL   r   dictr   rJ   r   rD   rM   r9   r9   r7   r:   rN   _   s    



 @		
rN   )+typingr   rK   torch.nnr1   configuration_utilsr   r   loadersr   utilsr   activationsr	   	attentionr
   r   r   attention_processorr   r   r   r   r   
embeddingsr   r   modeling_utilsr   !transformers.transformer_temporalr   unet_3d_blocksr   r   r   unet_3d_conditionr   
get_loggerrE   r   r   r   rN   r9   r9   r9   r:   <module>   s"   
1