from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import UNet2DConditionLoadersMixin
from ...utils import logging
from ..activations import get_activation
from ..attention import Attention, FeedForward
from ..attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
    CROSS_ATTENTION_PROCESSORS,
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
    FusedAttnProcessor2_0,
)
from ..embeddings import TimestepEmbedding, Timesteps
from ..modeling_utils import ModelMixin
from ..transformers.transformer_temporal import TransformerTemporalModel
from .unet_3d_blocks import (
    CrossAttnDownBlock3D,
    CrossAttnUpBlock3D,
    DownBlock3D,
    UNetMidBlock3DCrossAttn,
    UpBlock3D,
    get_down_block,
    get_up_block,
)
from .unet_3d_condition import UNet3DConditionOutput


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class I2VGenXLTransformerTemporalEncoder(nn.Module):
    """A self-attention + feed-forward block with residual connections, used to encode image latents over time."""

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        activation_fn: str = "geglu",
        upcast_attention: bool = False,
        ff_inner_dim: Optional[int] = None,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=True, eps=1e-5)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=False,
            upcast_attention=upcast_attention,
            out_bias=True,
        )
        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=False,
            inner_dim=ff_inner_dim,
            bias=True,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        norm_hidden_states = self.norm1(hidden_states)
        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        ff_output = self.ff(hidden_states)
        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)
        return hidden_states
		d@dee dededee	df dee	df deedf dedee dede
eee f dee
eee f  f fddZedee	ef fddZde
eee	ef f fddZdAd!ee d"eddfd#d$Zd%d& Zd'd( ZdBd)eddfd*d+Zd,d- Zd.d/ Zd0d1 Zd2d3 Z					4dCd5ejd6e
ejeef d7ejd8ejd9eej d:eej d;eej d<eee	ef  d=ede
eeej f fd>d?Z  Z S )DI2VGenXLUNeta	  
    I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and
    returns a sample-shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers are skipped in post-processing.
        cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
        attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim.
        num_attention_heads (`int`, *optional*): The number of attention heads.
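
    Example:
        An illustrative load-and-inspect sketch; it assumes the publicly hosted `ali-vilab/i2vgen-xl` checkpoint,
        whose UNet weights live in a `unet` subfolder:

        ```py
        >>> from diffusers import I2VGenXLUNet

        >>> unet = I2VGenXLUNet.from_pretrained("ali-vilab/i2vgen-xl", subfolder="unet")
        >>> unet.config.in_channels
        4
        ```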
    """

    _supports_gradient_checkpointing = False

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 4,
        out_channels: int = 4,
        down_block_types: Tuple[str, ...] = (
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "DownBlock3D",
        ),
        up_block_types: Tuple[str, ...] = (
            "UpBlock3D",
            "CrossAttnUpBlock3D",
            "CrossAttnUpBlock3D",
            "CrossAttnUpBlock3D",
        ),
        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
        layers_per_block: int = 2,
        norm_num_groups: Optional[int] = 32,
        cross_attention_dim: int = 1024,
        attention_head_dim: Union[int, Tuple[int]] = 64,
        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
    ):
        super().__init__()

        # `num_attention_heads` is kept in the config for compatibility; the value actually used below
        # is `attention_head_dim`.
        num_attention_heads = attention_head_dim

        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
            )

        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

        # input: the noisy latents are concatenated channel-wise with the temporally encoded image latents
        self.conv_in = nn.Conv2d(in_channels + in_channels, block_out_channels[0], kernel_size=3, padding=1)

        self.transformer_in = TransformerTemporalModel(
            num_attention_heads=8,
            attention_head_dim=num_attention_heads,
            in_channels=block_out_channels[0],
            num_layers=1,
            norm_num_groups=norm_num_groups,
        )

        # image embedding
        self.image_latents_proj_in = nn.Sequential(
            nn.Conv2d(4, in_channels * 4, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels * 4, in_channels * 4, 3, stride=1, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels * 4, in_channels, 3, stride=1, padding=1),
        )
        self.image_latents_temporal_encoder = I2VGenXLTransformerTemporalEncoder(
            dim=in_channels,
            num_attention_heads=2,
            ff_inner_dim=in_channels * 4,
            attention_head_dim=in_channels,
            activation_fn="gelu",
        )
        self.image_latents_context_embedding = nn.Sequential(
            nn.Conv2d(4, in_channels * 8, 3, padding=1),
            nn.SiLU(),
            nn.AdaptiveAvgPool2d((32, 32)),
            nn.Conv2d(in_channels * 8, in_channels * 16, 3, stride=2, padding=1),
            nn.SiLU(),
            nn.Conv2d(in_channels * 16, cross_attention_dim, 3, stride=2, padding=1),
        )

        # other embeddings -- time
        time_embed_dim = block_out_channels[0] * 4
        self.time_proj = Timesteps(block_out_channels[0], True, 0)
        timestep_input_dim = block_out_channels[0]
        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn="silu")

        # context embeddings
        self.context_embedding = nn.Sequential(
            nn.Linear(cross_attention_dim, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, cross_attention_dim * in_channels),
        )

        # fps
        self.fps_embedding = nn.Sequential(
            nn.Linear(timestep_input_dim, time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim)
        )

        # blocks
        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(down_block_types)

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block,
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=1e-05,
                resnet_act_fn="silu",
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                num_attention_heads=num_attention_heads[i],
                downsample_padding=1,
                dual_cross_attention=False,
            )
            self.down_blocks.append(down_block)

        # mid
        self.mid_block = UNetMidBlock3DCrossAttn(
            in_channels=block_out_channels[-1],
            temb_channels=time_embed_dim,
            resnet_eps=1e-05,
            resnet_act_fn="silu",
            output_scale_factor=1,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads[-1],
            resnet_groups=norm_num_groups,
            dual_cross_attention=False,
        )

        # count how many layers upsample the images
        self.num_upsamplers = 0

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_num_attention_heads = list(reversed(num_attention_heads))

        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            up_block = get_up_block(
                up_block_type,
                num_layers=layers_per_block + 1,
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=1e-05,
                resnet_act_fn="silu",
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                num_attention_heads=reversed_num_attention_heads[i],
                dual_cross_attention=False,
                resolution_idx=i,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-05)
        self.conv_act = get_activation("silu")
        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)

    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.
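
        Example (an illustrative round trip; the dict returned by `attn_processors` uses exactly the key paths this
        method expects):

        ```py
        >>> unet.set_attn_processor(unet.attn_processors)
        ```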

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
        """
        Enables [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
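
        Example (illustrative; chunking the feed-forward computation over the sequence dimension lowers peak memory
        at some cost in speed):

        ```py
        >>> unet.enable_forward_chunking(chunk_size=1, dim=1)
        ```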
        """
        if dim not in [0, 1]:
            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

        # By default chunk size is 1
        chunk_size = chunk_size or 1

        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
            if hasattr(module, "set_chunk_feed_forward"):
                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

            for child in module.children():
                fn_recursive_feed_forward(child, chunk_size, dim)

        for module in self.children():
            fn_recursive_feed_forward(module, chunk_size, dim)

    def disable_forward_chunking(self):
        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
            if hasattr(module, "set_chunk_feed_forward"):
                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

            for child in module.children():
                fn_recursive_feed_forward(child, chunk_size, dim)

        for module in self.children():
            fn_recursive_feed_forward(module, None, 0)

    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnAddedKVProcessor()
        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor)

    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
            module.gradient_checkpointing = value

    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float) -> None:
        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
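
        Example (illustrative values; they sit in the range the FreeU authors report for Stable Diffusion models,
        and good values for a specific I2VGen-XL checkpoint have to be found empirically):

        ```py
        >>> unet.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        >>> unet.disable_freeu()  # revert to the default behavior
        ```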
        s1s2b1b2N)r   r   setattr)r>   r   r   r   r   r   upsample_blockrA   rA   rB   enable_freeu  s   zI2VGenXLUNet.enable_freeuc                 C   sP   h d}t | jD ]\}}|D ]}t||st||ddur$t||d qq	dS )zDisables the FreeU mechanism.>   r   r   r   r   N)r   r   r   getattrr   )r>   
freeu_keysr   r   krA   rA   rB   disable_freeu  s   zI2VGenXLUNet.disable_freeuc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
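
        Example (illustrative):

        ```py
        >>> unet.fuse_qkv_projections()
        >>> # ... run inference ...
        >>> unet.unfuse_qkv_projections()
        ```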
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr   itemsrQ   r@   rM   r   modulesr   r   fuse_projectionsr   r   )r>   _attn_processorr   rA   rA   rB   fuse_qkv_projections  s   
z!I2VGenXLUNet.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)r   r   )r>   rA   rA   rB   unfuse_qkv_projections  s   

z#I2VGenXLUNet.unfuse_qkv_projectionsTsampletimestepfpsimage_latentsimage_embeddingsrE   timestep_condcross_attention_kwargsreturn_dictc
           %   
      s  |j \}
}}}}d| j  d}d}t fdd|j dd D r(td d}|}t|sV|jjd	k}t	|t
rC|r?tjntj}n|rHtjntj}tj|g||jd
}nt|j dkre|d |j}||j d }| |}|j| jd}| ||}||j d }| | |j| jd}|| }|j|dd}||
d| jj}tj||gdd}|ddddddddf }|ddddd|j d |j d  |j d |j d |j d }| |}|j \}}}}|dddd||| |}tj||gdd}| |}| d| jj!| jj}tj||gdd}|j|dd}|ddddd|j d |j d  |j d |j d |j d }| "|}|dddf |
||||ddddd|
| | ||}| #|}||
||||ddddd}tj||gdd}|ddddd|j d | df|j dd  }| $|}| j%|||ddd }|f}| j&D ]&} t'| dr| j(r| |||||d\}}!n	| |||d\}}!||!7 }q| j)dur| j)|||||d}t*| j+D ]Q\}"}#|"t| j+d k}$|t|#j, d }!|dt|#j,  }|$s#|r#|d j dd }t'|#dr:|#j(r:|#|||!||||d}q|#|||!||d}q| -|}| .|}| /|}|dddf d|f|j dd  ddddd}|	su|fS t0|dS )a  
        The [`I2VGenXLUNet`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition".
            image_latents (`torch.Tensor`): Image encodings from the VAE.
            image_embeddings (`torch.Tensor`):
                Projection embeddings of the conditioning image computed with a vision encoder.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                tuple.

        Returns:
            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
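
        Example:
            An illustrative dummy-shape call (all sizes hypothetical) on a `unet` instance such as the one loaded in
            the class-level example. The spatial size must be divisible by the overall upsampling factor, and the
            embedding inputs must carry `config.cross_attention_dim` features:

            ```py
            >>> import torch

            >>> b, c, f, h, w = 1, 4, 8, 32, 32
            >>> out = unet(
            ...     sample=torch.randn(b, c, f, h, w),
            ...     timestep=10,
            ...     fps=torch.tensor([8]),
            ...     image_latents=torch.randn(b, c, f, h, w),
            ...     image_embeddings=torch.randn(b, 1, unet.config.cross_attention_dim),
            ...     encoder_hidden_states=torch.randn(b, 77, unet.config.cross_attention_dim),
            ... ).sample
            >>> out.shape
            torch.Size([1, 4, 8, 32, 32])
            ```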
        r   FNc                 3   s    | ]	}|  d kV  qdS )r   NrA   )r   sdefault_overall_up_factorrA   rB   r   8  s    z'I2VGenXLUNet.forward.<locals>.<genexpr>z9Forward upsample size to force interpolation output size.Tmps)dtypedevicer   )r   )repeatsr&   r   )r&   r   rF   rx   )
num_framesr   r   has_cross_attention)rC   tembrE   r   r   )rC   r  r   )rE   r   r   )rC   r  res_hidden_states_tuplerE   upsample_sizer   r   )rC   r  r  r  r   )r   )1shaper   anyloggerinforS   	is_tensorr   typer   floatfloat32float64int32int64tensorr   toexpandr   r   r   r   repeat_interleave	new_zerosconfigrf   catpermutereshaper   r   viewr_   r   r   r   r   r   r   r  r   r   r   resnetsr   r   r   r"   )%r>   r   r   r   r   r   rE   r   r   r   
batch_sizechannelsr   heightwidthforward_upsample_sizer  	timestepsis_mpsr   t_embfps_embembcontext_embimage_latents_for_context_embdsimage_latents_context_embs_batch_size	_channels_height_width	image_embdown_block_res_samplesdownsample_blockres_samplesr   r   r   rA   r   rB   rL     s   &
 



 




 4





	

6
zI2VGenXLUNet.forward)NrF   rF   rW   rX   rY   r   r[   r\   r]   N)Nr   )F)NNNNT)!rM   rN   rO   __doc__ _supports_gradient_checkpointingr	   r   rP   r   rQ   r   r8   propertyr   r   r   r   r   r   r   rR   r   r   r   r   r   rS   rT   r  r   r"   rL   rU   rA   rA   r?   rB   rV   e   s    


 ?#		