o
    Giw                    @   s  d dl mZ d dlmZ d dlZd dlmZ d dlm  mZ	 ddl
mZmZmZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ d	d
lmZmZ d	dlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d	dl&m'Z'm(Z( d	dl)m*Z* d	dl+m,Z,m-Z-m.Z. d	dl/m0Z0 d	dl1m2Z2 ddl3m4Z4 ddl5m6Z6 e7e8Z9eG dd deZ:G dd dej;Z<G dd dej;Z=G dd dej;Z>G dd dej;Z?G dd dej;Z@G d d! d!ej;ZAG d"d# d#ej;ZBG d$d% d%e*eeZCG d&d' d'e*eeeeZDdS )(    )	dataclass)AnyN   )ConfigMixin
FrozenDictregister_to_config)FromOriginalModelMixinPeftAdapterMixinUNet2DConditionLoadersMixin)
BaseOutputapply_lora_scale	deprecatelogging)apply_freeu   )AttentionMixinBasicTransformerBlock)	ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttnAddedKVProcessorAttnProcessorAttnProcessor2_0FusedAttnProcessor2_0IPAdapterAttnProcessorIPAdapterAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)Downsample2DResnetBlock2D
Upsample2D)DualTransformer2DModel)Transformer2DModel   )UNetMidBlock2DCrossAttn)UNet2DConditionModelc                   @   s   e Zd ZU dZejed< dS )UNetMotionOutputa  
    The output of [`UNetMotionOutput`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    sampleN)__name__
__module____qualname____doc__torchTensor__annotations__ r0   r0   \/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/unets/unet_motion_model.pyr'   2   s   
 r'   c                        s   e Zd ZdZ													
	
		d%dedededB dedB dededededB dededB dededededB dedB f fddZ					d&de	j
de	jdB de	jdB de	jdB d ed!eeef dB d"e	j
fd#d$Z  ZS )'AnimateDiffTransformer3Das  
    A Transformer model for video-like data.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlock` attention should contain a bias parameter.
        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
            This is fixed during training since it is used to learn a number of position embeddings.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
            activation functions.
        norm_elementwise_affine (`bool`, *optional*):
            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
        double_self_attention (`bool`, *optional*):
            Configure if each `TransformerBlock` should contain two self-attention layers.
        positional_embeddings: (`str`, *optional*):
            The type of positional embeddings to apply to the sequence input before passing use.
        num_positional_embeddings: (`int`, *optional*):
            The maximum length of the sequence over which to apply positional embeddings.
       X   Nr$               FgegluTnum_attention_headsattention_head_dimin_channelsout_channels
num_layersdropoutnorm_num_groupscross_attention_dimattention_biassample_sizeactivation_fnnorm_elementwise_affinedouble_self_attentionpositional_embeddingsnum_positional_embeddingsc                    s   t    | _| _ || _tj||ddd| _t|| _	t
 	
fddt|D | _t|| _d S )Nư>T)
num_groupsnum_channelsepsaffinec                    s*   g | ]}t  
	d qS ))r=   r?   rB   r@   rD   rC   rE   rF   )r   ).0_rB   r@   r9   r?   rD   r=   	inner_dimrC   r8   rF   rE   r0   r1   
<listcomp>z   s     z5AnimateDiffTransformer3D.__init__.<locals>.<listcomp>)super__init__r8   r9   r:   nn	GroupNormnormLinearproj_in
ModuleListrangetransformer_blocksproj_out)selfr8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   	__class__rN   r1   rR   \   s   
z!AnimateDiffTransformer3D.__init__hidden_statesencoder_hidden_statestimestepclass_labels
num_framescross_attention_kwargsreturnc                 C   s   |j \}}}	}
|| }|}|dddf ||||	|
}|ddddd}| |}|ddddd||	 |
 ||}| j|d}| jD ]}||||||d}qE| j|d}|ddddf ||	|
||ddddd }||||	|
}|| }|S )	a0  
        The [`AnimateDiffTransformer3D`] forward method.

        Args:
            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
                Input hidden_states.
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.LongTensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            num_frames (`int`, *optional*, defaults to 1):
                The number of frames to be processed per batch. This is used to reshape the hidden states.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Returns:
            torch.Tensor:
                The output tensor.
        Nr   r   r$   r      )input)r_   r`   ra   rd   rb   )shapereshapepermuterU   rW   rZ   r[   
contiguous)r\   r_   r`   ra   rb   rc   rd   batch_frameschannelheightwidth
batch_sizeresidualblockoutputr0   r0   r1   forward   s2   #
$
	z AnimateDiffTransformer3D.forward)r3   r4   NNr$   r5   r6   NFNr7   TTNN)NNNr$   N)r)   r*   r+   r,   intfloatboolstrrR   r-   r.   
LongTensordictr   rt   __classcell__r0   r0   r]   r1   r2   ?   s    	
5r2   c                %       s   e Zd Z																d%d
edededededededededededededeee B ded	B dedeee B def$ fddZ			d&de	j
de	j
d	B d ed!e	j
ee	j
d"f B fd#d$Z  ZS )'DownBlockMotionr5   r$   rG   defaultswishr6   T      ?Nr:   r;   temb_channelsr=   r<   
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factoradd_downsampledownsample_paddingtemporal_num_attention_headstemporal_cross_attention_dimtemporal_max_seq_length%temporal_transformer_layers_per_blocktemporal_double_self_attentionc                    s0  t    g }g }t|tr|f| }nt||kr!td| t|tr,|f| }nt||kr9td| t|D ]5}|dkrE|n|}|t|||||	|||||
d
 |t	|| ||| |	|ddd||||  |d q=t
|| _t
|| _|rt
t|d	||d
dg| _nd | _d| _d S )Nz\`temporal_transformer_layers_per_block` must be an integer or a tuple of integers of length zS`temporal_num_attention_heads` must be an integer or a tuple of integers of length r   
r:   r;   r   rJ   groupsr=   time_embedding_normnon_linearityr   pre_normFr7   
sinusoidalr8   r:   r<   r>   r?   r@   rB   rE   rF   r9   rD   Topuse_convr;   paddingname)rQ   rR   
isinstanceru   len
ValueErrorrY   appendr    r2   rS   rX   resnetsmotion_modulesr   downsamplersgradient_checkpointing)r\   r:   r;   r   r=   r<   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ir]   r0   r1   rR      sz   




zDownBlockMotion.__init__r_   tembrc   re   .c                 O   s   t |dks|dd d urd}tdd| d}t| j| j}|D ]$\}	}
t r4| jr4| 	|	||}n|	||d}|
||d}||f }q!| j
d ur[| j
D ]}||d}qN||f }||fS )	Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0r0   input_tensorr   rc   r_   )r   getr   zipr   r   r-   is_grad_enabledr   _gradient_checkpointing_funcr   )r\   r_   r   rc   argskwargsdeprecation_messageoutput_statesblocksresnetmotion_moduledownsamplerr0   r0   r1   rt   4  s    


zDownBlockMotion.forward)r5   r$   rG   r}   r~   r6   Tr   Tr$   r$   Nr6   r$   T)Nr$   r)   r*   r+   ru   rv   rx   rw   tuplerR   r-   r.   rt   r{   r0   r0   r]   r1   r|      s    	


`r|   c                5       s0  e Zd Z															
	
	
	
						d3dedededededeee B dededededededededededededed ed!ed"edB d#ed$ed%eee B d&ef4 fd'd(Z							d4d)e	j
d*e	j
dB d+e	j
dB d,e	j
dB d-ed.e	j
dB d/eeef dB d0e	j
dB fd1d2Z  ZS )5CrossAttnDownBlockMotionr5   r$   rG   r}   r~   r6   T   r   FN   r:   r;   r   r=   r<   transformer_layers_per_blockr   r   r   r   r   r8   r?   r   r   r   dual_cross_attentionuse_linear_projectiononly_cross_attentionupcast_attentionattention_typer   r   r   r   r   c                    s  t    g }g }g }d| _|| _t|tr|f| }nt||kr)td| t|tr4|f| }nt||krAtd| t|D ]X}|dkrM|n|}|	t
|||||
|||	||d
 |sx|	t||| ||| ||
||||d
 n|	t||| |d||
d |	t|||| |
|d	d
d||| |d qEt|| _t|| _t|| _|rtt|d||ddg| _nd | _d	| _d S )NTPtransformer_layers_per_block must be an integer or a list of integers of length Ytemporal_transformer_layers_per_block must be an integer or a list of integers of length r   r   r:   r<   r?   r>   r   r   r   r   r$   r:   r<   r?   r>   Fr7   r   r   r   r   )rQ   rR   has_cross_attentionr8   r   ru   r   r   rY   r   r    r#   r"   r2   rS   rX   
attentionsr   r   r   r   r   )r\   r:   r;   r   r=   r<   r   r   r   r   r   r   r8   r?   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r0   r1   rR   W  s   



z!CrossAttnDownBlockMotion.__init__r_   r   r`   attention_maskrc   encoder_attention_maskrd   additional_residualsc	              	   C   s   |d ur| dd d urtd d}	tt| j| j| j}
t|
D ]C\}\}}}t	
 r8| jr8| |||}n|||d}||||||ddd }|||d}|t|
d	 kr`|d ur`|| }|	|f }	q"| jd ur{| jD ]}||d
}qn|	|f }	||	fS )Nr   SPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r0   r   Fr_   r`   rd   r   r   return_dictr   r   r$   r   )r   loggerwarninglistr   r   r   r   	enumerater-   r   r   r   r   r   )r\   r_   r   r`   r   rc   r   rd   r   r   r   r   r   attnr   r   r0   r0   r1   rt     s8   
	


z CrossAttnDownBlockMotion.forward)r5   r$   r$   rG   r}   r~   r6   Tr$   r   r   r$   TFFFFr}   Nr   r6   r$   T)NNNr$   NNNr)   r*   r+   ru   rv   r   rx   rw   rR   r-   r.   rz   r   rt   r{   r0   r0   r]   r1   r   V  s    
	

 
	r   c                7       sD  e Zd Z													
										d6dedededededB dededeee B dededededededededededed ed!ed"ed#edB d$ed%ed&eee B f4 fd'd(Z							d7d)e	j
d*ee	j
d+f d,e	j
dB d-e	j
dB d.eeef dB d/edB d0e	j
dB d1e	j
dB d2ed3e	j
fd4d5Z  ZS )8CrossAttnUpBlockMotionNr5   r$   rG   r}   r~   r6   Tr   r   Fr   r:   r;   prev_output_channelr   resolution_idxr=   r<   r   r   r   r   r   r   r8   r?   r   add_upsampler   r   r   r   r   r   r   r   r   c           !         s  t    g }g }g }d| _|| _t|tr|f| }nt||kr.td| dt| t|tr9|f| }nt||krKtd| dt| t|D ]c}||d krY|n|}|dkra|n|} |	t
| | |||	|||
|||d
 |s|	t||| ||| ||||||d
 n|	t||| |d||d	 |	t|||| ||d
dd||| d
 qOt|| _t|| _t|| _|rtt|d|dg| _nd | _d
| _|| _d S )NTr   z, got r   r$   r   r   r   r   Fr7   r   
r8   r:   r<   r>   r?   r@   rB   rE   rF   r9   r   r;   )rQ   rR   r   r8   r   ru   r   r   rY   r   r    r#   r"   r2   rS   rX   r   r   r   r!   
upsamplersr   r   )!r\   r:   r;   r   r   r   r=   r<   r   r   r   r   r   r   r8   r?   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   res_skip_channelsresnet_in_channelsr]   r0   r1   rR     s   




zCrossAttnUpBlockMotion.__init__r_   res_hidden_states_tuple.r   r`   rd   upsample_sizer   r   rc   re   c
              
   C   s6  |d ur| dd d urtd t| dd o(t| dd o(t| dd o(t| dd }
t| j| j| j}|D ]S\}}}|d }|d d }|
rWt| j	||| j
| j| j| jd\}}tj||gd	d
}t ro| jro| |||}n|||d}||||||ddd }|||	d}q4| jd ur| jD ]}|||d}q|S )Nr   r   s1s2b1b2r   r   r   r   r$   dimr   Fr   r   r   r_   output_size)r   r   r   getattrr   r   r   r   r   r   r   r   r   r   r-   catr   r   r   r   )r\   r_   r   r   r`   rd   r   r   r   rc   is_freeu_enabledr   r   r   r   res_hidden_states	upsamplerr0   r0   r1   rt     sV   





	

zCrossAttnUpBlockMotion.forward)Nr5   r$   r$   rG   r}   r~   r6   Tr$   r   r   TFFFFr}   Nr   r6   r$   )NNNNNNr$   r   r0   r0   r]   r1   r     s    
	

 	
r   c                '       s   e Zd Z													
		d'dedededededB dededededededededededB dededeee B f$ fddZ			d(de	j
d ee	j
d!f d"e	j
dB d#ed$e	j
f
d%d&Z  ZS ))UpBlockMotionNr5   r$   rG   r}   r~   r6   Tr   r   r:   r   r;   r   r   r=   r<   r   r   r   r   r   r   r   r   r   r   r   c                    s  t    g }g }t|tr|f| }nt||kr!td| t|D ]<}||d kr/|n|}|dkr7|n|}|t|| ||||||	|
||d
 |t	|||| ||ddd||| d
 q%t
|| _t
|| _|r}t
t|d	|d
g| _nd | _d| _|| _d S )Nr   r$   r   r   Fr7   r   r   Tr   )rQ   rR   r   ru   r   r   rY   r   r    r2   rS   rX   r   r   r!   r   r   r   )r\   r:   r   r;   r   r   r=   r<   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r0   r1   rR     s^   


zUpBlockMotion.__init__r_   r   .r   rc   re   c              
   O   s"  t |dks|dd d urd}tdd| t| dd o-t| dd o-t| dd o-t| dd }	t| j| j}
|
D ]F\}}|d	 }|d d	 }|	rYt| j||| j	| j
| j| jd
\}}tj||gdd}t rq| jrq| |||}n|||d}|||d}q7| jd ur| jD ]}|||d}q|S )Nr   r   r   r   r   r   r   r   r   r   r$   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r-   r   r   r   r   r   )r\   r_   r   r   r   rc   r   r   r   r   r   r   r   r   r   r0   r0   r1   rt     sB   







zUpBlockMotion.forward)Nr5   r$   rG   r}   r~   r6   Tr   TNr   r6   r$   )NNr$   r   r0   r0   r]   r1   r     s    	

R	r   c                -       s  e Zd Z													
	
	
					d-dededededeee B dededededededededededededededB ded eee B f* fd!d"Z						d.d#e	j
d$e	j
dB d%e	j
dB d&e	j
dB d'eeef dB d(e	j
dB d)ed*e	j
fd+d,Z  ZS )/UNetMidBlockCrossAttnMotionr5   r$   rG   r}   r~   r6   Tr   r   FNr:   r   r=   r<   r   r   r   r   r   r   r8   r   r?   r   r   r   r   r   r   r   r   c                    s  t    d| _|| _|	d ur|	nt|d d}	t|tr#|f| }nt||kr1td| dt|tr<|f| }nt||krJtd| dt	|||||	|||||
d
g}g }g }t
|D ]N}|sz|t||| ||| ||	|||d	 n|t||| |d	||	d
 |t	|||||	|||||
d
 |t||| ||| |	|dd|dd
 qat|| _t|| _t|| _d| _d S )NTrf   r6   zT`transformer_layers_per_block` should be an integer or a list of integers of length .z]`temporal_transformer_layers_per_block` should be an integer or a list of integers of length r   )r:   r<   r?   r>   r   r   r   r$   r   Fr   r7   )
r8   r9   r:   r<   r>   r?   r@   rE   rF   rB   )rQ   rR   r   r8   minr   ru   r   r   r    rY   r   r#   r"   r2   rS   rX   r   r   r   r   )r\   r:   r   r=   r<   r   r   r   r   r   r   r8   r   r?   r   r   r   r   r   r   r   r   r   r   r   r   r]   r0   r1   rR   W  s   






z$UNetMidBlockCrossAttnMotion.__init__r_   r   r`   r   rd   r   rc   re   c              
   C   s   |d ur| dd d urtd | jd ||d}t| j| jdd  | j}|D ]:\}	}
}|	|||||ddd }t rT| j	rT| 
||d d d |d }| 
|
||}q)||d d d |d }|
||d}q)|S )Nr   r   r   r   r$   Fr   )r   r   r   r   r   r   r   r-   r   r   r   )r\   r_   r   r`   r   rd   r   rc   r   r   r   r   r0   r0   r1   rt     s0   

	z#UNetMidBlockCrossAttnMotion.forward)r5   r$   r$   rG   r}   r~   r6   Tr$   r   r   FFFr}   r$   Nr6   r$   )NNNNNr$   r   r0   r0   r]   r1   r   V  s    
	

 	r   c                       sh   e Zd Z								ddeded	eee B d
eee B dededB dededef fddZ  ZS )MotionModulesr   r   FNr7   r6   r:   layers_per_blockr   r8   r@   r?   rB   r>   max_seq_lengthc
                    s   t    tg | _t|tr|f| }nt||kr(td| dt| t	|D ]}
| j
t|||
 ||||||| d|	d
 q,d S )NzZThe number of transformer layers per block must match the number of layers per block, got  and r   )
r:   r<   r>   r?   rB   r@   r8   r9   rE   rF   )rQ   rR   rS   rX   r   r   ru   r   r   rY   r   r2   )r\   r:   r   r   r8   r@   r?   rB   r>   r   r   r]   r0   r1   rR     s8   

zMotionModules.__init__)r   r   r   FNr7   r6   r6   )	r)   r*   r+   ru   r   rw   rx   rR   r{   r0   r0   r]   r1   r     s8    

	
r   c                       s   e Zd Ze										ddeed	f d
eee B deee B eee  B dedeee B deee B dededededB f fddZdd Z  Z	S )MotionAdapteri@  i  r   r   r   r$   r   r6   TNblock_out_channels.motion_layers_per_block#motion_transformer_layers_per_block!motion_mid_block_layers_per_block'motion_transformer_layers_per_mid_blockmotion_num_attention_headsmotion_norm_num_groupsmotion_max_seq_lengthuse_motion_mid_blockconv_in_channelsc                    s*  t    g }g }t|tr|ft| }nt|t|kr,tdt| dt| t|tr8|ft| }t|trC|f| }nt||krVtd| dt| dt|trc|ft| }nt|t|krytdt| dt| |
rtj|
|d dd	d
| _nd| _t	|D ]\}}|| }|
t||ddd|| ||| || d	 q|	rt|d |ddd|d |||d	| _nd| _tt|}|d }tt|}tt|}tt|}t	|D ] \}}|| }|
t||ddd|| ||| d	 || d	 qt|| _t|| _dS )a3  Container to store AnimateDiff Motion Modules

        Args:
            block_out_channels (`tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each UNet block.
            motion_layers_per_block (`int` or `tuple[int]`, *optional*, defaults to 2):
                The number of motion layers per UNet block.
            motion_transformer_layers_per_block (`int`, `tuple[int]`, or `tuple[tuple[int]]`, *optional*, defaults to 1):
                The number of transformer layers to use in each motion layer in each block.
            motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
                The number of motion layers in the middle UNet block.
            motion_transformer_layers_per_mid_block (`int` or `tuple[int]`, *optional*, defaults to 1):
                The number of transformer layers to use in each motion layer in the middle block.
            motion_num_attention_heads (`int` or `tuple[int]`, *optional*, defaults to 8):
                The number of heads to use in each attention layer of the motion module.
            motion_norm_num_groups (`int`, *optional*, defaults to 32):
                The number of groups to use in each group normalization layer of the motion module.
            motion_max_seq_length (`int`, *optional*, defaults to 32):
                The maximum sequence length to use in the motion module.
            use_motion_mid_block (`bool`, *optional*, defaults to True):
                Whether to use a motion module in the middle of the UNet.
        zKThe number of motion layers per block must match the number of blocks, got r   z$The number of layers per mid block (zD) must match the length of motion_transformer_layers_per_mid_block ()zgThe length of the attention head number tuple in the motion module must match the number of block, got r   r   r$   kernel_sizer   Nr7   F)	r:   r>   r?   rB   r@   r8   r   r   r   r   )rQ   rR   r   ru   r   r   rS   Conv2dconv_inr   r   r   	mid_blockr   reversedrX   down_blocks	up_blocks)r\   r   r   r   r   r   r   r   r   r   r  r	  r
  r   rm   output_channelreversed_block_out_channels reversed_motion_layers_per_block,reversed_motion_transformer_layers_per_block#reversed_motion_num_attention_headsr]   r0   r1   rR   %  s   
%





zMotionAdapter.__init__c                 C   s   d S Nr0   )r\   r(   r0   r0   r1   rt     s   zMotionAdapter.forward)
r   r   r$   r$   r$   r   r6   r6   TN)
r)   r*   r+   r   r   ru   rw   rR   rt   r{   r0   r0   r]   r1   r   $  sF    



	
 r   c                C       sB  e Zd ZdZdZdgZe									
	
					
		
			
							
						dfdedB dededee	df dee	df deedf deee B dede
de	dede
dedeee B ee B d eee B ee B dB d!eee B ee B d"eee B ee B dB d#eee B dB d$eee B dB d%ed&eeedf B d'ed(eeedf B d)eeedf B eeedf df B dB d*ed+ed,edB d-e	dB d.e	dB d/edB d0edB d1edB f@ fd2d3Ze		dgd4ed5edB d6efd7d8Zdhd:d;Zd5edB d9dfd<d=Z				did>e	d?ed@edAe	dB dBed9dfdCdDZdjdFedB dGed9dfdHdIZdhdJdKZdhdLdMZdNe
dOe
dPe
dQe
d9df
dRdSZdhdTdUZdVdW ZdXdY ZedZ							dkd[ejd\eje
B eB d]ejd^ejdB d_ejdB dZee	ef dB d`ee	ejf dB daeej dB dbejdB dced9e eej B fdddeZ!  Z"S )lUNetMotionModela=  
    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
    sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    TrU   Nrf   r   r   r   r|   r   r   r   r   r   r   r$   silur6   h㈵>r   Fr   rA   r:   r;   down_block_types.up_block_typesr   r   r   mid_block_scale_factoract_fnr>   norm_epsr?   r   $reverse_transformer_layers_per_blockr   -reverse_temporal_transformer_layers_per_block transformer_layers_per_mid_block)temporal_transformer_layers_per_mid_blockr   r8   r   r   "reverse_motion_num_attention_headsr   mid_block_layersencoder_hid_dimencoder_hid_dim_typeaddition_embed_typeaddition_time_embed_dim%projection_class_embeddings_input_dimtime_cond_proj_dimc!           7         s  t    || _t|t|krtd| d| dt|t|kr.td| d| dt|tsFt|t|krFtd| d| dt|tr^t|t|kr^td| d| dt|tsvt|t|krvtd| d| dt|tr|d u r|D ]}!t|!trtd	qt|tr|d u r|D ]}!t|!trtd
qd}"d}#|"d d }$tj	||d |"|$d| _
|d d }%t|d dd| _|d }&t|&|%|
| d| _|d u rd | _|dkrt|dd| _t||%| _tg | _tg | _t|tr|ft| }t|tr|ft| }t|tr!|gt| }t|tr.|gt| }t|tr;|gt| }t|trH|gt| }t|trU|gt| }t|trb|ft| }|d }'t|D ]\}(})|'}*||( }'|(t|d k}+|)dkrtd4i d|*d|'d|%d||( d||( d|d|
d|d||( d||( d|d |+ d!|d"||( d#|d$||( },n!|)d%krt|*|'|%||( ||
||+ |||( |||( d&},ntd'| j|, qj|d u r t|d( tr|d( nd}|r t|d( |%||
|	|d( |d( |d)|||d( |||d*| _nt|d( |%||
|	|d( |d( |d)|||d+| _d| _tt|}-tt|}.tt|}/tt|}0tt|}1|d u rctt|}|d u rntt|}|-d }'t|D ]\}(}2|(t|d k}+|'}3|-|( }'|-t|(d t|d  }*|+sd}4|  jd7  _nd)}4|2d,krtd4i d|*d|'d-|3d|%d.|(d|/|( d d||( d|d|
d|d|.|( d|0|( d/|4d!|d"|1|( d#|d$||( }5n#|2d0krt |*|3|'|%|(|/|( d ||
||4|1|( |||( d1}5ntd2| j|5 |'}3qv|d ur3tj!|d ||d3| _"t# | _$nd | _"d | _$|#d d }6tj	|d ||#|6d| _%d S )5Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: r   zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: zOMust provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.ziMust provide 'reverse_temporal_transformer_layers_per_block` if using asymmetrical motion module in UNet.r   r$   r   r   r  rf   T)r  cond_proj_dim	text_timer   r:   r;   r   r<   r   r   r   r   r8   r?   r   r   r   r   r   r   r|   )r:   r;   r   r<   r   r   r   r   r   r   r   r   zeInvalid `down_block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`r   F)r:   r   r   r   r   r?   r8   r   r   r   r<   r   r   r   r   )r:   r   r   r   r   r?   r8   r   r   r   r<   r   r   r   r   r   r   )r:   r   r;   r   r   r<   r   r   r   r   r   r   r   z_Invalid `up_block_type` encountered. Must be one of `CrossAttnUpBlockMotion` or `UpBlockMotion`)rI   rH   rJ   r0   )&rQ   rR   rA   r   r   r   ru   r   rS   r  r  r   	time_projr   time_embeddingencoder_hid_projadd_time_projadd_embeddingrX   r	  r
  r   r   r|   r   r   r  r%   num_upsamplersr  r   r   r   rT   conv_norm_outSiLUconv_actconv_out)7r\   rA   r:   r;   r  r  r   r   r   r  r  r>   r  r?   r   r  r   r  r  r  r   r8   r   r   r  r   r   r!  r"  r#  r$  r%  r&  layer_number_per_blockconv_in_kernelconv_out_kernelconv_in_paddingtime_embed_dimtimestep_input_dimr  r   down_block_typeinput_channelis_final_block
down_blockr  reversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dimr  up_block_typer   r   up_blockconv_out_paddingr]   r0   r1   rR     s  
.


	






	




zUNetMotionModel.__init__unetmotion_adapterload_weightsc                    s,  |d u}|ra|j |jd t|jd t|jd krtdt|jd tr5|jd gt|jd  }nt|jd }t|jd trR|jd gt|jd  }nt|jd }||kratdt|j | j	 d< g } d D ]}d	|v r}|
d
 qq|
d qq| d< g }	 d D ]}d	|v r|	
d q|	
d q|	 d< |r|jd  d< |jd  d< |jd  d< |jd  d< |jd  d< |jd  d< |jd  d< |jd r|jd  d<  ds d  d< | | \t fdd D  | j	 d< |  }
|s|
S |rD|jd rD|j|
_tj|jj|jjd d dd d d d d f gdd}|
j||jjd n	|
j|j  |
j|j  |
j|j  td d! |j D ri }|j D ]4\}}|d"rttd#rtnt }| ||< qsttd#rt!nt"}||j#|j$|j%|j&d$||< qs|
j D ]\}}||vr|' ||< q|
(| d%|
j_)|j*|
_*t+|j,D ]9\}}|
j,| j-|j-  t|
j,| d&r|
j,| j.|j.  |
j,| j/r|
j,| j/|j/  qt+|j0D ]9\}}|
j0| j-|j-  t|
j0| d&r4|
j0| j.|j.  |
j0| j1rG|
j0| j1|j1  q|
j2j-|j2j-  |
j2j.|j2j.  |j3d urn|
j3|j3  |j4d ur}|
j4|j4  |
j5|j5  |r|
6| |
 |j7 |
S )'N)devicer  r   z;Incompatible Motion Adapter, got different number of blocksr   r   zEIncompatible Motion Adapter, got different number of layers per block_class_name	CrossAttnr   r|   r  r   r   r   r   r   r   r  r   r   r  r:   r8   r9   c                    s(   i | ]}|v s|v r|  |qS r0   )r   )rL   kconfigexpected_kwargsoptional_kwargsr0   r1   
<dictcomp>E  s   ( z/UNetMotionModel.from_unet2d.<locals>.<dictcomp>rf   r$   r   )weightbiasc                 s   s    | ]
}t |ttfV  qd S r  )r   r   r   rL   procr0   r0   r1   	<genexpr>Z  s
    
z.UNetMotionModel.from_unet2d.<locals>.<genexpr>zattn1.processorscaled_dot_product_attention)hidden_sizer?   r   
num_tokensip_image_projr   )8torF  r   rK  r   r   ru   r   rz   r)   r   r   _get_signature_keysr   from_configr  r-   r   rO  load_state_dictrP  
state_dictr)  r*  anyattn_processorsvaluesitemsendswithhasattrFr   r   r   r   rU  r?   r   rV  r^   set_attn_processorr"  r+  r   r	  r   r   r   r
  r   r  r/  r1  r2  load_motion_modulesdtype)clsrC  rD  rE  has_motion_adapterexpanded_layers_per_block!expanded_adapter_layers_per_blockr	  down_blocks_typer
  modelupdated_conv_in_weight
attn_procsr   	processorattn_processor_classr   r<  rA  r0   rJ  r1   from_unet2d  s   





,



zUNetMotionModel.from_unet2dre   c                 C   s   |   D ]}d|_q| jD ]}|j}|  D ]}d|_qq| jD ]}|j}|  D ]}d|_q)q t| jdrD| jj}|  D ]}d|_q>dS dS )z|Freeze the weights of just the UNet2DConditionModel, and leave the motion modules
        unfrozen for fine tuning.
        FTr   N)
parametersrequires_gradr	  r   r
  rb  r  )r\   paramr<  r   rA  r0   r0   r1   freeze_unet2d_params  s$   

z$UNetMotionModel.freeze_unet2d_paramsc                 C   s   t |jD ]\}}| j| j|j  qt |jD ]\}}| j| j|j  qt| jdr?| jj|jj  d S d S )Nr   )r   r	  r   r[  r\  r
  rb  r  )r\   rD  r   r<  rA  r0   r0   r1   re    s   z#UNetMotionModel.load_motion_modulessave_directoryis_main_processsafe_serializationvariantpush_to_hubc                 K   s   |   }i }| D ]\}	}
d|	v r|
||	< q
t| jd | jd | jd | jd | jd | jd d}|| |jd
|||||d	| d S )Nr   r   r   r>   r   r   r   )r   r   r   r   r   r   )rv  rw  rx  ry  rz  r0   )r\  r`  r   rK  r[  save_pretrained)r\   rv  rw  rx  ry  rz  r   r\  motion_state_dictrI  vadapterr0   r0   r1   save_motion_modules  s0   	

z#UNetMotionModel.save_motion_modulesr   
chunk_sizer   c                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r$   z-Make sure to set `dim` to either 0 or 1, not r$   moduler  r   c                    6   t | dr| j||d |  D ]} ||| qd S Nset_chunk_feed_forward)r  r   rb  r  childrenr  r  r   childfn_recursive_feed_forwardr0   r1   r    
   
zJUNetMotionModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)r   r-   rS   Moduleru   r  )r\   r  r   r  r0   r  r1   enable_forward_chunking  s   z'UNetMotionModel.enable_forward_chunkingc                    s<   dt jjdtdtf fdd |  D ]} |d d qd S )Nr  r  r   c                    r  r  r  r  r  r0   r1   r    r  zKUNetMotionModel.disable_forward_chunking.<locals>.fn_recursive_feed_forwardr   )r-   rS   r  ru   r  )r\   r  r0   r  r1   disable_forward_chunking  s   z(UNetMotionModel.disable_forward_chunkingc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S r  )r^   r   rQ  r0   r0   r1   rS  	      z=UNetMotionModel.set_default_attn_processor.<locals>.<genexpr>c                 s   r  r  )r^   r   rQ  r0   r0   r1   rS    r  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr^  r_  r   r   r   nextiterrd  )r\   ro  r0   r0   r1   set_default_attn_processor  s   z*UNetMotionModel.set_default_attn_processorr   r   r   r   c                 C   sH   t | jD ]\}}t|d| t|d| t|d| t|d| qdS )aF  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        r   r   r   r   N)r   r
  setattr)r\   r   r   r   r   r   upsample_blockr0   r0   r1   enable_freeu  s   zUNetMotionModel.enable_freeuc                 C   sP   h d}t | jD ]\}}|D ]}t||st||ddur$t||d qq	dS )zDisables the FreeU mechanism.>   r   r   r   r   N)r   r
  rb  r   r  )r\   
freeu_keysr   r  rI  r0   r0   r1   disable_freeu.  s   zUNetMotionModel.disable_freeuc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        > [!WARNING] > This API is 🧪 experimental.
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr^  r`  rx   r^   r)   r   modulesr   r   fuse_projectionsrd  r   )r\   rM   attn_processorr  r0   r0   r1   fuse_qkv_projections7  s   
z$UNetMotionModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )un   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is 🧪 experimental.

        N)r  rd  )r\   r0   r0   r1   unfuse_qkv_projectionsM  s   
z&UNetMotionModel.unfuse_qkv_projectionsrd   r(   ra   r`   timestep_condr   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr   c           "         s  d| j   d}d}t fdd|jdd D r td d}|dur3d	||j d
 }|d	}|}t	|sk|j
jdk}|j
jdk}t|trV|sO|rRtjntj}n
|sZ|r]tjntj}tj|g||j
d}nt|jdkrz|d |j
}|jd ||jd }| |}|j| jd}| ||}d}| jjdkrd|vrt| j d|d}d|vrt| j d|d}| | }||jd df}tj||gdd}||j}|  |}|du r|n|| }|j!d|jd  d}| j"dur0| jj#dkr0d|vrt| j d|d}| "|}fdd|D }||f}|$ddd	dd|jd  df|jdd  }| %|}|f}| j&D ]'}t'|drn|j(rn||||||d \}}n	|||d!\}}||7 }qU|durd"}t)||D ]\}}|| }||f7 }q|}| j*durt'| j*d#r| j*|||||d$}n
| j*|||||d%}|	dur||	 }t+| j,D ]R\}} |t| j,d	 k}!|t| j- d }|dt| j-  }|!s|r|d jdd }t'| dr| j(r| |||||||d&}q| ||||d'}q| j.r,| .|}| /|}| 0|}|dddf df|jd	d  $ddd	dd}|
sR|fS t1|d(S ))aG	  
        The [`UNetMotionModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
                through the `self.time_embedding` layer to obtain the timestep embeddings.
            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
                A tuple of tensors that if specified are added to the residuals of down unet blocks.
            mid_block_additional_residual: (`torch.Tensor`, *optional*):
                A tensor that if specified is added to the residual of the middle unet block.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_motion_model.UNetMotionOutput`] instead of a plain
                tuple.

        Returns:
            [`~models.unets.unet_motion_model.UNetMotionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_motion_model.UNetMotionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        r   FNc                 3   s    | ]	}|  d kV  qdS )r   Nr0   )rL   s)default_overall_up_factorr0   r1   rS    s    z*UNetMotionModel.forward.<locals>.<genexpr>z9Forward upsample size to force interpolation output size.Tr$   g     mpsnpu)rf  rF  r   )rf  r(  text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   r   r   r   rW  image_embedsz has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`c                    s&   g | ]}|j  d |jd    dqS )r   r  )repeat_interleaverh   )rL   image_embedr   r0   r1   rP     s    z+UNetMotionModel.forward.<locals>.<listcomp>r   rf   r   )r_   r   r`   r   rc   rd   )r_   r   rc   r0   r   )r`   r   rc   rd   )r`   r   rd   )r_   r   r   r`   r   r   rc   rd   )r_   r   r   r   rc   )r(   )2r.  r]  rh   r   inforX  rf  	unsqueezer-   	is_tensorrF  typer   rv   float32float64int32int64tensorr   expandr)  r*  rK  r#  r   r^   r   r,  flattenri   concatr-  r  r+  r"  rj   r  r	  rb  r   r   r  r   r
  r   r/  r1  r2  r'   )"r\   r(   ra   r`   r  r   rd   r  r  r  r   forward_upsample_sizer   	timestepsis_mpsis_npurf  t_embembaug_embr  r  time_embeds
add_embedsr  down_block_res_samplesdownsample_blockres_samplesnew_down_block_res_samplesdown_block_res_sampledown_block_additional_residualr   r  r;  r0   )r  rc   r1   rt   V  s   
3 















4

	
	


	


6
zUNetMotionModel.forward) Nrf   rf   r  r  r   r   r$   r$   r  r6   r  r   r$   Nr$   NNr$   Fr   r6   r   NTr$   NNNNNN)NT)re   N)TTNF)Nr   )NNNNNNT)#r)   r*   r+   r,    _supports_gradient_checkpointing _skip_layerwise_casting_patternsr   ru   r   rx   rv   rw   rR   classmethodr&   r   rq  ru  re  r  r  r  r  r  r  r  r  r   r-   r.   rz   r   r'   rt   r{   r0   r0   r]   r1   r    sX   



 !"&#$%&'()*+  E 

#


			
r  )Edataclassesr   typingr   r-   torch.nnrS   torch.nn.functional
functionalrc  configuration_utilsr   r   r   loadersr   r	   r
   utilsr   r   r   r   utils.torch_utilsr   	attentionr   r   attention_processorr   r   r   r   r   r   r   r   r   
embeddingsr   r   modeling_utilsr   r   r   r    r!    transformers.dual_transformer_2dr"   transformers.transformer_2dr#   unet_2d_blocksr%   unet_2d_conditionr&   
get_loggerr)   r   r'   r  r2   r|   r   r   r   r   r   r   r  r0   r0   r0   r1   <module>   sJ   ,
   : A  &) 
