o
    pi$                    @   s  d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlm
  mZ d dlZddlmZmZmZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ d	d
lmZ d	dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d	dl+m,Z,m-Z- d	dl.m/Z/ d	dl0m1Z1m2Z2m3Z3 d	dl4m5Z5 d	dl6m7Z7 ddl8m9Z9 ddl:m;Z; e<e=Z>eG dd deZ?G dd de
j@ZAG dd de
j@ZBG dd de
j@ZCG dd de
j@ZDG dd de
j@ZEG d d! d!e
j@ZFG d"d# d#e
j@ZGG d$d% d%e/eeZHG d&d' d'e/eeeZIdS )(    )	dataclass)AnyDictOptionalTupleUnionN   )ConfigMixin
FrozenDictregister_to_config)FromOriginalModelMixinPeftAdapterMixinUNet2DConditionLoadersMixin)
BaseOutput	deprecateis_torch_versionlogging)apply_freeu   BasicTransformerBlock)
ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttentionProcessorAttnAddedKVProcessorAttnProcessorAttnProcessor2_0FusedAttnProcessor2_0IPAdapterAttnProcessorIPAdapterAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)Downsample2DResnetBlock2D
Upsample2D)DualTransformer2DModel)Transformer2DModel   )UNetMidBlock2DCrossAttn)UNet2DConditionModelc                   @   s   e Zd ZU dZejed< dS )UNetMotionOutputa  
    The output of [`UNetMotionOutput`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    sampleN)__name__
__module____qualname____doc__torchTensor__annotations__ r5   r5   f/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/unets/unet_motion_model.pyr,   4   s   
 r,   c                        s   e Zd ZdZ													
	
		d%dededee dee dedededee dedee dedededee dee f fddZ						d&de
jdee
j dee
j dee
j d ed!eeeef  d"e
jfd#d$Z  ZS )'AnimateDiffTransformer3Das  
    A Transformer model for video-like data.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlock` attention should contain a bias parameter.
        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
            This is fixed during training since it is used to learn a number of position embeddings.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
            activation functions.
        norm_elementwise_affine (`bool`, *optional*):
            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
        double_self_attention (`bool`, *optional*):
            Configure if each `TransformerBlock` should contain two self-attention layers.
        positional_embeddings: (`str`, *optional*):
            The type of positional embeddings to apply to the sequence input before passing use.
        num_positional_embeddings: (`int`, *optional*):
            The maximum length of the sequence over which to apply positional embeddings.
       X   Nr)               FgegluTnum_attention_headsattention_head_dimin_channelsout_channels
num_layersdropoutnorm_num_groupscross_attention_dimattention_biassample_sizeactivation_fnnorm_elementwise_affinedouble_self_attentionpositional_embeddingsnum_positional_embeddingsc                    s   t    | _| _ || _tjj||ddd| _t	|| _
t 	
fddt|D | _t	|| _d S )Nư>T)
num_groupsnum_channelsepsaffinec                    s*   g | ]}t  
	d qS ))rB   rD   rG   rE   rI   rH   rJ   rK   r   ).0_rG   rE   r>   rD   rI   rB   	inner_dimrH   r=   rK   rJ   r5   r6   
<listcomp>|   s     z5AnimateDiffTransformer3D.__init__.<locals>.<listcomp>)super__init__r=   r>   r?   r2   nn	GroupNormnormLinearproj_in
ModuleListrangetransformer_blocksproj_out)selfr=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   	__class__rS   r6   rW   ^   s   
z!AnimateDiffTransformer3D.__init__hidden_statesencoder_hidden_statestimestepclass_labels
num_framescross_attention_kwargsreturnc                 C   s   |j \}}}	}
|| }|}|dddf ||||	|
}|ddddd}| |}|ddddd||	 |
 ||}| |}| jD ]}||||||d}qD| |}|ddddf ||	|
||ddddd }||||	|
}|| }|S )a0  
        The [`AnimateDiffTransformer3D`] forward method.

        Args:
            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
                Input hidden_states.
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.LongTensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            num_frames (`int`, *optional*, defaults to 1):
                The number of frames to be processed per batch. This is used to reshape the hidden states.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Returns:
            torch.Tensor:
                The output tensor.
        Nr   r   r)   r      )re   rf   ri   rg   )shapereshapepermuterZ   r\   r_   r`   
contiguous)ra   rd   re   rf   rg   rh   ri   batch_frameschannelheightwidth
batch_sizeresidualblockoutputr5   r5   r6   forward   s2   #
$


	z AnimateDiffTransformer3D.forward)r8   r9   NNr)   r:   r;   NFNr<   TTNN)NNNr)   N)r.   r/   r0   r1   intr   floatboolstrrW   r2   r3   
LongTensorr   r   rx   __classcell__r5   r5   rb   r6   r7   A   s    	
5r7   c                &       s   e Zd Z																d%d
edededededededededededededeeee f de	e dedeeee f def$ fddZ
			d&dejde	ej d ed!eejeejd"f f fd#d$Z  ZS )'DownBlockMotionr:   r)   rL   defaultswishr;   T      ?Nr?   r@   temb_channelsrB   rA   
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factoradd_downsampledownsample_paddingtemporal_num_attention_headstemporal_cross_attention_dimtemporal_max_seq_length%temporal_transformer_layers_per_blocktemporal_double_self_attentionc                    s0  t    g }g }t|tr|f| }nt||kr!td| t|tr,|f| }nt||kr9td| t|D ]5}|dkrE|n|}|t|||||	|||||
d
 |t	|| ||| |	|ddd||||  |d q=t
|| _t
|| _|rt
t|d	||d
dg| _nd | _d| _d S )Nz\`temporal_transformer_layers_per_block` must be an integer or a tuple of integers of length zS`temporal_num_attention_heads` must be an integer or a tuple of integers of length r   
r?   r@   r   rO   groupsrB   time_embedding_normnon_linearityr   pre_normFr<   
sinusoidalr=   r?   rA   rC   rD   rE   rG   rJ   rK   r>   rI   Topuse_convr@   paddingname)rV   rW   
isinstancery   len
ValueErrorr^   appendr%   r7   rX   r]   resnetsmotion_modulesr$   downsamplersgradient_checkpointing)ra   r?   r@   r   rB   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   irb   r5   r6   rW      sz   




zDownBlockMotion.__init__rd   tembrh   rj   .c                 O   s   t |dks|dd d urd}tdd| d}t| j| j}|D ]=\}	}
| jrN| jrNdd }tdd	rBt	j
jj||	||d
d}nt	j
j||	||}n|	||}|
||d}||f }q!| jd urs| jD ]}||}qg||f }||fS )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0r5   c                        fdd}|S )Nc                         |  S Nr5   inputsmoduler5   r6   custom_forwardI     zNDownBlockMotion.forward.<locals>.create_custom_forward.<locals>.custom_forwardr5   r   r   r5   r   r6   create_custom_forwardH     z6DownBlockMotion.forward.<locals>.create_custom_forward>=1.11.0Fuse_reentrantrh   )r   getr   zipr   r   trainingr   r   r2   utils
checkpointr   )ra   rd   r   rh   argskwargsdeprecation_messageoutput_statesblocksresnetmotion_moduler   downsamplerr5   r5   r6   rx   6  s4   






zDownBlockMotion.forward)r:   r)   rL   r   r   r;   Tr   Tr)   r)   Nr;   r)   T)Nr)   )r.   r/   r0   ry   rz   r|   r{   r   r   r   rW   r2   r3   rx   r~   r5   r5   rb   r6   r      s    	
`r   c                6       s8  e Zd Z															
	
	
	
						d3dedededededeeee f dededededededededededededed ed!ed"e	e d#ed$ed%eeee f d&ef4 fd'd(Z
							d4d)ejd*e	ej d+e	ej d,e	ej d-ed.e	ej d/e	eeef  d0e	ej fd1d2Z  ZS )5CrossAttnDownBlockMotionr:   r)   rL   r   r   r;   T   r   FN   r?   r@   r   rB   rA   transformer_layers_per_blockr   r   r   r   r   r=   rD   r   r   r   dual_cross_attentionuse_linear_projectiononly_cross_attentionupcast_attentionattention_typer   r   r   r   r   c                    s  t    g }g }g }d| _|| _t|tr|f| }nt||kr)td| t|tr4|f| }nt||krAtd| t|D ]X}|dkrM|n|}|	t
|||||
|||	||d
 |sx|	t||| ||| ||
||||d
 n|	t||| |d||
d |	t|||| |
|d	d
d||| |d qEt|| _t|| _t|| _|rtt|d||ddg| _nd | _d	| _d S )NTPtransformer_layers_per_block must be an integer or a list of integers of length Ytemporal_transformer_layers_per_block must be an integer or a list of integers of length r   r   r?   rA   rD   rC   r   r   r   r   r)   r?   rA   rD   rC   Fr<   r   r   r   r   )rV   rW   has_cross_attentionr=   r   ry   r   r   r^   r   r%   r(   r'   r7   rX   r]   
attentionsr   r   r$   r   r   )ra   r?   r@   r   rB   rA   r   r   r   r   r   r   r=   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rb   r5   r6   rW   k  s   



z!CrossAttnDownBlockMotion.__init__rd   r   re   attention_maskrh   encoder_attention_maskri   additional_residualsc	              	   C   s@  |d ur| dd d urtd d}	tt| j| j| j}
t|
D ]e\}\}}}| j	r[| j
r[ddd}tddr=dd	ini }tjjj||||fi |}||||||d	d
d }n|||}||||||d	d
d }|||d}|t|
d kr|d ur|| }|	|f }	q"| jd ur| jD ]}||}q|	|f }	||	fS )Nr   SPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r5   c                        fdd}|S )Nc                        d ur | diS  |  S Nreturn_dictr5   r   r   r   r5   r6   r        zWCrossAttnDownBlockMotion.forward.<locals>.create_custom_forward.<locals>.custom_forwardr5   r   r   r   r5   r   r6   r        z?CrossAttnDownBlockMotion.forward.<locals>.create_custom_forwardr   r   r   Fre   ri   r   r   r   r   r   r)   r   )r   loggerwarninglistr   r   r   r   	enumerater   r   r   r2   r   r   r   r   )ra   rd   r   re   r   rh   r   ri   r   r   r   r   r   attnr   r   ckpt_kwargsr   r5   r5   r6   rx     sb   

	
	



z CrossAttnDownBlockMotion.forward)r:   r)   r)   rL   r   r   r;   Tr)   r   r   r)   TFFFFr   Nr   r;   r)   T)NNNr)   NNNr.   r/   r0   ry   rz   r   r   r|   r{   r   rW   r2   r3   r   r   rx   r~   r5   r5   rb   r6   r   j  s    	
 
	r   c                8       sL  e Zd Z													
										d6dededededee dededeeee f dedededede	dededede	de	de	d e	d!e	d"ed#ee d$ed%ed&eeee f f4 fd'd(Z
							d7d)ejd*eejd+f d,eej d-eej d.eeeef  d/ee d0eej d1eej d2ed3ejfd4d5Z  ZS )8CrossAttnUpBlockMotionNr:   r)   rL   r   r   r;   Tr   r   Fr   r?   r@   prev_output_channelr   resolution_idxrB   rA   r   r   r   r   r   r   r=   rD   r   add_upsampler   r   r   r   r   r   r   r   r   c           !         s  t    g }g }g }d| _|| _t|tr|f| }nt||kr.td| dt| t|tr9|f| }nt||krKtd| dt| t|D ]c}||d krY|n|}|dkra|n|} |	t
| | |||	|||
|||d
 |s|	t||| ||| ||||||d
 n|	t||| |d||d	 |	t|||| ||d
dd||| d
 qOt|| _t|| _t|| _|rtt|d|dg| _nd | _d
| _|| _d S )NTr   z, got r   r)   r   r   r   r   Fr<   r   
r=   r?   rA   rC   rD   rE   rG   rJ   rK   r>   r   r@   )rV   rW   r   r=   r   ry   r   r   r^   r   r%   r(   r'   r7   rX   r]   r   r   r   r&   
upsamplersr   r   )!ra   r?   r@   r   r   r   rB   rA   r   r   r   r   r   r   r=   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   res_skip_channelsresnet_in_channelsrb   r5   r6   rW   >  s   




zCrossAttnUpBlockMotion.__init__rd   res_hidden_states_tuple.r   re   ri   upsample_sizer   r   rh   rj   c
              
   C   sx  |d ur| dd d urtd t| dd o(t| dd o(t| dd o(t| dd }
t| j| j| j}|D ]u\}}}|d }|d d }|
rWt| j	||| j
| j| j| jd\}}tj||gd	d
}| jr| jrddd}tddrtddini }tjjj||||fi |}||||||ddd }n|||}||||||ddd }|||	d}q4| jd ur| jD ]}|||}q|S )Nr   r   s1s2b1b2r   r   r   r   r)   dimc                    r   )Nc                     r   r   r5   r   r   r5   r6   r     r   zUCrossAttnUpBlockMotion.forward.<locals>.create_custom_forward.<locals>.custom_forwardr5   r   r5   r   r6   r     r   z=CrossAttnUpBlockMotion.forward.<locals>.create_custom_forwardr   r   r   Fr   r   r   r   )r   r   r   getattrr   r   r   r   r   r   r   r   r   r   r2   catr   r   r   r   r   r   )ra   rd   r   r   re   ri   r   r   r   rh   is_freeu_enabledr   r   r   r   res_hidden_statesr   r   	upsamplerr5   r5   r6   rx     s   






	
	

zCrossAttnUpBlockMotion.forward)Nr:   r)   r)   rL   r   r   r;   Tr)   r   r   TFFFFr   Nr   r;   r)   )NNNNNNr)   )r.   r/   r0   ry   r   rz   r   r   r|   r{   rW   r2   r3   r   r   rx   r~   r5   r5   rb   r6   r   =  s    	
 	
r   c                (       s   e Zd Z													
		d'dededededee dedededededededededee dededeee	e f f$ fddZ
			d(dejd e	ejd!f d"eej d#ed$ejf
d%d&Z  ZS ))UpBlockMotionNr:   r)   rL   r   r   r;   Tr   r   r?   r   r@   r   r   rB   rA   r   r   r   r   r   r   r   r   r   r   r   c                    s  t    g }g }t|tr|f| }nt||kr!td| t|D ]<}||d kr/|n|}|dkr7|n|}|t|| ||||||	|
||d
 |t	|||| ||ddd||| d
 q%t
|| _t
|| _|r}t
t|d	|d
g| _nd | _d| _|| _d S )Nr   r)   r   r   Fr<   r   r   Tr   )rV   rW   r   ry   r   r   r^   r   r%   r7   rX   r]   r   r   r&   r   r   r   )ra   r?   r   r@   r   r   rB   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rb   r5   r6   rW     s^   


zUpBlockMotion.__init__rd   r   .r   rh   rj   c              
   O   sR  t |dks|dd d urd}tdd| t| dd o-t| dd o-t| dd o-t| dd }	t| j| j}
|
D ]_\}}|d	 }|d d	 }|	rYt| j||| j	| j
| j| jd
\}}tj||gdd}| jr| jrdd }tddrtjjj||||dd}ntjj||||}n|||}|||d}q7| jd ur| jD ]}|||}q|S )Nr   r   r   r   r   r   r   r   r   r   r)   r   c                    r   )Nc                     r   r   r5   r   r   r5   r6   r     r   zLUpBlockMotion.forward.<locals>.create_custom_forward.<locals>.custom_forwardr5   r   r5   r   r6   r     r   z4UpBlockMotion.forward.<locals>.create_custom_forwardr   r   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   r   r   r   r   r   r   )ra   rd   r   r   r   rh   r   r   r   r   r   r   r   r  r   r  r5   r5   r6   rx   f  sV   










zUpBlockMotion.forward)Nr:   r)   rL   r   r   r;   Tr   TNr   r;   r)   )NNr)   )r.   r/   r0   ry   r   rz   r|   r{   r   r   rW   r2   r3   rx   r~   r5   r5   rb   r6   r    s    	
R	r  c                .       s  e Zd Z													
	
	
					d-dededededeeee f dededededededededededededede	e ded eeee f f* fd!d"Z
						d.d#ejd$e	ej d%e	ej d&e	ej d'e	eeef  d(e	ej d)ed*ejfd+d,Z  ZS )/UNetMidBlockCrossAttnMotionr:   r)   rL   r   r   r;   Tr   r   FNr?   r   rB   rA   r   r   r   r   r   r   r=   r   rD   r   r   r   r   r   r   r   r   c                    s  t    d| _|| _|	d ur|	nt|d d}	t|tr#|f| }nt||kr1td| dt|tr<|f| }nt||krJtd| dt	|||||	|||||
d
g}g }g }t
|D ]N}|sz|t||| ||| ||	|||d	 n|t||| |d	||	d
 |t	|||||	|||||
d
 |t||| ||| |	|dd|dd
 qat|| _t|| _t|| _d| _d S )NTrk   r;   zT`transformer_layers_per_block` should be an integer or a list of integers of length .z]`temporal_transformer_layers_per_block` should be an integer or a list of integers of length r   )r?   rA   rD   rC   r   r   r   r)   r   Fr   r<   )
r=   r>   r?   rA   rC   rD   rE   rJ   rK   rG   )rV   rW   r   r=   minr   ry   r   r   r%   r^   r   r(   r'   r7   rX   r]   r   r   r   r   )ra   r?   r   rB   rA   r   r   r   r   r   r   r=   r   rD   r   r   r   r   r   r   r   r   r   r   r   r   rb   r5   r6   rW     s   






z$UNetMidBlockCrossAttnMotion.__init__rd   r   re   r   ri   r   rh   rj   c              	   C   s  |d ur| dd d urtd | jd ||}t| j| jdd  | j}|D ]]\}	}
}| jrn| jrnddd}t	ddrAd	d
ini }|	|||||d
dd }t
jjj||||fi |}t
jjj||
||fi |}q(|	|||||d
dd }|||d}|
||}q(|S )Nr   r   r   r)   c                    r   )Nc                     r   r   r5   r   r   r5   r6   r   B  r   zZUNetMidBlockCrossAttnMotion.forward.<locals>.create_custom_forward.<locals>.custom_forwardr5   r   r5   r   r6   r   A  r   zBUNetMidBlockCrossAttnMotion.forward.<locals>.create_custom_forwardr   r   r   Fr   r   r   )r   r   r   r   r   r   r   r   r   r   r2   r   r   )ra   rd   r   re   r   ri   r   rh   r   r   r   r   r   r   r5   r5   r6   rx   -  sb   


	z#UNetMidBlockCrossAttnMotion.forward)r:   r)   r)   rL   r   r   r;   Tr)   r   r   FFFr   r)   Nr;   r)   )NNNNNr)   r   r5   r5   rb   r6   r    s    	
 	r  c                       sp   e Zd Z								ddeded	eeee f d
eeee f dedee dededef fddZ	  Z
S )MotionModulesr   r   FNr<   r;   r?   layers_per_blockr   r=   rE   rD   rG   rC   max_seq_lengthc
                    s   t    tg | _t|tr|f| }nt||kr(td| dt| t	|D ]}
| j
t|||
 ||||||| d|	d
 q,d S )NzZThe number of transformer layers per block must match the number of layers per block, got  and r   )
r?   rA   rC   rD   rG   rE   r=   r>   rJ   rK   )rV   rW   rX   r]   r   r   ry   r   r   r^   r   r7   )ra   r?   r  r   r=   rE   rD   rG   rC   r	  r   rb   r5   r6   rW   r  s8   

zMotionModules.__init__)r   r   r   FNr<   r;   r;   )r.   r/   r0   ry   r   r   r{   r   r|   rW   r~   r5   r5   rb   r6   r  q  s8    	
r  c                       s   e Zd Ze										ddeed	f d
eeee f deeee eee  f dedeeee f deeee f dedededee f fddZ	dd Z
  ZS )MotionAdapteri@  i  r   r   r   r)   r   r;   TNblock_out_channels.motion_layers_per_block#motion_transformer_layers_per_block!motion_mid_block_layers_per_block'motion_transformer_layers_per_mid_blockmotion_num_attention_headsmotion_norm_num_groupsmotion_max_seq_lengthuse_motion_mid_blockconv_in_channelsc                    s*  t    g }g }t|tr|ft| }nt|t|kr,tdt| dt| t|tr8|ft| }t|trC|f| }nt||krVtd| dt| dt|trc|ft| }nt|t|krytdt| dt| |
rtj|
|d dd	d
| _nd| _t	|D ]\}}|| }|
t||ddd|| ||| || d	 q|	rt|d |ddd|d |||d	| _nd| _tt|}|d }tt|}tt|}tt|}t	|D ] \}}|| }|
t||ddd|| ||| d	 || d	 qt|| _t|| _dS )a3  Container to store AnimateDiff Motion Modules

        Args:
            block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each UNet block.
            motion_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 2):
                The number of motion layers per UNet block.
            motion_transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple[int]]`, *optional*, defaults to 1):
                The number of transformer layers to use in each motion layer in each block.
            motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
                The number of motion layers in the middle UNet block.
            motion_transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
                The number of transformer layers to use in each motion layer in the middle block.
            motion_num_attention_heads (`int` or `Tuple[int]`, *optional*, defaults to 8):
                The number of heads to use in each attention layer of the motion module.
            motion_norm_num_groups (`int`, *optional*, defaults to 32):
                The number of groups to use in each group normalization layer of the motion module.
            motion_max_seq_length (`int`, *optional*, defaults to 32):
                The maximum sequence length to use in the motion module.
            use_motion_mid_block (`bool`, *optional*, defaults to True):
                Whether to use a motion module in the middle of the UNet.
        zKThe number of motion layers per block must match the number of blocks, got r
  z$The number of layers per mid block (zD) must match the length of motion_transformer_layers_per_mid_block ()zgThe length of the attention head number tuple in the motion module must match the number of block, got r   r   r)   kernel_sizer   Nr<   F)	r?   rC   rD   rG   rE   r=   r	  r  r   r   )rV   rW   r   ry   r   r   rX   Conv2dconv_inr   r   r  	mid_blockr   reversedr]   down_blocks	up_blocks)ra   r  r  r  r  r  r  r  r  r  r  r  r  r   rq   output_channelreversed_block_out_channels reversed_motion_layers_per_block,reversed_motion_transformer_layers_per_block#reversed_motion_num_attention_headsrb   r5   r6   rW     s   
%





zMotionAdapter.__init__c                 C   s   d S r   r5   )ra   r-   r5   r5   r6   rx     s   zMotionAdapter.forward)
r  r   r)   r)   r)   r   r;   r;   TN)r.   r/   r0   r   r   ry   r   r{   r   rW   rx   r~   r5   r5   rb   r6   r    sF    
	
 r  c                C       s  e Zd ZdZdZe												
																										dmdee dededee	df dee	df deedf de
eee f dedede	dededede
eee ee f dee
eee ee f  d e
eee ee f d!ee
eee ee f  d"ee
eee f  d#ee
eee f  d$ed%e
eeedf f d&ed'e
eeedf f d(ee
eeedf eeedf df f  d)ed*ed+ee d,ee	 d-ee	 d.ee d/ee d0ee f@ fd1d2Ze		dnd3ed4ee d5efd6d7Zdod9d:Zd4ee d8dfd;d<Z				dpd=e	d>ed?ed@ee	 dAed8dfdBdCZed8ee	ef fdDdEZdFe
eee	ef f fdGdHZdqdJee dKed8dfdLdMZdodNdOZdodPdQZdrdRed8dfdSdTZdUedVedWedXed8df
dYdZZdod[d\Zd]d^ Z d_d` Z!							dsdae"j#dbe
e"j#eef dce"j#ddee"j# deee"j# dfeee	e$f  dgeee	e"j#f  dheee"j#  diee"j# djed8e
e%ee"j# f fdkdlZ&  Z'S )tUNetMotionModela=  
    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
    sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).
    TNrk   r   r   r   r   r  r   r   r   r  r   r)   silur;   h㈵>r   Fr   rF   r?   r@   down_block_types.up_block_typesr  r  r   mid_block_scale_factoract_fnrC   norm_epsrD   r   $reverse_transformer_layers_per_blockr   -reverse_temporal_transformer_layers_per_block transformer_layers_per_mid_block)temporal_transformer_layers_per_mid_blockr   r=   r  r  "reverse_motion_num_attention_headsr  mid_block_layersencoder_hid_dimencoder_hid_dim_typeaddition_embed_typeaddition_time_embed_dim%projection_class_embeddings_input_dimtime_cond_proj_dimc!           7         s  t    || _t|t|krtd| d| dt|t|kr.td| d| dt|tsFt|t|krFtd| d| dt|tr^t|t|kr^td| d| dt|tsvt|t|krvtd| d| dt|tr|d u r|D ]}!t|!trtd	qt|tr|d u r|D ]}!t|!trtd
qd}"d}#|"d d }$tj	||d |"|$d| _
|d d }%t|d dd| _|d }&t|&|%|
| d| _|d u rd | _|dkrt|dd| _t||%| _tg | _tg | _t|tr|ft| }t|tr|ft| }t|tr!|gt| }t|tr.|gt| }t|tr;|gt| }t|trH|gt| }t|trU|gt| }t|trb|ft| }|d }'t|D ]\}(})|'}*||( }'|(t|d k}+|)dkrtd4i d|*d|'d|%d||( d||( d|d|
d|d||( d||( d|d |+ d!|d"||( d#|d$||( },n!|)d%krt|*|'|%||( ||
||+ |||( |||( d&},ntd'| j|, qj|d u r t|d( tr|d( nd}|r t|d( |%||
|	|d( |d( |d)|||d( |||d*| _nt|d( |%||
|	|d( |d( |d)|||d+| _d| _tt|}-tt|}.tt|}/tt|}0tt|}1|d u rctt|}|d u rntt|}|-d }'t|D ]\}(}2|(t|d k}+|'}3|-|( }'|-t|(d t|d  }*|+sd}4|  jd7  _nd)}4|2d,krtd4i d|*d|'d-|3d|%d.|(d|/|( d d||( d|d|
d|d|.|( d|0|( d/|4d!|d"|1|( d#|d$||( }5n#|2d0krt |*|3|'|%|(|/|( d ||
||4|1|( |||( d1}5ntd2| j|5 |'}3qv|d ur3tj!|d ||d3| _"t# | _$nd | _"d | _$|#d d }6tj	|d ||#|6d| _%d S )5Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: r  zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: zOMust provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.ziMust provide 'reverse_temporal_transformer_layers_per_block` if using asymmetrical motion module in UNet.r   r)   r   r   r  rk   T)r-  cond_proj_dim	text_timer   r?   r@   r   rA   r   r   r   r   r=   rD   r   r   r   r   r   r   r   )r?   r@   r   rA   r   r   r   r   r   r   r   r   zeInvalid `down_block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`r   F)r?   r   r   r   r   rD   r=   r   r   r   rA   r   r   r   r   )r?   r   r   r   r   rD   r=   r   r   r   rA   r   r   r   r   r   r  )r?   r   r@   r   r   rA   r   r   r   r   r   r   r   z_Invalid `up_block_type` encountered. Must be one of `CrossAttnUpBlockMotion` or `UpBlockMotion`)rN   rM   rO   r5   )&rV   rW   rF   r   r   r   ry   r   rX   r  r  r"   	time_projr!   time_embeddingencoder_hid_projadd_time_projadd_embeddingr]   r  r  r   r   r   r   r  r  r*   num_upsamplersr  r  r   r  rY   conv_norm_outSiLUconv_actconv_out)7ra   rF   r?   r@   r*  r+  r  r  r   r,  r-  rC   r.  rD   r   r/  r   r0  r1  r2  r   r=   r  r  r3  r  r4  r5  r6  r7  r8  r9  r:  layer_number_per_blockconv_in_kernelconv_out_kernelconv_in_paddingtime_embed_dimtimestep_input_dimr   r   down_block_typeinput_channelis_final_block
down_blockr!  reversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dimr$  up_block_typer   r   up_blockconv_out_paddingrb   r5   r6   rW   .  s  
.


	






	




zUNetMotionModel.__init__unetmotion_adapterload_weightsc                    s,  |d u}|ra|j |jd t|jd t|jd krtdt|jd tr5|jd gt|jd  }nt|jd }t|jd trR|jd gt|jd  }nt|jd }||kratdt|j | j	 d< g } d D ]}d	|v r}|
d
 qq|
d qq| d< g }	 d D ]}d	|v r|	
d q|	
d q|	 d< |r|jd  d< |jd  d< |jd  d< |jd  d< |jd  d< |jd  d< |jd  d< |jd r|jd  d<  ds d  d< | | \t fdd D  | j	 d< |  }
|s|
S |rD|jd rD|j|
_tj|jj|jjd d dd d d d d f gdd}|
j||jjd n	|
j|j  |
j|j  |
j|j  td d! |j D ri }|j D ]4\}}|d"rttd#rtnt }| ||< qsttd#rt!nt"}||j#|j$|j%|j&d$||< qs|
j D ]\}}||vr|' ||< q|
(| d%|
j_)|j*|
_*t+|j,D ]9\}}|
j,| j-|j-  t|
j,| d&r|
j,| j.|j.  |
j,| j/r|
j,| j/|j/  qt+|j0D ]9\}}|
j0| j-|j-  t|
j0| d&r4|
j0| j.|j.  |
j0| j1rG|
j0| j1|j1  q|
j2j-|j2j-  |
j2j.|j2j.  |j3d urn|
j3|j3  |j4d ur}|
j4|j4  |
j5|j5  |r|
6| |
 |j7 |
S )'N)devicer*  r  z;Incompatible Motion Adapter, got different number of blocksr  r  zEIncompatible Motion Adapter, got different number of layers per block_class_name	CrossAttnr   r   r+  r   r  r  r  r  r  r2  r  r   r  r?   r=   r>   c                    s(   i | ]}|v s|v r|  |qS r5   )r   )rQ   kconfigexpected_kwargsoptional_kwargsr5   r6   
<dictcomp>  s   ( z/UNetMotionModel.from_unet2d.<locals>.<dictcomp>rk   r)   r   )weightbiasc                 s   s    | ]
}t |ttfV  qd S r   )r   r   r    rQ   procr5   r5   r6   	<genexpr>  s
    
z.UNetMotionModel.from_unet2d.<locals>.<genexpr>zattn1.processorscaled_dot_product_attention)hidden_sizerD   r   
num_tokensip_image_projr   )8torZ  r   r_  r   r   ry   r   dictr.   r   r   _get_signature_keysr
   from_configr  r2   r   rc  load_state_dictrd  
state_dictr=  r>  anyattn_processorsvaluesitemsendswithhasattrFr   r   r    r   ri  rD   r   rj  rc   set_attn_processorr6  r?  r   r  r   r   r   r  r   r  rC  rE  rF  load_motion_modulesdtype)clsrW  rX  rY  has_motion_adapterexpanded_layers_per_block!expanded_adapter_layers_per_blockr  down_blocks_typer  modelupdated_conv_in_weight
attn_procsr   	processorattn_processor_classr   rP  rU  r5   r^  r6   from_unet2dr  s   





,



zUNetMotionModel.from_unet2drj   c                 C   s   |   D ]}d|_q| jD ]}|j}|  D ]}d|_qq| jD ]}|j}|  D ]}d|_q)q t| jdrD| jj}|  D ]}d|_q>dS dS )z|Freeze the weights of just the UNet2DConditionModel, and leave the motion modules
        unfrozen for fine tuning.
        FTr   N)
parametersrequires_gradr  r   r  rw  r  )ra   paramrP  r   rU  r5   r5   r6   freeze_unet2d_params  s$   

z$UNetMotionModel.freeze_unet2d_paramsc                 C   s   t |jD ]\}}| j| j|j  qt |jD ]\}}| j| j|j  qt| jdr?| jj|jj  d S d S )Nr   )r   r  r   rp  rq  r  rw  r  )ra   rX  r   rP  rU  r5   r5   r6   rz  $  s   z#UNetMotionModel.load_motion_modulessave_directoryis_main_processsafe_serializationvariantpush_to_hubc                 K   s   |   }i }| D ]\}	}
d|	v r|
||	< q
t| jd | jd | jd | jd | jd | jd d}|| |jd
|||||d	| d S )Nr   r  r  rC   r  r  r  )r  r  r  r  r  r  )r  r  r  r  r  r5   )rq  ru  r  r_  rp  save_pretrained)ra   r  r  r  r  r  r   rq  motion_state_dictr]  vadapterr5   r5   r6   save_motion_modules.  s0   	

z#UNetMotionModel.save_motion_modulesc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        r   r   
processorsc                    sH   t |dr| ||  d< | D ]\}} |  d| || q|S )Nget_processor
.processorr  )rw  r  named_children)r   r   r  sub_namechildfn_recursive_add_processorsr5   r6   r  \  s
   
zDUNetMotionModel.attn_processors.<locals>.fn_recursive_add_processors)r|   r2   rX   Moduler   r   r  )ra   r  r   r   r5   r  r6   rs  Q  s
   	&	zUNetMotionModel.attn_processorsr  c                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorr  r  )rw  r   rm  r  popr  )r   r   r  r  r  fn_recursive_attn_processorr5   r6   r    s   

zGUNetMotionModel.set_attn_processor.<locals>.fn_recursive_attn_processorN)r   rs  keysr   rm  r   r|   r2   rX   r  r  )ra   r  countr   r   r5   r  r6   ry  k  s   
z"UNetMotionModel.set_attn_processorr   
chunk_sizer   c                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r)   z-Make sure to set `dim` to either 0 or 1, not r)   r   r  r   c                    6   t | dr| j||d |  D ]} ||| qd S Nset_chunk_feed_forward)r  r   rw  r  childrenr   r  r   r  fn_recursive_feed_forwardr5   r6   r    
   
zJUNetMotionModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)r   r2   rX   r  ry   r  )ra   r  r   r   r5   r  r6   enable_forward_chunking  s   z'UNetMotionModel.enable_forward_chunkingc                    s<   dt jjdtdtf fdd |  D ]} |d d qd S )Nr   r  r   c                    r  r  r  r  r  r5   r6   r    r  zKUNetMotionModel.disable_forward_chunking.<locals>.fn_recursive_feed_forwardr   )r2   rX   r  ry   r  )ra   r   r5   r  r6   disable_forward_chunking  s   z(UNetMotionModel.disable_forward_chunkingc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S r   )rc   r   re  r5   r5   r6   rg        z=UNetMotionModel.set_default_attn_processor.<locals>.<genexpr>c                 s   r  r   )rc   r   re  r5   r5   r6   rg    r  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allrs  rt  r   r   r   nextiterry  )ra   r  r5   r5   r6   set_default_attn_processor  s   z*UNetMotionModel.set_default_attn_processorvaluec                 C   s    t |ttttfr||_d S d S r   )r   r   r   r   r  r   )ra   r   r  r5   r5   r6   _set_gradient_checkpointing  s   
z+UNetMotionModel._set_gradient_checkpointingr   r   r   r   c                 C   sH   t | jD ]\}}t|d| t|d| t|d| t|d| qdS )a>  Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        r   r   r   r   N)r   r  setattr)ra   r   r   r   r   r   upsample_blockr5   r5   r6   enable_freeu  s   zUNetMotionModel.enable_freeuc                 C   sP   h d}t | jD ]\}}|D ]}t||st||ddur$t||d qq	dS )zDisables the FreeU mechanism.>   r   r   r   r   N)r   r  rw  r   r  )ra   
freeu_keysr   r  r]  r5   r5   r6   disable_freeu  s   zUNetMotionModel.disable_freeuc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsrs  ru  r|   rc   r.   r   modulesr   r   fuse_projectionsry  r   )ra   rR   attn_processorr   r5   r5   r6   fuse_qkv_projections  s   
z$UNetMotionModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)r  ry  )ra   r5   r5   r6   unfuse_qkv_projections  s   

z&UNetMotionModel.unfuse_qkv_projectionsr-   rf   re   timestep_condr   ri   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr   c           !         s  d| j   d}d}t fdd|jdd D r td d}|dur3d	||j d
 }|d	}|}t	|sa|j
jdk}t|trN|rJtjntj}n|rStjntj}tj|g||j
d}nt|jdkrp|d |j
}|jd ||jd }| |}|j| jd}| ||}d}| jjdkrd|vrt| j d|d}d|vrt| j d|d}| | }||jd df}tj||gdd}||j}|  |}|du r|n|| }|j!dd}|j!dd}| j"dur'| jj#dkr'd|vrt| j d|d}| "|}fdd|D }||f}|$ddd	dd|jd  df|jdd  }| %|}|f}| j&D ]'}t'|dre|j(re||||||d\}}n	|||d \}}||7 }qL|durd!}t)||D ]\}}|| }||f7 }q|}| j*durt'| j*d"r| j*|||||d#}n
| j*|||||d$}|	dur||	 }t+| j,D ]R\}}|t| j,d	 k} |t|j- d }|dt|j-  }| s|r|d jdd }t'|dr
|j(r
||||||||d%}q|||||d&}q| j.r#| .|}| /|}| 0|}|dddf df|jd	d  $ddd	dd}|
sI|fS t1|d'S )(aG	  
        The [`UNetMotionModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
                Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
                through the `self.time_embedding` layer to obtain the timestep embeddings.
            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
                A tuple of tensors that if specified are added to the residuals of down unet blocks.
            mid_block_additional_residual: (`torch.Tensor`, *optional*):
                A tensor that if specified is added to the residual of the middle unet block.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_motion_model.UNetMotionOutput`] instead of a plain
                tuple.

        Returns:
            [`~models.unets.unet_motion_model.UNetMotionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_motion_model.UNetMotionOutput`] is returned,
                otherwise a `tuple` is returned where the first element is the sample tensor.
        r   FNc                 3   s    | ]	}|  d kV  qdS )r   Nr5   )rQ   s)default_overall_up_factorr5   r6   rg  K  s    z*UNetMotionModel.forward.<locals>.<genexpr>z9Forward upsample size to force interpolation output size.Tr)   g     mps)r{  rZ  r   )r{  r<  text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   r   repeatsr   rk  image_embedsz has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`c                    s   g | ]	}|j  d dqS )r   r  )repeat_interleave)rQ   image_embedr   r5   r6   rU     s    z+UNetMotionModel.forward.<locals>.<listcomp>r   rk   r   )rd   r   re   r   rh   ri   )rd   r   rh   r5   r   )re   r   rh   ri   )re   r   ri   )rd   r   r   re   r   r   rh   ri   )rd   r   r   r   rh   )r-   )2rB  rr  rl   r   inforl  r{  	unsqueezer2   	is_tensorrZ  typer   rz   float32float64int32int64tensorr   expandr=  r>  r_  r7  r   rc   r   r@  flattenrm   concatrA  r  r?  r6  rn   r  r  rw  r   r   r  r   r  r   rC  rE  rF  r,   )!ra   r-   rf   re   r  r   ri   r  r  r  r   forward_upsample_sizer   	timestepsis_mpsr{  t_embembaug_embr  r  time_embeds
add_embedsr  down_block_res_samplesdownsample_blockres_samplesnew_down_block_res_samplesdown_block_res_sampledown_block_additional_residualr   r  rO  r5   )r  rh   r6   rx     s   
2 














4

	
	


	


6
zUNetMotionModel.forward) Nrk   rk   r&  r'  r  r   r)   r)   r(  r;   r)  r   r)   Nr)   NNr)   Fr   r;   r   NTr)   NNNNNN)NT)rj   N)TTNF)Nr   )F)NNNNNNT)(r.   r/   r0   r1    _supports_gradient_checkpointingr   r   ry   r   r|   r   rz   r{   rW   classmethodr+   r  r  r  rz  r  propertyr   r   rs  ry  r  r  r  r  r  r  r  r  r2   r3   r   r,   rx   r~   r5   r5   rb   r6   r%  #  s\   


 !"(#$%&'()*+  E 

#"


		
r%  )Jdataclassesr   typingr   r   r   r   r   r2   torch.nnrX   torch.nn.functional
functionalrx  torch.utils.checkpointconfiguration_utilsr	   r
   r   loadersr   r   r   r   r   r   r   r   utils.torch_utilsr   	attentionr   attention_processorr   r   r   r   r   r   r   r   r   r    
embeddingsr!   r"   modeling_utilsr#   r   r$   r%   r&    transformers.dual_transformer_2dr'   transformers.transformer_2dr(   unet_2d_blocksr*   unet_2d_conditionr+   
get_loggerr.   r   r,   r  r7   r   r   r   r  r  r  r  r%  r5   r5   r5   r6   <module>   sL   0
   T [  C) 
