o
    Gi                    @   s  d dl mZmZ d dlZd dlmZ d dlm  mZ ddl	m
Z
mZ ddlmZmZmZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZmZmZ dd
lmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ e rid dl%Z&ndZ&e'e(Z)G dd dZ*G dd dZ+dej,dej-de.de.fddZ/eG dd dej,Z0eG dd dej,Z1eG dd dej,Z2G dd dej,Z3eG dd dej,Z4G d d! d!ej,Z5eG d"d# d#ej,Z6G d$d% d%ej,Z7dS )&    )AnyCallableN   )	deprecatelogging)is_torch_npu_availableis_torch_xla_availableis_xformers_available)maybe_allow_in_graph   )GEGLUGELUApproximateGELUFP32SiLULinearActivationSwiGLU)	AttentionAttentionProcessorJointAttnProcessor2_0)SinusoidalPositionalEmbedding)AdaLayerNormAdaLayerNormContinuousAdaLayerNormZeroRMSNormSD35AdaLayerNormZeroXc                   @   sP   e Zd Zedeeef fddZdeeeef B fddZdd Z	d	d
 Z
dS )AttentionMixinreturnc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                    sH   t |dr| ||  d< | D ]\}} |  d| || q|S )Nget_processor
.processor.)hasattrr    named_children)r   r   r   sub_namechildfn_recursive_add_processors N/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/attention.pyr(   2   s
   
zCAttentionMixin.attn_processors.<locals>.fn_recursive_add_processors)strtorchnnModuledictr   r$   )selfr   r   r   r)   r'   r*   attn_processors(   s
   &	zAttentionMixin.attn_processors	processorc                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorr!   r"   )r#   
isinstancer/   r3   popr$   )r   r   r2   r%   r&   fn_recursive_attn_processorr)   r*   r7   U   s   

zFAttentionMixin.set_attn_processor.<locals>.fn_recursive_attn_processorN)lenr1   keysr4   r/   
ValueErrorr+   r,   r-   r.   r$   )r0   r2   countr   r   r)   r6   r*   set_attn_processor@   s   
z!AttentionMixin.set_attn_processorc                 C   sV   | j  D ]\}}dt|jjv rtdq|  D ]}t|tr(|j	r(|
  qdS )z
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.
        AddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.N)r1   itemsr+   	__class____name__r:   modulesr4   AttentionModuleMixin_supports_qkv_fusionfuse_projections)r0   _attn_processorr   r)   r)   r*   fuse_qkv_projectionsb   s   z#AttentionMixin.fuse_qkv_projectionsc                 C   s*   |   D ]}t|tr|jr|  qdS )um   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is 🧪 experimental.
        N)rA   r4   rB   rC   unfuse_projections)r0   r   r)   r)   r*   unfuse_qkv_projectionso   s
   z%AttentionMixin.unfuse_qkv_projectionsN)r@   
__module____qualname__propertyr/   r+   r   r1   r<   rG   rI   r)   r)   r)   r*   r   '   s    "r   c                   @   sv  e Zd ZdZg ZdZdZdeddfddZd5de	dd	fd
dZ
defddZde	ddfddZ		d6de	deedB df dB ddfddZ	d7de	dedB ddfddZe dd Ze dd Zdeddfd d!Zd"ejdejfd#d$Zd8d"ejd&edejfd'd(Z	d7d)ejd*ejd+ejdB dejfd,d-Z	%d8d+ejd.ed/ed&edejf
d0d1Zd2ejdejfd3d4ZdS )9rB   NTFr2   r   c                 C   sV   t | dr&t| jtjjr&t|tjjs&td| j d|  | j	d || _dS )z
        Set the attention processor to use.

        Args:
            processor (`AttnProcessor`):
                The attention processor to use.
        r2   z-You are removing possibly trained weights of z with N)
r#   r4   r2   r,   r-   r.   loggerinfo_modulesr5   )r0   r2   r)   r)   r*   r3      s   
z"AttentionModuleMixin.set_processorreturn_deprecated_lorar   c                 C   s   |s| j S dS )a7  
        Get the attention processor in use.

        Args:
            return_deprecated_lora (`bool`, *optional*, defaults to `False`):
                Set to `True` to return the deprecated LoRA attention processor.

        Returns:
            "AttentionProcessor": The attention processor in use.
        N)r2   )r0   rP   r)   r)   r*   r       s   z"AttentionModuleMixin.get_processorbackendc                 C   sZ   ddl m} dd |j D }||vr!td|dd| || }|| j_d S )Nr   )AttentionBackendNamec                 S   s   h | ]}|j qS r)   value).0xr)   r)   r*   	<setcomp>   s    z=AttentionModuleMixin.set_attention_backend.<locals>.<setcomp>z	`backend=z ` must be one of the following: z, )	attention_dispatchrR   __members__valuesr:   joinlowerr2   _attention_backend)r0   rQ   rR   available_backendsr)   r)   r*   set_attention_backend   s   z*AttentionModuleMixin.set_attention_backenduse_npu_flash_attentionc                 C       |r	t  s	td| d dS )z
        Set whether to use NPU flash attention from `torch_npu` or not.

        Args:
            use_npu_flash_attention (`bool`): Whether to use NPU flash attention or not.
        ztorch_npu is not available_native_npuN)r   ImportErrorr_   )r0   r`   r)   r)   r*   set_use_npu_flash_attention   s   z0AttentionModuleMixin.set_use_npu_flash_attentionuse_xla_flash_attentionpartition_spec.c                 C   ra   )a  
        Set whether to use XLA flash attention from `torch_xla` or not.

        Args:
            use_xla_flash_attention (`bool`):
                Whether to use pallas flash attention kernel from `torch_xla` or not.
            partition_spec (`tuple[]`, *optional*):
                Specify the partition specification if using SPMD. Otherwise None.
            is_flux (`bool`, *optional*, defaults to `False`):
                Whether the model is a Flux model.
        ztorch_xla is not available_native_xlaN)r   rc   r_   )r0   re   rf   is_fluxr)   r)   r*   set_use_xla_flash_attention   s   z0AttentionModuleMixin.set_use_xla_flash_attention'use_memory_efficient_attention_xformersattention_opc           	   
   C   s   |rNt  stdddtj stdz$t  r7d}|dur'|\}}|j^}}tjdd|d}tj	
|||}W n tyF } z|d}~ww | d dS dS )	a  
        Set whether to use memory efficient attention from `xformers` or not.

        Args:
            use_memory_efficient_attention_xformers (`bool`):
                Whether to use memory efficient attention from `xformers` or not.
            attention_op (`Callable`, *optional*):
                The attention operation to use. Defaults to `None` which uses the default attention operation from
                `xformers`.
        zeRefer to https://github.com/facebookresearch/xformers for more information on how to install xformersxformers)r   zvtorch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only available for GPU N)r   r   (   cudadevicedtype)r	   ModuleNotFoundErrorr,   rn   is_availabler:   SUPPORTED_DTYPESrandnxopsopsmemory_efficient_attention	Exceptionr_   )	r0   rj   rk   rq   op_fwop_bwrE   qer)   r)   r*   +set_use_memory_efficient_attention_xformers   s2   

z@AttentionModuleMixin.set_use_memory_efficient_attention_xformersc                 C   sT  | j st| jj d dS t| ddrdS | jjjj	}| jjjj
}t| drr| jrrt| jjj| jjjg}|jd }|jd }tj||| j||d| _| jj| t| d	rq| jrqt| jjj| jjjg}| jj| nNt| jjj| jjj| jjjg}|jd }|jd }tj||| j||d| _| jj| t| d	r| jrt| jjj| jjj| jjjg}| jj| t| d
ddur%t| dddur%t| dddur%t| jjj| jjj| jjjg}|jd }|jd }tj||| j||d| _| jj| | jr%t| jjj| jjj| jjjg}| jj| d| _dS )ze
        Fuse the query, key, and value projections into a single projection for efficiency.
        zK does not support fusing QKV projections, so `fuse_projections` will no-op.Nfused_projectionsFis_cross_attentionr   r   )biasrp   rq   use_bias
add_q_proj
add_k_proj
add_v_projT)rC   rM   debugr?   r@   getattrto_qweightdatarp   rq   r#   r   r,   catto_kto_vshaper-   Linearr   to_kvcopy_r   to_qkvr   r   r   added_proj_biasto_added_qkvr   )r0   rp   rq   concatenated_weightsin_featuresout_featuresconcatenated_biasr)   r)   r*   rD      sZ   

"

"


z%AttentionModuleMixin.fuse_projectionsc                 C   s`   | j sdS t| ddsdS t| drt| d t| dr!t| d t| dr+t| d d| _dS )z\
        Unfuse the query, key, and value projections back to separate projections.
        Nr   Fr   r   r   )rC   r   r#   delattrr   )r0   r)   r)   r*   rH   :  s   






z'AttentionModuleMixin.unfuse_projections
slice_sizec                 C   sh   t | dr|dur|| jkrtd| d| j dd}|dur%| d}|du r-|  }| | dS )z
        Set the slice size for attention computation.

        Args:
            slice_size (`int`):
                The slice size for attention computation.
        sliceable_head_dimNzslice_size z has to be smaller or equal to r"   sliced)r#   r   r:   _get_compatible_processordefault_processor_clsr3   )r0   r   r2   r)   r)   r*   set_attention_sliceT  s   
z(AttentionModuleMixin.set_attention_slicetensorc                 C   sL   | j }|j\}}}||| |||}|dddd|| ||| }|S )a  
        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`.

        Args:
            tensor (`torch.Tensor`): The tensor to reshape.

        Returns:
            `torch.Tensor`: The reshaped tensor.
        r   r   r      )headsr   reshapepermute)r0   r   	head_size
batch_sizeseq_lendimr)   r)   r*   batch_to_head_dimk  s
   
"z&AttentionModuleMixin.batch_to_head_dimr   out_dimc                 C   s~   | j }|jdkr|j\}}}d}n|j\}}}}|||| ||| }|dddd}|dkr=||| || || }|S )a5  
        Reshape the tensor for multi-head attention processing.

        Args:
            tensor (`torch.Tensor`): The tensor to reshape.
            out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor.

        Returns:
            `torch.Tensor`: The reshaped tensor.
        r   r   r   r   )r   ndimr   r   r   )r0   r   r   r   r   r   r   	extra_dimr)   r)   r*   head_to_batch_dim{  s   
z&AttentionModuleMixin.head_to_batch_dimquerykeyattention_maskc           	      C   s   |j }| jr| }| }|du r*tj|jd |jd |jd |j |jd}d}n|}d}tj|||dd|| j	d}~| j
rE| }|jdd}~||}|S )	aL  
        Compute the attention scores.

        Args:
            query (`torch.Tensor`): The query tensor.
            key (`torch.Tensor`): The key tensor.
            attention_mask (`torch.Tensor`, *optional*): The attention mask to use.

        Returns:
            `torch.Tensor`: The attention probabilities/scores.
        Nr   r   rq   rp   )betaalphar   )rq   upcast_attentionfloatr,   emptyr   rp   baddbmm	transposescaleupcast_softmaxsoftmaxto)	r0   r   r   r   rq   baddbmm_inputr   attention_scoresattention_probsr)   r)   r*   get_attention_scores  s2    

z)AttentionModuleMixin.get_attention_scorestarget_lengthr   c           	      C   s   | j }|du r	|S |jd }||krA|jjdkr7|jd |jd |f}tj||j|jd}tj||gdd}n
tj	|d|fd	d
}|dkrW|jd || k rU|j
|dd}|S |dkrg|d}|j
|dd}|S )a  
        Prepare the attention mask for the attention computation.

        Args:
            attention_mask (`torch.Tensor`): The attention mask to prepare.
            target_length (`int`): The target length of the attention mask.
            batch_size (`int`): The batch size for repeating the attention mask.
            out_dim (`int`, *optional*, defaults to `3`): Output dimension.

        Returns:
            `torch.Tensor`: The prepared attention mask.
        Nr   mpsr   r   r   r   r           rS   r      )r   r   rp   typer,   zerosrq   r   Fpadrepeat_interleave	unsqueeze)	r0   r   r   r   r   r   current_lengthpadding_shapepaddingr)   r)   r*   prepare_attention_mask  s$   

z+AttentionModuleMixin.prepare_attention_maskencoder_hidden_statesc                 C   sf   | j dus	J dt| j tjr|  |}|S t| j tjr1|dd}|  |}|dd}|S J )z
        Normalize the encoder hidden states.

        Args:
            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.

        Returns:
            `torch.Tensor`: The normalized encoder hidden states.
        NzGself.norm_cross must be defined to call self.norm_encoder_hidden_statesr   r   )
norm_crossr4   r-   	LayerNorm	GroupNormr   )r0   r   r)   r)   r*   norm_encoder_hidden_states  s   


z/AttentionModuleMixin.norm_encoder_hidden_states)F)NFN)r   )r@   rJ   rK   _default_processor_cls_available_processorsrC   r   r   r3   boolr    r+   r_   rd   tupleri   r   r~   r,   no_gradrD   rH   intr   Tensorr   r   r   r   r   r)   r)   r)   r*   rB   y   sp    


'
B

0
+rB   ffhidden_states	chunk_dim
chunk_sizec                    sf   |j | | dkrtd|j |  d| d|j | | }tj fdd|j||dD |d}|S )Nr   z)`hidden_states` dimension to be chunked: z$ has to be divisible by chunk size: z[. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`.c                    s   g | ]} |qS r)   r)   )rU   	hid_slicer   r)   r*   
<listcomp>  s    z)_chunked_feed_forward.<locals>.<listcomp>r   )r   r:   r,   r   chunk)r   r   r   r   
num_chunks	ff_outputr)   r   r*   _chunked_feed_forward
  s   r   c                       sN   e Zd ZdZdedededef fddZdejd	ejd
ejfddZ  Z	S )GatedSelfAttentionDenseat  
    A gated self-attention dense layer that combines visual features and object features.

    Parameters:
        query_dim (`int`): The number of channels in the query.
        context_dim (`int`): The number of channels in the context.
        n_heads (`int`): The number of heads to use for attention.
        d_head (`int`): The number of channels in each head.
    	query_dimcontext_dimn_headsd_headc                    s   t    t||| _t|||d| _t|dd| _t	|| _
t	|| _| dttd | dttd d| _d S )N)r   r   dim_headgegluactivation_fn
alpha_attnr   alpha_denseT)super__init__r-   r   linearr   attnFeedForwardr   r   norm1norm2register_parameter	Parameterr,   r   enabled)r0   r   r   r   r   r?   r)   r*   r   %  s   

z GatedSelfAttentionDense.__init__rV   objsr   c              
   C   s   | j s|S |jd }| |}|| j | | tj||gddd d d |d d f   }|| j	 | 
| |  }|S )Nr   r   )r   r   r   r   tanhr   r   r,   r   r   r   r   )r0   rV   r   n_visualr)   r)   r*   forward6  s   

BzGatedSelfAttentionDense.forward)
r@   rJ   rK   __doc__r   r   r,   r   r  __classcell__r)   r)   r   r*   r     s    
$r   c                       s   e Zd ZdZ			ddedededededB d	ef fd
dZddedB defddZ	dde	j
de	j
de	j
deeef dB dee	je	jf f
ddZ  ZS )JointTransformerBlocka,  
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://huggingface.co/papers/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    FNr   num_attention_headsattention_head_dimcontext_pre_onlyqk_normuse_dual_attentionc           	         sH  t    || _|| _|rdnd}|rt|| _nt|| _|dkr.t||ddddd| _n|dkr8t|| _nt	d| d	t
td
rIt }nt	dt|d |||||d||dd| _|rnt|d |||d||dd	| _nd | _tj|ddd| _t||dd| _|stj|ddd| _t||dd| _nd | _d | _d | _d| _d S )Nada_norm_continousada_norm_zeroFư>T
layer_norm)elementwise_affineepsr   	norm_typezUnknown context_norm_type: z>, currently only support `ada_norm_continous`, `ada_norm_zero`scaled_dot_product_attentionzYThe current PyTorch version does not support the `scaled_dot_product_attention` function.)r   cross_attention_dimadded_kv_proj_dimr   r   r   r	  r   r2   r
  r  )	r   r  r   r   r   r   r2   r
  r  r  r  gelu-approximate)r   dim_outr   r   )r   r   r  r	  r   r   r   r   norm1_contextr:   r#   r   r   r   r   attn2r-   r   r   r   r   norm2_context
ff_context_chunk_size
_chunk_dim)	r0   r   r  r  r	  r
  r  context_norm_typer2   r   r)   r*   r   R  sp   
	





zJointTransformerBlock.__init__r   r   c                 C      || _ || _d S r   r  r  r0   r   r   r)   r)   r*   set_chunk_feed_forward     
z,JointTransformerBlock.set_chunk_feed_forwardr   r   tembjoint_attention_kwargsr   c                 C   s  |pi }| j r| j||d\}}}}}	}
}n| j||d\}}}}}	| jr,| ||}n| j||d\}}}}}| jd||d|\}}|d| }|| }| j rh| jdd|
i|}|d| }|| }| |}|d|d d d f   |d d d f  }| jd urt	| j
|| j| j}n| 
|}|	d| }|| }| jrd }||fS |d| }|| }| |}|d|d d d f   |d d d f  }| jd urt	| j|| j| j}n| |}||d|  }||fS )N)emb)r   r   r   r   r)   )r  r   r	  r  r   r   r  r   r  r   r   r  r  r  )r0   r   r   r%  r&  norm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpnorm_hidden_states2	gate_msa2r   
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattn_outputcontext_attn_outputattn_output2r   context_ff_outputr)   r)   r*   r    sX   

(


(

zJointTransformerBlock.forward)FNFr   r   )r@   rJ   rK   r  r   r   r+   r   r#  r,   FloatTensorr/   r   r   r   r  r  r)   r)   r   r*   r  C  s>    R
r  c                -       sN  e Zd ZdZ																					d1d
ededededB dededB dededededededededededB dedB dedB dedB dedB dedef, fd d!Zd2d#edB d
efd$d%Z								d3d&e
jd'e
jdB d(e
jdB d)e
jdB d*e
jdB d+eeef d,e
jdB d-eee
jf dB d.e
jfd/d0Z  ZS )4BasicTransformerBlocka  
    A basic Transformer block.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (:
            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:
            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` *optional*, defaults to False):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, *optional*, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
    r   Nr   FTr  h㈵>defaultr   r  r  r  r   num_embeds_ada_normattention_biasonly_cross_attentiondouble_self_attentionr   norm_elementwise_affiner  norm_epsfinal_dropoutattention_typepositional_embeddingsnum_positional_embeddings-ada_norm_continous_conditioning_embedding_dimada_norm_biasff_inner_dimff_biasattention_out_biasc              
      s  t    || _|| _|| _|| _|| _|| _|| _|
| _	|| _
|| _|| _|	| _|d uo0|dk| _|d uo9|dk| _|dk| _|dk| _|dk| _|dv r]|d u r]td| d| d	|| _|| _|rm|d u rmtd
|dkryt||d| _nd | _|dkrt||| _n#|dkrt||| _n|dkrt|||||d| _n	tj|||d| _t||||||	r|nd ||d| _|d us|
r|dkrt||| _ n|dkrt|||||d| _ nt|||| _ t||
s|nd ||||||d| _!n|dkrt|||| _ nd | _ d | _!|dkrt|||||d| _"n|dv r't|||| _"n|dkr/d | _"t#||||||d| _$|dksD|dkrLt%||||| _&|dkr_t't()d||d  | _*d | _+d| _,d S )Nr  ada_normada_norm_singler  ada_norm_continuousrK  r  `norm_type` is set to w, but `num_embeds_ada_norm` is not defined. Please make sure to define `num_embeds_ada_norm` if setting `norm_type` to r"   \If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined.
sinusoidalmax_seq_lengthrms_normr  r   r   r   dropoutr   r  r   out_biasr   r  r   r   rW  r   r   rX  )r  rK  r  layer_norm_i2vgenrW  r   rB  	inner_dimr   gatedzgated-text-image   g      ?r   )-r   r   r   r  r  rW  r  r   r=  r?  r@  rD  rE  r>  use_ada_layer_norm_zerouse_ada_layer_normuse_ada_layer_norm_singleuse_layer_normuse_ada_layer_norm_continuousr:   r  r<  r   	pos_embedr   r   r   r   r-   r   r   attn1r   r  norm3r   r   r   fuserr   r,   ru   scale_shift_tabler  r  )r0   r   r  r  rW  r  r   r<  r=  r>  r?  r   r@  r  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  r   r)   r*   r     s   



	
	




	



zBasicTransformerBlock.__init__r   r   c                 C   r   r   r!  r"  r)   r)   r*   r#    r$  z,BasicTransformerBlock.set_chunk_feed_forwardr   r   r   encoder_attention_masktimestepcross_attention_kwargsclass_labelsadded_cond_kwargsr   c	                 C   sp  |d ur| dd d urtd |jd }	| jdkr"| ||}
n\| jdkr7| j||||jd\}
}}}}nG| jdv rB| |}
n<| jdkrP| ||d	 }
n.| jd
krz| jd  ||	dd j	ddd\}}}}}}| |}
|
d|  | }
nt
d| jd ur| |
}
|d ur| ni }|dd }| j|
f| jr|nd |d|}| jdkr|d| }n	| jd
kr|| }|| }|jdkr|d}|d ur| ||d }| jd ur2| jdkr| ||}
n&| jdv r| |}
n| jd
kr|}
n| jdkr| ||d	 }
nt
d| jd ur"| jd
kr"| |
}
| j|
f||d|}|| }| jdkrA| ||d	 }
n| jd
ksL| |}
| jdkrf|
d|d d d f   |d d d f  }
| jd
kry| |}
|
d|  | }
| jd urt| j|
| j| j}n| |
}| jdkr|d| }n
| jd
kr|| }|| }|jdkr|d}|S )Nr   SPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r   rK  r  )hidden_dtype)r  rZ  rM  pooled_text_embrL  r^  r   r   r   zIncorrect norm usedgligenr   r   r   r   )r  r  rZ  zIncorrect norm)getrM   warningr   r  r   rq   rh  r   r   r:   rd  copyr5   re  r>  r   r   squeezerg  r  r   rf  r  r   r   r  )r0   r   r   r   ri  rj  rk  rl  rm  r   r(  r)  r*  r+  r,  	shift_msa	scale_msagligen_kwargsr3  r   r)   r)   r*   r    s   



















(


zBasicTransformerBlock.forward)r   Nr   NFFFFTr  r:  Fr;  NNNNNTTr7  )NNNNNNN)r@   rJ   rK   r  r   r+   r   r   r   r#  r,   r   
LongTensorr/   r   r  r  r)   r)   r   r*   r9    s    '	
 )
	
r9  c                
       sH   e Zd ZdZ		ddedededB dedB f fdd	Zd
d Z  ZS )LuminaFeedForwarda'  
    A feed-forward layer.

    Parameters:
        hidden_size (`int`):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        intermediate_size (`int`): The intermediate dimension of the feedforward layer.
        multiple_of (`int`, *optional*): Value to ensure hidden dimension is a multiple
            of this value.
        ffn_dim_multiplier (float, *optional*): Custom multiplier for hidden
            dimension. Defaults to None.
       Nr   r\  multiple_offfn_dim_multiplierc                    st   t    |d urt|| }||| d |  }tj||dd| _tj||dd| _tj||dd| _t | _	d S )Nr   Fr   )
r   r   r   r-   r   linear_1linear_2linear_3r   silu)r0   r   r\  r}  r~  r   r)   r*   r   J  s(   
zLuminaFeedForward.__init__c                 C   s    |  | | || | S r   )r  r  r  r  )r0   rV   r)   r)   r*   r  h  s    zLuminaFeedForward.forward)r|  N)	r@   rJ   rK   r  r   r   r   r  r  r)   r)   r   r*   r{  ;  s    r{  c                       sx   e Zd ZdZ	ddedededededB f
 fdd	Zd
edB fddZ	ddejdedejdB dejfddZ	  Z
S )TemporalBasicTransformerBlocka  
    A basic Transformer block for video like data.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    Nr   time_mix_inner_dimr  r  r  c                    s   t    ||k| _t|| _t||dd| _t|| _t	|||d d| _
|d ur;t|| _t	||||d| _nd | _d | _t|| _t|dd| _d | _d | _d S )Nr   )r  r   )r   r   r   r  )r   r  r   r   r   )r   r   is_resr-   r   norm_inr   ff_inr   r   re  r   r  rf  r   r  r  )r0   r   r  r  r  r  r   r)   r*   r   y  s:   



z&TemporalBasicTransformerBlock.__init__r   c                 K   s   || _ d| _d S )Nr   r!  )r0   r   kwargsr)   r)   r*   r#    s   
z4TemporalBasicTransformerBlock.set_chunk_feed_forwardr   
num_framesr   r   c                 C   sv  |j d }|j \}}}|| }|d d d f ||||}|dddd}||| ||}|}| |}| jd urEt| j|| j| j}n| |}| jrQ|| }| 	|}	| j
|	d d}
|
| }| jd urv| |}	| j|	|d}
|
| }| |}	| jd urt| j|	| j| j}n| |	}| jr|| }n|}|d d d f ||||}|dddd}||| ||}|S )Nr   r   r   r   )r   )r   r   r   r  r  r   r  r  r  r   re  r  r   rf  r   )r0   r   r  r   r   batch_frames
seq_lengthchannelsresidualr(  r3  r   r)   r)   r*   r    s>   










z%TemporalBasicTransformerBlock.forwardr   )r@   rJ   rK   r  r   r   r#  r,   r   r  r  r)   r)   r   r*   r  l  s2    5
r  c                       sT   e Zd Z				ddedededed	ed
edB dedef fddZdd Z  ZS )SkipFFTransformerBlockr   NFTr   r  r  kv_input_dimkv_input_dim_proj_use_biasr  r=  rJ  c
           
   	      sv   t    ||krt|||| _nd | _t|d| _t|||||||	d| _t|d| _	t|||||||	d| _
d S )Nr  )r   r   r   rW  r   r  rX  )r   r  r   r   rW  r   rX  )r   r   r-   r   	kv_mapperr   r   r   re  r   r  )
r0   r   r  r  r  r  rW  r  r=  rJ  r   r)   r*   r     s0   

zSkipFFTransformerBlock.__init__c                 C   s   |d ur|  ni }| jd ur| t|}| |}| j|fd|i|}|| }| |}| j|fd|i|}|| }|S )Nr   )ru  r  r   r  r   re  r   r  )r0   r   r   rk  r(  r3  r)   r)   r*   r    s,   


zSkipFFTransformerBlock.forward)r   NFT)r@   rJ   rK   r   r   r   r  r  r)   r)   r   r*   r    s.    	
*r  c                /       sz  e Zd ZdZ																				
	d8dedededededB dededB dedededededededededB dedB dedB deded ed!ed"ef. fd#d$Zd%ed&e	e
eef  fd'd(Zd9d%ed"ed&e	e fd)d*Z	d9d ed!ed"ed&dfd+d,Zd:d.edB ded&dfd/d0Z				d;d1ejd2ejdB d3ejdB d4ejdB d5eeef d&ejfd6d7Z  ZS )<FreeNoiseTransformerBlocka  
    A FreeNoise Transformer block.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability to use.
        cross_attention_dim (`int`, *optional*):
            The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, defaults to `False`):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, defaults to `False`):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, defaults to `False`):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` defaults to `False`):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
        ff_inner_dim (`int`, *optional*):
            Hidden dimension of feed-forward MLP.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in feed-forward MLP.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in attention output project layer.
        context_length (`int`, defaults to `16`):
            The maximum number of frames that the FreeNoise block processes at once.
        context_stride (`int`, defaults to `4`):
            The number of frames to be skipped before starting to process a new batch of `context_length` frames.
        weighting_scheme (`str`, defaults to `"pyramid"`):
            The weighting scheme to use for weighting averaging of processed latent frames. As described in the
            Equation 9. of the [FreeNoise](https://huggingface.co/papers/2310.15169) paper, "pyramid" is the default
            setting used.
    r   Nr   FTr  r:     r   pyramidr   r  r  rW  r  r   r<  r=  r>  r?  r   r@  r  rA  rB  rD  rE  rH  rI  rJ  context_lengthcontext_strideweighting_schemec              
      s  t    || _|| _|| _|| _|| _|| _|| _|
| _	|| _
|| _|| _|	| _| ||| |d uo7|dk| _|d uo@|dk| _|dk| _|dk| _|dk| _|dv rd|d u rdtd| d| d	|| _|| _|rt|d u rttd
|dkrt||d| _nd | _tj|||d| _t||||||	r|nd ||d| _|d us|
rt|||| _t||
s|nd ||||||d| _t ||||||d| _!t|||| _"d | _#d| _$d S )Nr  rK  rL  r  rM  rN  rO  rP  r"   rQ  rR  rS  r  rV  rY  r[  r   )%r   r   r   r  r  rW  r  r   r=  r?  r@  rD  rE  r>  set_free_noise_propertiesr_  r`  ra  rb  rc  r:   r  r<  r   rd  r-   r   r   r   re  r   r  r   r   rf  r  r  )r0   r   r  r  rW  r  r   r<  r=  r>  r?  r   r@  r  rA  rB  rD  rE  rH  rI  rJ  r  r  r  r   r)   r*   r   n  s   





	
z"FreeNoiseTransformerBlock.__init__r  r   c                 C   sH   g }t d|| j d | jD ]}|}t||| j }|||f q|S )Nr   r   )ranger  r  minappend)r0   r  frame_indicesiwindow_start
window_endr)   r)   r*   _get_frame_indices  s   z,FreeNoiseTransformerBlock._get_frame_indicesc                 C   s  |dkrdg| }|S |dkrH|d dkr-|d }t td|d }||d d d  }|S |d d }t td|}||g |d d d  }|S |dkr|d dkrl|d }d	g|d  |g }|t t|dd }|S |d d }d	g| }|t t|dd }|S td
| )Nflatg      ?r  r   r   r   r   delayed_reverse_sawtoothg{Gz?z'Unsupported value for weighting_scheme=)listr  r:   )r0   r  r  weightsmidr)   r)   r*   _get_frame_weights  s0   
	
z,FreeNoiseTransformerBlock._get_frame_weightsc                 C   s   || _ || _|| _d S r   )r  r  r  )r0   r  r  r  r)   r)   r*   r    s   
z3FreeNoiseTransformerBlock.set_free_noise_propertiesr   r   c                 C   r   r   r!  r"  r)   r)   r*   r#    r$  z0FreeNoiseTransformerBlock.set_chunk_feed_forwardr   r   r   ri  rk  c              	   O   sN  |d ur| dd d urtd |d ur| ni }|j}|j}	|d}
| |
}| | j	| j
}tj|||	ddd}|d d |
k}|sm|
| j	k r[td|
d| j	|
|d d  }||
| j	 |
f tjd|
df|d	}t|}t|D ]\}\}}t|d d ||f }||9 }|d d ||f }| |}| jd ur| |}| j|f| jr|nd |d
|}|| }|jdkr|d}| jd ur| |}| jd ur| jdkr| |}| j|f||d
|}|| }|t|d kr>|s>|d d | d f  |d d | d f |d d | d f  7  < |d d | d f  |d d | f 7  < q|d d ||f  || 7  < |d d ||f  |7  < qtjdd t|j | j	dd|j | j	ddD dd!|	}| "|}| j#d urt$| j%|| j&| j#}n| %|}|| }|jdkr|d}|S )Nr   rn  r   ro   r   r   zExpected num_frames=z1 to be greater or equal than self.context_length=)rp   rr  r   rL  c                 S   s&   g | ]\}}t |d k|| |qS r7  )r,   where)rU   accumulated_splitnum_times_splitr)   r)   r*   r   y  s    z5FreeNoiseTransformerBlock.forward.<locals>.<listcomp>r   )'rs  rM   rt  ru  rp   rq   sizer  r  r  r  r,   r   r   r:   r  r   
zeros_like	enumerate	ones_liker   rd  re  r>  r   rv  r  r   r  r8   r   zipsplitr   rf  r  r   r   r  )r0   r   r   r   ri  rk  argsr  rp   rq   r  r  frame_weightsis_last_frame_batch_completelast_frame_batch_lengthnum_times_accumulatedaccumulated_valuesr  frame_start	frame_endr  hidden_states_chunkr(  r3  r   r)   r)   r*   r    s   













*. 	


z!FreeNoiseTransformerBlock.forward)r   Nr   NFFFFTr  r:  FNNNTTr  r   r  )r  r7  )NNNN)r@   rJ   rK   r  r   r   r+   r   r   r  r   r  r  r  r#  r,   r   r/   r   r  r  r)   r)   r   r*   r  6  s    ;	
r!

	r  c                       sh   e Zd ZdZ							dded	edB d
ededededef fddZde	j
de	j
fddZ  ZS )r   a  
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    Nr   r   r   FTr   r  multrW  r   rB  r   c	           
         s  t    |d u rt|| }|d ur|n|}|dkr"t|||d}	|dkr/t||d|d}	n0|dkr;t|||d}	n$|dkrGt|||d}	n|dkrSt|||d}	n|d	kr_t|||d
d}	t	g | _
| j
|	 | j
t| | j
tj|||d |r| j
t| d S d S )Ngelur  r  r  )approximater   r   zgeglu-approximateswigluzlinear-silur  )r   
activation)r   r   r   r   r   r   r   r   r-   
ModuleListnetr  Dropoutr   )
r0   r   r  r  rW  r   rB  r\  r   act_fnr   r)   r*   r     s.   
zFeedForward.__init__r   r   c                 O   sD   t |dks|dd d urd}tdd| | jD ]}||}q|S )Nr   r   zThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.z1.0.0)r8   rs  r   r  )r0   r   r  r  deprecation_messager   r)   r)   r*   r    s   

zFeedForward.forward)Nr   r   r   FNT)r@   rJ   rK   r  r   r   r+   r   r   r,   r   r  r  r)   r)   r   r*   r     s2    	(r   )8typingr   r   r,   torch.nnr-   torch.nn.functional
functionalr   utilsr   r   utils.import_utilsr   r   r	   utils.torch_utilsr
   activationsr   r   r   r   r   r   attention_processorr   r   r   
embeddingsr   normalizationr   r   r   r   r   rl   rv   
get_loggerr@   rM   r   rB   r.   r   r   r   r   r  r9  r{  r  r  r  r   r)   r)   r)   r*   <module>   sP    

R   ) ,  M1 H  ]