from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
from torch import nn

from ..utils import deprecate, logging
from ..utils.torch_utils import maybe_allow_in_graph
from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU
from .attention_processor import Attention, JointAttnProcessor2_0
from .embeddings import SinusoidalPositionalEmbedding
from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm


logger = logging.get_logger(__name__)


def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
    # "feed_forward_chunk_size" can be used to save memory
    if hidden_states.shape[chunk_dim] % chunk_size != 0:
        raise ValueError(
            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}."
            " Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
        )

    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
    ff_output = torch.cat(
        [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
        dim=chunk_dim,
    )
    return ff_output
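# Usage sketch (illustrative, not part of the module's public API): chunking trades peak
# activation memory for extra kernel launches by running the feed-forward over slices of
# one dimension and concatenating the results, which is numerically equivalent to a
# single full-width call when dropout is inactive.
#
#     ff = FeedForward(dim=64, activation_fn="geglu")
#     x = torch.randn(2, 8, 64)
#     out = _chunked_feed_forward(ff, x, chunk_dim=1, chunk_size=4)
#     assert torch.allclose(out, ff(x), atol=1e-6)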
ejfddZ  Z	S )GatedSelfAttentionDenseat  
    A gated self-attention dense layer that combines visual features and object features.

    Parameters:
        query_dim (`int`): The number of channels in the query.
        context_dim (`int`): The number of channels in the context.
        n_heads (`int`): The number of heads to use for attention.
        d_head (`int`): The number of channels in each head.
    	query_dimcontext_dimn_headsd_headc                    s   t    t||| _t|||d| _t|dd| _t	|| _
t	|| _| dttd | dttd d| _d S )N)r.   headsdim_headgegluactivation_fn
alpha_attn        alpha_denseT)super__init__r   Linearlinearr   attnFeedForwardr   	LayerNormnorm1norm2register_parameter	Parameterr'   tensorenabled)selfr.   r/   r0   r1   	__class__r   r!   r;   :   s   

z GatedSelfAttentionDense.__init__xobjsreturnc              
   C   s   | j s|S |jd }| |}|| j | | tj||gddd d d |d d f   }|| j	 | 
| |  }|S )Nr   r#   )rF   r%   r=   r7   tanhr>   rA   r'   r(   r9   r   rB   )rG   rJ   rK   n_visualr   r   r!   forwardK   s   

BzGatedSelfAttentionDense.forward)
__name__
__module____qualname____doc__intr;   r'   TensorrO   __classcell__r   r   rH   r!   r-   .   s    
$r-   c                       sV   e Zd ZdZd fdd	Zddee defdd	Zd
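# Sketch of how this layer is driven (sizes are illustrative): both gates start at
# tanh(0) = 0, so a freshly initialized layer is an identity map over `x`, and the
# grounding branch is blended in only as `alpha_attn` / `alpha_dense` are learned.
#
#     fuser = GatedSelfAttentionDense(query_dim=320, context_dim=768, n_heads=8, d_head=40)
#     x = torch.randn(1, 64, 320)      # visual tokens
#     objs = torch.randn(1, 30, 768)   # grounding tokens (e.g. GLIGEN boxes + phrases)
#     out = fuser(x, objs)             # same shape as x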
ej	dej	dej	fddZ
  ZS )JointTransformerBlocka$  
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    Fc                    s  t    || _|rdnd}t|| _|dkr#t||ddddd| _n|dkr-t|| _ntd| d	tt	d
r>t
 }ntdt|d |||||d|d	| _tj|ddd| _t||dd| _|sutj|ddd| _t||dd| _nd | _d | _d | _d| _d S )Nada_norm_continousada_norm_zeroFư>T
layer_norm)elementwise_affineepsbias	norm_typezUnknown context_norm_type: z>, currently only support `ada_norm_continous`, `ada_norm_zero`scaled_dot_product_attentionzYThe current PyTorch version does not support the `scaled_dot_product_attention` function.)	r.   cross_attention_dimadded_kv_proj_dimr3   r2   out_dimcontext_pre_onlyr^   	processorr\   r]   gelu-approximate)r$   dim_outr6   r   )r:   r;   rd   r   rA   r   norm1_contextr&   hasattrFr   r   r>   r   r@   rB   r?   r   norm2_context
ff_context_chunk_size
_chunk_dim)rG   r$   num_attention_headsattention_head_dimrd   context_norm_typere   rH   r   r!   r;   g   sL   





zJointTransformerBlock.__init__r   r   r$   c                 C      || _ || _d S Nrn   ro   rG   r   r$   r   r   r!   set_chunk_feed_forward      
z,JointTransformerBlock.set_chunk_feed_forwardr   encoder_hidden_statestembc                 C   s|  | j ||d\}}}}}| jr| ||}	n| j||d\}	}
}}}| j||	d\}}|d| }|| }| |}|d|d d d f   |d d d f  }| jd ur_t| j|| j	| j}n| |}|d| }|| }| jrxd }||fS |
d| }|| }| 
|}	|	d|d d d f   |d d d f  }	| jd urt| j|	| j	| j}n| |	}||d|  }||fS )N)emb)r   ry   r   )rA   rd   ri   r>   	unsqueezerB   rn   r,   r   ro   rl   rm   )rG   r   ry   rz   norm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpnorm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattn_outputcontext_attn_outputr+   context_ff_outputr   r   r!   rO      s@   

(


(

zJointTransformerBlock.forward)Fr   )rP   rQ   rR   rS   r;   r   rT   rw   r'   FloatTensorrO   rV   r   r   rH   r!   rW   X   s    3rW   c                -       sN  e Zd ZdZ																					d1d
edededee dedee dedededededededededee dee dee dee dee dedef, fd d!Z	d2d#ee d
efd$d%Z
							d3d&ejd'eej d(eej d)eej d*eej d+eeef d,eej d-eeeejf  d.ejfd/d0Z  ZS )4BasicTransformerBlocka  
    A basic Transformer block.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (:
            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:
            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` *optional*, defaults to False):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, *optional*, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
    r8   Nr4   FTr[   h㈵>defaultr$   rp   rq   ra   r6   num_embeds_ada_normattention_biasonly_cross_attentiondouble_self_attentionupcast_attentionnorm_elementwise_affiner_   norm_epsfinal_dropoutattention_typepositional_embeddingsnum_positional_embeddings-ada_norm_continous_conditioning_embedding_dimada_norm_biasff_inner_dimff_biasattention_out_biasc              
      s  t    || _|| _|| _|| _|| _|| _|| _|
| _	|| _
|| _|| _|	| _|d uo0|dk| _|d uo9|dk| _|dk| _|dk| _|dk| _|dv r]|d u r]td| d| d	|| _|| _|rm|d u rmtd
|dkryt||d| _nd | _|dkrt||| _n#|dkrt||| _n|dkrt|||||d| _n	tj|||d| _t||||||	r|nd ||d| _|d us|
r|dkrt||| _ n|dkrt|||||d| _ nt|||| _ t||
s|nd ||||||d| _!n|dkrt|||| _ nd | _ d | _!|dkrt|||||d| _"n|dv r't|||| _"n|dkr/d | _"t#||||||d| _$|dksD|dkrLt%||||| _&|dkr_t't()d||d  | _*d | _+d| _,d S )NrY   ada_normada_norm_singler[   ada_norm_continuousr   rY   `norm_type` is set to w, but `num_embeds_ada_norm` is not defined. Please make sure to define `num_embeds_ada_norm` if setting `norm_type` to .\If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined.
sinusoidalmax_seq_lengthrms_normrf   r.   r2   r3   dropoutr^   ra   r   out_biasr.   ra   r2   r3   r   r^   r   r   )rY   r   r[   layer_norm_i2vgenr   r6   r   	inner_dimr^   gatedzgated-text-image   g      ?r   )-r:   r;   r$   rp   rq   r   ra   r6   r   r   r   r   r   r   use_ada_layer_norm_zerouse_ada_layer_normuse_ada_layer_norm_singleuse_layer_normuse_ada_layer_norm_continuousr&   r_   r   r   	pos_embedr   rA   r   r   r   r@   r   attn1rB   attn2norm3r?   r   r-   fuserrD   r'   randnscale_shift_tablern   ro   )rG   r$   rp   rq   r   ra   r6   r   r   r   r   r   r   r_   r   r   r   r   r   r   r   r   r   r   rH   r   r!   r;      s   



	
	




	



zBasicTransformerBlock.__init__r   r   c                 C   rs   rt   ru   rv   r   r   r!   rw     rx   z,BasicTransformerBlock.set_chunk_feed_forwardr   attention_maskry   encoder_attention_masktimestepcross_attention_kwargsclass_labelsadded_cond_kwargsrL   c	                 C   sp  |d ur| dd d urtd |jd }	| jdkr"| ||}
n\| jdkr7| j||||jd\}
}}}}nG| jdv rB| |}
n<| jdkrP| ||d	 }
n.| jd
krz| jd  ||	dd j	ddd\}}}}}}| |}
|
d|  | }
nt
d| jd ur| |
}
|d ur| ni }|dd }| j|
f| jr|nd |d|}| jdkr|d| }n	| jd
kr|| }|| }|jdkr|d}|d ur| ||d }| jd ur2| jdkr| ||}
n&| jdv r| |}
n| jd
kr|}
n| jdkr| ||d	 }
nt
d| jd ur"| jd
kr"| |
}
| j|
f||d|}|| }| jdkrA| ||d	 }
n| jd
ksL| |}
| jdkrf|
d|d d d f   |d d d f  }
| jd
kry| |}
|
d|  | }
| jd urt| j|
| j| j}n| |
}| jdkr|d| }n
| jd
kr|| }|| }|jdkr|d}|S )NscaleSPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r   r   rY   )hidden_dtype)r[   r   r   pooled_text_embr   r   r   r#   zIncorrect norm usedgligenry   r      rK   )rY   r[   r   zIncorrect norm)getloggerwarningr%   r_   rA   dtyper   reshaper)   r&   r   copypopr   r   r|   ndimsqueezer   r   rB   r   rn   r,   r   ro   )rG   r   r   ry   r   r   r   r   r   
batch_sizer}   r~   r   r   r   	shift_msa	scale_msagligen_kwargsr   r+   r   r   r!   rO     s   



















(


zBasicTransformerBlock.forward)r8   Nr4   NFFFFTr[   r   Fr   NNNNNTTr   )NNNNNNN)rP   rQ   rR   rS   rT   r   strboolfloatr;   rw   r'   rU   
LongTensorr   r   rO   rV   r   r   rH   r!   r      s    '	
 )
	
r   c                
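# Usage sketch (illustrative sizes): the default "layer_norm" configuration is a plain
# pre-norm self-attention + cross-attention + feed-forward block, as used in the Stable
# Diffusion UNet spatial transformers.
#
#     block = BasicTransformerBlock(dim=320, num_attention_heads=8, attention_head_dim=40,
#                                   cross_attention_dim=768)
#     x = torch.randn(2, 4096, 320)   # flattened latent tokens
#     ctx = torch.randn(2, 77, 768)   # text-encoder hidden states
#     out = block(x, encoder_hidden_states=ctx)   # same shape as x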
class LuminaFeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        hidden_size (`int`):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        intermediate_size (`int`): The intermediate dimension of the feedforward layer.
        multiple_of (`int`, *optional*): Value to ensure hidden dimension is a multiple
            of this value.
        ffn_dim_multiplier (float, *optional*): Custom multiplier for hidden
            dimension. Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        inner_dim: int,
        multiple_of: Optional[int] = 256,
        ffn_dim_multiplier: Optional[float] = None,
    ):
        super().__init__()
        inner_dim = int(2 * inner_dim / 3)
        # custom dim factor multiplier
        if ffn_dim_multiplier is not None:
            inner_dim = int(ffn_dim_multiplier * inner_dim)
        # round up to the next multiple of `multiple_of`
        inner_dim = multiple_of * ((inner_dim + multiple_of - 1) // multiple_of)

        self.linear_1 = nn.Linear(dim, inner_dim, bias=False)
        self.linear_2 = nn.Linear(inner_dim, dim, bias=False)
        self.linear_3 = nn.Linear(dim, inner_dim, bias=False)
        self.silu = FP32SiLU()

    def forward(self, x):
        return self.linear_2(self.silu(self.linear_1(x)) * self.linear_3(x))
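# The three projections implement a SwiGLU-style gate: silu(W1 x) * (W3 x), projected
# back through W2, with the SiLU computed in float32 for numerical stability. A minimal
# shape check (sizes are illustrative):
#
#     ffn = LuminaFeedForward(dim=2304, inner_dim=4 * 2304)
#     x = torch.randn(2, 256, 2304)
#     assert ffn(x).shape == x.shape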
 fdd	Zd
ee fddZ	ddej	dedeej	 dej	fddZ
  ZS )TemporalBasicTransformerBlocka  
    A basic Transformer block for video like data.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    Nr$   time_mix_inner_dimrp   rq   ra   c                    s   t    ||k| _t|| _t||dd| _t|| _t	|||d d| _
|d ur;t|| _t	||||d| _nd | _d | _t|| _t|dd| _d | _d | _d S )Nr4   )rh   r6   )r.   r2   r3   ra   )r.   ra   r2   r3   r5   )r:   r;   is_resr   r@   norm_inr?   ff_inrA   r   r   rB   r   r   r   rn   ro   )rG   r$   r   rp   rq   ra   rH   r   r!   r;   _  s:   



z&TemporalBasicTransformerBlock.__init__r   c                 K   s   || _ d| _d S )Nr   ru   )rG   r   kwargsr   r   r!   rw     s   
z4TemporalBasicTransformerBlock.set_chunk_feed_forwardr   
num_framesry   rL   c                 C   sv  |j d }|j \}}}|| }|d d d f ||||}|dddd}||| ||}|}| |}| jd urEt| j|| j| j}n| |}| jrQ|| }| 	|}	| j
|	d d}
|
| }| jd urv| |}	| j|	|d}
|
| }| |}	| jd urt| j|	| j| j}n| |	}| jr|| }n|}|d d d f ||||}|dddd}||| ||}|S )Nr   r   r   r   )ry   )r%   r   permuter   rn   r,   r   ro   r   rA   r   r   rB   r   r   )rG   r   r   ry   r   batch_frames
seq_lengthchannelsresidualr}   r   r+   r   r   r!   rO     s>   










z%TemporalBasicTransformerBlock.forwardrt   )rP   rQ   rR   rS   rT   r   r;   rw   r'   rU   rO   rV   r   r   rH   r!   r   R  s2    5
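# Shape-convention sketch (values are illustrative): the block receives
# `[batch * num_frames, seq_len, channels]`, folds the spatial tokens into the batch so
# self-attention mixes information across time, then restores the original layout.
#
#     block = TemporalBasicTransformerBlock(dim=320, time_mix_inner_dim=320,
#                                           num_attention_heads=8, attention_head_dim=40)
#     x = torch.randn(2 * 16, 1024, 320)   # 2 videos, 16 frames, 1024 spatial tokens
#     out = block(x, num_frames=16)        # same shape as x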
class SkipFFTransformerBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        kv_input_dim: int,
        kv_input_dim_proj_use_bias: bool,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        if kv_input_dim != dim:
            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)
        else:
            self.kv_mapper = None

        self.norm1 = RMSNorm(dim, 1e-06)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim,
            out_bias=attention_out_bias,
        )

        self.norm2 = RMSNorm(dim, 1e-06)

        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            out_bias=attention_out_bias,
        )

    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}

        if self.kv_mapper is not None:
            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))

        norm_hidden_states = self.norm1(hidden_states)
        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            **cross_attention_kwargs,
        )
        hidden_states = attn_output + hidden_states

        norm_hidden_states = self.norm2(hidden_states)
        attn_output = self.attn2(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            **cross_attention_kwargs,
        )
        hidden_states = attn_output + hidden_states

        return hidden_states
@maybe_allow_in_graph
class FreeNoiseTransformerBlock(nn.Module):
    r"""
    A FreeNoise Transformer block.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability to use.
        cross_attention_dim (`int`, *optional*):
            The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, defaults to `False`):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, defaults to `False`):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, defaults to `False`):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
        final_dropout (`bool` defaults to `False`):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
        ff_inner_dim (`int`, *optional*):
            Hidden dimension of feed-forward MLP.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in feed-forward MLP.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in attention output project layer.
        context_length (`int`, defaults to `16`):
            The maximum number of frames that the FreeNoise block processes at once.
        context_stride (`int`, defaults to `4`):
            The number of frames to be skipped before starting to process a new batch of `context_length` frames.
        weighting_scheme (`str`, defaults to `"pyramid"`):
            The weighting scheme to use for weighting averaging of processed latent frames. As described in the
            Equation 9. of the [FreeNoise](https://arxiv.org/abs/2310.15169) paper, "pyramid" is the default setting
            used.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout: float = 0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
        context_length: int = 16,
        context_stride: int = 4,
        weighting_scheme: str = "pyramid",
    ):
        super().__init__()
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.dropout = dropout
        self.cross_attention_dim = cross_attention_dim
        self.activation_fn = activation_fn
        self.attention_bias = attention_bias
        self.double_self_attention = double_self_attention
        self.norm_elementwise_affine = norm_elementwise_affine
        self.positional_embeddings = positional_embeddings
        self.num_positional_embeddings = num_positional_embeddings
        self.only_cross_attention = only_cross_attention

        self.set_free_noise_properties(context_length, context_stride, weighting_scheme)

        # We keep these boolean flags for backward-compatibility.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"
        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        self.norm_type = norm_type
        self.num_embeds_ada_norm = num_embeds_ada_norm

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def _get_frame_indices(self, num_frames: int) -> List[Tuple[int, int]]:
        frame_indices = []
        for i in range(0, num_frames - self.context_length + 1, self.context_stride):
            window_start = i
            window_end = min(num_frames, i + self.context_length)
            frame_indices.append((window_start, window_end))
        return frame_indices

    def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid") -> List[float]:
        if weighting_scheme == "pyramid":
            if num_frames % 2 == 0:
                # num_frames = 4 => [1, 2, 2, 1]
                weights = list(range(1, num_frames // 2 + 1))
                weights = weights + weights[::-1]
            else:
                # num_frames = 5 => [1, 2, 3, 2, 1]
                weights = list(range(1, num_frames // 2 + 1))
                weights = weights + [num_frames // 2 + 1] + weights[::-1]
        else:
            raise ValueError(f"Unsupported value for weighting_scheme={weighting_scheme}")
        return weights

    def set_free_noise_properties(
        self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
    ) -> None:
        self.context_length = context_length
        self.context_stride = context_stride
        self.weighting_scheme = weighting_scheme

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0) -> None:
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        if cross_attention_kwargs is not None:
            if cross_attention_kwargs.get("scale", None) is not None:
                logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}

        device = hidden_states.device
        dtype = hidden_states.dtype

        num_frames = hidden_states.size(1)
        frame_indices = self._get_frame_indices(num_frames)
        frame_weights = self._get_frame_weights(self.context_length, self.weighting_scheme)
        frame_weights = torch.tensor(frame_weights, device=device, dtype=dtype).unsqueeze(0).unsqueeze(-1)
        is_last_frame_batch_complete = frame_indices[-1][1] == num_frames

        # Handle out-of-bounds case if num_frames isn't perfectly divisible by context_length.
        # For example, with num_frames=25, context_length=16 and context_stride=4 we get the
        # windows [(0, 16), (4, 20), (8, 24)], and the final window (9, 25) is appended below
        # to cover the remaining frame.
        if not is_last_frame_batch_complete:
            if num_frames < self.context_length:
                raise ValueError(f"Expected {num_frames=} to be greater or equal than {self.context_length=}")
            last_frame_batch_length = num_frames - frame_indices[-1][1]
            frame_indices.append((num_frames - self.context_length, num_frames))

        num_times_accumulated = torch.zeros((1, num_frames, 1), device=device)
        accumulated_values = torch.zeros_like(hidden_states)

        for i, (frame_start, frame_end) in enumerate(frame_indices):
            # Per-window weights; broadcast over the batch and channel dimensions.
            weights = torch.ones_like(num_times_accumulated[:, frame_start:frame_end])
            weights *= frame_weights

            hidden_states_chunk = hidden_states[:, frame_start:frame_end]

            # Notice that normalization is always applied before the real computation in the following blocks.
            # 1. Self-Attention
            norm_hidden_states = self.norm1(hidden_states_chunk)

            if self.pos_embed is not None:
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn1(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
                attention_mask=attention_mask,
                **cross_attention_kwargs,
            )

            hidden_states_chunk = attn_output + hidden_states_chunk
            if hidden_states_chunk.ndim == 4:
                hidden_states_chunk = hidden_states_chunk.squeeze(1)

            # 2. Cross-Attention
            if self.attn2 is not None:
                norm_hidden_states = self.norm2(hidden_states_chunk)

                if self.pos_embed is not None and self.norm_type != "ada_norm_single":
                    norm_hidden_states = self.pos_embed(norm_hidden_states)

                attn_output = self.attn2(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    **cross_attention_kwargs,
                )
                hidden_states_chunk = attn_output + hidden_states_chunk

            # The last window may only partially overlap the preceding one, so accumulate
            # just its trailing, not-yet-covered frames.
            if i == len(frame_indices) - 1 and not is_last_frame_batch_complete:
                accumulated_values[:, -last_frame_batch_length:] += (
                    hidden_states_chunk[:, -last_frame_batch_length:] * weights[:, -last_frame_batch_length:]
                )
                num_times_accumulated[:, -last_frame_batch_length:] += weights[:, -last_frame_batch_length:]
            else:
                accumulated_values[:, frame_start:frame_end] += hidden_states_chunk * weights
                num_times_accumulated[:, frame_start:frame_end] += weights

        hidden_states = torch.where(
            num_times_accumulated > 0, accumulated_values / num_times_accumulated, accumulated_values
        ).to(dtype)

        # 3. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
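# Windowing sketch (values are illustrative): with the defaults (context_length=16,
# context_stride=4), a 24-frame video is processed in overlapping windows
# [(0, 16), (4, 20), (8, 24)], and overlapping frames are averaged using the "pyramid"
# weights [1, 2, ..., 8, 8, ..., 2, 1].
#
#     block = FreeNoiseTransformerBlock(dim=320, num_attention_heads=8, attention_head_dim=40)
#     print(block._get_frame_indices(24))             # [(0, 16), (4, 20), (8, 24)]
#     print(block._get_frame_weights(16, "pyramid"))  # [1, 2, 3, 4, 5, 6, 7, 8, 8, 7, ..., 1]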
class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
        inner_dim=None,
        bias: bool = True,
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim, bias=bias)
        if activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
        elif activation_fn == "swiglu":
            act_fn = SwiGLU(dim, inner_dim, bias=bias)

        self.net = nn.ModuleList([])
        # project in
        self.net.append(act_fn)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
        if final_dropout:
            self.net.append(nn.Dropout(dropout))

    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states