o
    Gi                      @   sH  d dl mZ d dlZd dlmZ d dlm  mZ ddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZmZmZmZmZmZ G dd dejZG dd dejZdej dej fddZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd dejZ&G dd dejZ'dS )    )partialN   )	deprecate   )get_activation)SpatialNorm)Downsample1DDownsample2DFirDownsample2DKDownsample2Ddownsample_2d)AdaGroupNorm)FirUpsample2DKUpsample2D
Upsample1D
Upsample2Dupfirdn2d_nativeupsample_2dc                "       s   e Zd ZdZddddddddd	d
dddddddededB dedededededB dedededededB dededededB f  fddZde	j
d e	j
d!e	j
fd"d#Z  ZS )$ResnetBlockCondNorm2Da)  
    A Resnet block that use normalization layer that incorporate conditioning information.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
            The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NF               ư>swish	ada_group      ?T)out_channelsconv_shortcutdropouttemb_channelsgroups
groups_outepsnon_linearitytime_embedding_normoutput_scale_factoruse_in_shortcutupdownconv_shortcut_biasconv_2d_out_channelsin_channelsr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   c                   s  t    || _|d u r|n|}|| _|| _|| _|| _|| _|
| _|d u r(|}| jdkr7t	||||d| _
n| jdkrCt||| _
ntd| j tj||dddd| _| jdkret	||||d| _n| jdkrqt||| _ntd| j tj|| _|p|}tj||dddd| _t|	| _d  | _| _| jrt|dd	| _n| jrt|ddd
d| _|d u r| j|kn|| _d | _| jrtj||ddd|d| _d S d S )Nr   )r"   spatialz" unsupported time_embedding_norm:    r   kernel_sizestridepaddingFuse_convopr3   r1   namer   r/   r0   r1   bias)super__init__r+   r   use_conv_shortcutr'   r(   r%   r$   r   norm1r   
ValueErrornnConv2dconv1norm2torchDropoutr   conv2r   nonlinearityupsample
downsampler   r	   r&   r   )selfr+   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/resnet.pyr:   I   sV   





zResnetBlockCondNorm2D.__init__input_tensortembreturnc                 O   s   t |dks|dd d urd}tdd| |}| ||}| |}| jd urB|jd dkr7| }| }| |}| |}n| jd urQ| |}| |}| 	|}| 
||}| |}| |}| |}| jd uru| |}|| | j }|S )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0@   )lengetr   r<   rE   rF   shape
contiguousrG   r@   rA   r   rD   r   r%   )rH   rM   rN   argskwargsdeprecation_messagehidden_statesoutput_tensorrK   rK   rL   forward   s0   











zResnetBlockCondNorm2D.forward)__name__
__module____qualname____doc__intboolfloatstrr:   rB   Tensorr]   __classcell__rK   rK   rI   rL   r   +   sf    !	
$Kr   c                (       s   e Zd ZdZddddddddd	dd
dddddddddededB dedededededB dedededededej	dB dededB dededededB f& fd d!Z
d"ej	d#ej	d$ej	fd%d&Z  ZS )'ResnetBlock2Da9  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
            stronger conditioning with scale and shift.
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NFr   r   r   Tr   r   defaultr   )r   r   r   r   r    r!   pre_normr"   r#   skip_time_actr$   kernelr%   r&   r'   r(   r)   r*   r+   r   r   r   r   r    r!   rj   r"   r#   rk   r$   rl   r%   r&   r'   r(   r)   r*   c                   s>  t    |dkrtd|dkrtdd| _|| _|d u r!|n|}|| _|| _|| _|| _|| _	|| _
|| _|d u r>|}tjj|||	dd| _tj||dddd	| _|d ur}| j
d
kret||| _n| j
dkrtt|d| | _ntd| j
 dd | _tjj|||	dd| _tj|| _|p|}tj||dddd	| _t|
| _d  | _| _| jr|dkrd  fdd| _n>|dkrttjddd| _n0t|dd| _n(| jr|dkrd  fdd| _n|dkrttj ddd| _n	t!|dddd| _|d u r| j|kn|| _"d | _#| j"rtj||ddd|d| _#d S d S )Nr   zkThis class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` insteadr,   ziThis class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` insteadT
num_groupsnum_channelsr"   affiner-   r   r.   ri   scale_shiftr   zunknown time_embedding_norm :  fir)r   r-   r-   r   c                       t |  dS N)rl   )r   x
fir_kernelrK   rL   <lambda>$      z(ResnetBlock2D.__init__.<locals>.<lambda>sde_vpg       @nearest)scale_factormodeFr2   c                    rt   ru   )r   rv   rx   rK   rL   rz   ,  r{   )r/   r0   r4   r5   r   r7   )$r9   r:   r=   rj   r+   r   r;   r'   r(   r%   r$   rk   rB   r>   	GroupNormr<   r?   r@   Lineartime_emb_projrA   rC   r   rD   r   rE   rF   rG   r   Finterpolater   
avg_pool2dr	   r&   r   )rH   r+   r   r   r   r   r    r!   rj   r"   r#   rk   r$   rl   r%   r&   r'   r(   r)   r*   rI   rx   rL   r:      s|   



zResnetBlock2D.__init__rM   rN   rO   c           
      O   s  t |dks|dd d urd}tdd| |}| |}| |}| jd urA|jd dkr6| }| }| |}| |}n| jd urP| |}| |}| 	|}| j
d urq| jsb| |}| 
|d d d d d d f }| jdkr|d ur~|| }| |}n.| jdkr|d u rtd| j tj|d	d
d\}}| |}|d
|  | }n| |}| |}| |}| |}| jd ur| jr| }| |}|| | j }	|	S )Nr   rP   rQ   rR   rS   ri   rq   z9 `temb` should not be None when `time_embedding_norm` is r   r   )dim)rT   rU   r   r<   rE   rF   rV   rW   rG   r@   r   rk   r$   rA   r=   rB   chunkr   rD   r   trainingr%   )
rH   rM   rN   rX   rY   rZ   r[   
time_scale
time_shiftr\   rK   rK   rL   r]   ?  sT   



















zResnetBlock2D.forward)r^   r_   r`   ra   rb   rc   rd   re   rB   rf   r:   r]   rg   rK   rK   rI   rL   rh      sx    "	
$drh   tensorrO   c                 C   s   t | jdkr| d d d d d f S t | jdkr'| d d d d d d d f S t | jdkr<| d d d d dd d f S tdt |  d)Nr   r-      r   z`len(tensor)`: z has to be 2, 3 or 4.)rT   rV   r=   )r   rK   rK   rL   rearrange_dims}  s   r   c                       s^   e Zd ZdZ		ddededeeeef B dedef
 fd	d
Zdej	dej	fddZ
  ZS )Conv1dBlocka  
    Conv1d --> GroupNorm --> Mish

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        n_groups (`int`, default `8`): Number of groups to separate the channels into.
        activation (`str`, defaults to `mish`): Name of the activation function.
       mishinp_channelsr   r/   n_groups
activationc                    s>   t    tj||||d d| _t||| _t|| _d S )Nr   r1   )	r9   r:   r>   Conv1dconv1dr   
group_normr   r   )rH   r   r   r/   r   r   rI   rK   rL   r:     s   
zConv1dBlock.__init__inputsrO   c                 C   s2   |  |}t|}| |}t|}| |}|S N)r   r   r   r   )rH   r   intermediate_reproutputrK   rK   rL   r]     s   


zConv1dBlock.forward)r   r   r^   r_   r`   ra   rb   tuplere   r:   rB   rf   r]   rg   rK   rK   rI   rL   r     s     r   c                       sd   e Zd ZdZ		ddedededeeeef B def
 fd	d
Zdej	dej	dej	fddZ
  ZS )ResidualTemporalBlock1Da  
    Residual 1D block with temporal convolutions.

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        embed_dim (`int`): Embedding dimension.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        activation (`str`, defaults `mish`): It is possible to choose the right activation function.
       r   r   r   	embed_dimr/   r   c                    sh   t    t|||| _t|||| _t|| _t||| _	||kr-t
||d| _d S t | _d S )Nr   )r9   r:   r   conv_inconv_outr   time_emb_actr>   r   time_embr   Identityresidual_conv)rH   r   r   r   r/   r   rI   rK   rL   r:     s   

z ResidualTemporalBlock1D.__init__r   trO   c                 C   s>   |  |}| |}| |t| }| |}|| | S )z
        Args:
            inputs : [ batch_size x inp_channels x horizon ]
            t : [ batch_size x embed_dim ]

        returns:
            out : [ batch_size x out_channels x horizon ]
        )r   r   r   r   r   r   )rH   r   r   outrK   rK   rL   r]     s
   
	

zResidualTemporalBlock1D.forward)r   r   r   rK   rK   rI   rL   r     s     $r   c                	       sZ   e Zd ZdZ			ddededB dedef fd	d
ZddejdedejfddZ	  Z
S )TemporalConvLayera  
    Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016

    Parameters:
        in_dim (`int`): Number of input channels.
        out_dim (`int`): Number of output channels.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    Nr   r   in_dimout_dimr   norm_num_groupsc                    s
  t    |p|}|| _|| _tt||t tj||ddd| _	tt||t t
|tj||ddd| _tt||t t
|tj||ddd| _tt||t t
|tj||ddd| _tj| jd j tj| jd j d S )Nr-   r   r   )r   r   r   r   )r9   r:   r   r   r>   
Sequentialr   SiLUConv3dr@   rC   rD   conv3conv4initzeros_weightr8   )rH   r   r   r   r   rI   rK   rL   r:     s:   




zTemporalConvLayer.__init__r   r[   
num_framesrO   c                 C   s   |d d d f  d|f|jdd   ddddd}|}| |}| |}| |}| |}|| }|ddddd |jd |jd  df|jdd   }|S )Nr   r   r   r   r-   r   )reshaperV   permuter@   rD   r   r   )rH   r[   r   identityrK   rK   rL   r]     s   4



$zTemporalConvLayer.forward)Nr   r   )r   r^   r_   r`   ra   rb   rd   r:   rB   rf   r]   rg   rK   rK   rI   rL   r     s    $)r   c                	       sZ   e Zd ZdZ			ddededB dedef fd	d
ZdejdejdejfddZ	  Z
S )TemporalResnetBlocka  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    Nr   r   r+   r   r   r"   c                    s   t    || _|d u r|n|}|| _d}dd |D }tjjd||dd| _tj|||d|d| _	|d ur>t
||| _nd | _tjjd||dd| _tjd	| _tj|||d|d| _td
| _| j|k| _d | _| jr|tj||dddd| _d S d S )Nr   c                 S   s   g | ]}|d  qS )r   rK   ).0krK   rK   rL   
<listcomp>;  s    z0TemporalResnetBlock.__init__.<locals>.<listcomp>r   Trm   r   r.   r   silur   )r9   r:   r+   r   rB   r>   r   r<   r   r@   r   r   rA   rC   r   rD   r   rE   r&   r   )rH   r+   r   r   r"   r/   r1   rI   rK   rL   r:   .  sL   

zTemporalResnetBlock.__init__rM   rN   rO   c                 C   s   |}|  |}| |}| |}| jd ur:| |}| |d d d d d d d d f }|ddddd}|| }| |}| |}| |}| |}| jd urX| |}|| }|S )Nr   r   r   r-   r   )	r<   rE   r@   r   r   rA   r   rD   r   )rH   rM   rN   r[   r\   rK   rK   rL   r]   d  s"   




$





zTemporalResnetBlock.forward)Nr   r   r   rK   rK   rI   rL   r   "  s    $6r   c                       s   e Zd ZdZ							dded	edB d
edededB dedef fddZ		ddej	dej	dB dej	dB fddZ
  ZS )SpatioTemporalResBlocka  
    A SpatioTemporal Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    Nr   r         ?learned_with_imagesFr+   r   r   r"   temporal_epsmerge_factorswitch_spatial_to_temporal_mixc	           	         sf   t    t||||d| _t|d ur|n||d ur|n|||d ur$|n|d| _t|||d| _d S )N)r+   r   r   r"   )alphamerge_strategyr   )r9   r:   rh   spatial_res_blockr   temporal_res_blockAlphaBlender
time_mixer)	rH   r+   r   r   r"   r   r   r   r   rI   rK   rL   r:     s$   
zSpatioTemporalResBlock.__init__r[   rN   image_only_indicatorc                 C   s   |j d }| ||}|j \}}}}|| }	|d d d f |	||||ddddd}
|d d d f |	||||ddddd}|d urM||	|d}| ||}| j|
||d}|ddddd||||}|S )Nr   r   r   r   r-   r   )	x_spatial
x_temporalr   )rV   r   r   r   r   r   )rH   r[   rN   r   r   batch_frameschannelsheightwidth
batch_sizehidden_states_mixrK   rK   rL   r]     s$   
**zSpatioTemporalResBlock.forward)Nr   r   Nr   r   F)NN)r^   r_   r`   ra   rb   rd   rc   r:   rB   rf   r]   rg   rK   rK   rI   rL   r     sB    	$r   c                	       s   e Zd ZdZg dZ		ddededef fdd	Zd
e	j
dede	j
fddZ	dde	j
de	j
d
e	j
dB de	j
fddZ  ZS )r   a  
    A module to blend spatial and temporal features.

    Parameters:
        alpha (`float`): The initial value of the blending factor.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    )learnedfixedr   r   Fr   r   r   c                    s   t    || _|| _|| jvrtd| j | jdkr)| dt|g d S | jdks3| jdkrC| 	dtj
t|g d S td| j )Nzmerge_strategy needs to be in r   
mix_factorr   r   zUnknown merge strategy )r9   r:   r   r   
strategiesr=   register_bufferrB   rf   register_parameterr>   	Parameter)rH   r   r   r   rI   rK   rL   r:     s   


 zAlphaBlender.__init__r   ndimsrO   c                 C   s   | j dkr
| j}|S | j dkrt| j}|S | j dkrg|d u r$tdt| tjdd|jdt| jd }|dkrM|d d d d d d d f }|S |d	kr_|	d
d d d d f }|S td| dt
)Nr   r   r   zMPlease provide image_only_indicator to use learned_with_images merge strategyr   )device).Nr   r-   r   zUnexpected ndims z. Dimensions should be 3 or 5)r   r   rB   sigmoidr=   whererc   onesr   r   NotImplementedError)rH   r   r   r   rK   rK   rL   	get_alpha  s,   



zAlphaBlender.get_alphaNr   r   c                 C   s@   |  ||j}||j}| jrd| }|| d| |  }|S )Nr   )r   ndimtodtyper   )rH   r   r   r   r   rw   rK   rK   rL   r]     s   zAlphaBlender.forward)r   Fr   )r^   r_   r`   ra   r   rd   re   rc   r:   rB   rf   rb   r   r]   rg   rK   rK   rI   rL   r     s.    #r   )(	functoolsr   rB   torch.nnr>   torch.nn.functional
functionalr   utilsr   activationsr   attention_processorr   downsamplingr   r	   r
   r   r   normalizationr   
upsamplingr   r   r   r   r   r   Moduler   rh   rf   r   r   r   r   r   r   r   rK   rK   rK   rL   <module>   s*    
  B$/G]T