o
    pi}                     @   s\  d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m  m
Z ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZmZmZmZm Z  G dd dej!Z"G dd dej!Z#dej$dej$fddZ%G dd dej!Z&G dd dej!Z'G dd dej!Z(G dd dej!Z)G dd dej!Z*G dd dej!Z+dS )     )partial)OptionalTupleUnionN   )	deprecate   )get_activation)SpatialNorm)Downsample1DDownsample2DFirDownsample2DKDownsample2Ddownsample_2d)AdaGroupNorm)FirUpsample2DKUpsample2D
Upsample1D
Upsample2Dupfirdn2d_nativeupsample_2dc                "       s   e Zd ZdZddddddddd	d
dddddddedee dededededee dededededee dedededee f  fddZ	de
jd e
jd!e
jfd"d#Z  ZS )$ResnetBlockCondNorm2Da)  
    A Resnet block that use normalization layer that incorporate conditioning information.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
            The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NF               ư>swish	ada_group      ?T)out_channelsconv_shortcutdropouttemb_channelsgroups
groups_outepsnon_linearitytime_embedding_normoutput_scale_factoruse_in_shortcutupdownconv_shortcut_biasconv_2d_out_channelsin_channelsr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   c                   s  t    || _|d u r|n|}|| _|| _|| _|| _|| _|
| _|d u r(|}| jdkr7t	||||d| _
n| jdkrCt||| _
ntd| j tj||dddd| _| jdkret	||||d| _n| jdkrqt||| _ntd| j tj|| _|p|}tj||dddd| _t|	| _d  | _| _| jrt|dd	| _n| jrt|ddd
d| _|d u r| j|kn|| _d | _| jrtj||ddd|d| _d S d S )Nr   )r%   spatialz" unsupported time_embedding_norm:    r   kernel_sizestridepaddingFuse_convopr6   r4   namer   r2   r3   r4   bias)super__init__r.   r   use_conv_shortcutr*   r+   r(   r'   r   norm1r
   
ValueErrornnConv2dconv1norm2torchDropoutr!   conv2r	   nonlinearityupsample
downsampler   r   r)   r    )selfr.   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   	__class__ U/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/resnet.pyr=   J   sV   





zResnetBlockCondNorm2D.__init__input_tensortembreturnc                 O   s   t |dks|dd d urd}tdd| |}| ||}| |}| jd urB|jd dkr7| }| }| |}| |}n| jd urQ| |}| |}| 	|}| 
||}| |}| |}| |}| jd uru| |}|| | j }|S )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0@   )lengetr   r?   rH   rI   shape
contiguousrJ   rC   rD   r!   rG   r    r(   )rK   rP   rQ   argskwargsdeprecation_messagehidden_statesoutput_tensorrN   rN   rO   forward   s0   











zResnetBlockCondNorm2D.forward)__name__
__module____qualname____doc__intr   boolfloatstrr=   rE   Tensorr`   __classcell__rN   rN   rL   rO   r   ,   sf    !	
$Kr   c                (       s   e Zd ZdZddddddddd	dd
dddddddddedee dededededee dedededededee	j
 dedee dedededee f& fd d!Zd"e	j
d#e	j
d$e	j
fd%d&Z  ZS )'ResnetBlock2Da9  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
        groups_out (`int`, *optional*, default to None):
            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
            stronger conditioning with scale and shift.
        kernel (`torch.Tensor`, optional, default to None): FIR filter, see
            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
        use_in_shortcut (`bool`, *optional*, default to `True`):
            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
            `conv_shortcut` output.
        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
            If None, same as `out_channels`.
    NFr   r   r   Tr   r   defaultr   )r   r    r!   r"   r#   r$   pre_normr%   r&   skip_time_actr'   kernelr(   r)   r*   r+   r,   r-   r.   r   r    r!   r"   r#   r$   rm   r%   r&   rn   r'   ro   r(   r)   r*   r+   r,   r-   c                   s>  t    |dkrtd|dkrtdd| _|| _|d u r!|n|}|| _|| _|| _|| _|| _	|| _
|| _|d u r>|}tjj|||	dd| _tj||dddd	| _|d ur}| j
d
kret||| _n| j
dkrtt|d| | _ntd| j
 dd | _tjj|||	dd| _tj|| _|p|}tj||dddd	| _t|
| _d  | _| _| jr|dkrd  fdd| _n>|dkrttjddd| _n0t|dd| _n(| jr|dkrd  fdd| _n|dkrttj ddd| _n	t!|dddd| _|d u r| j|kn|| _"d | _#| j"rtj||ddd|d| _#d S d S )Nr   zkThis class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` insteadr/   ziThis class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` insteadT
num_groupsnum_channelsr%   affiner0   r   r1   rl   scale_shiftr   zunknown time_embedding_norm :  fir)r   r0   r0   r   c                       t |  dS N)ro   )r   x
fir_kernelrN   rO   <lambda>%      z(ResnetBlock2D.__init__.<locals>.<lambda>sde_vpg       @nearest)scale_factormodeFr5   c                    rw   rx   )r   ry   r{   rN   rO   r}   -  r~   )r2   r3   r7   r8   r   r:   )$r<   r=   r@   rm   r.   r   r>   r*   r+   r(   r'   rn   rE   rA   	GroupNormr?   rB   rC   Lineartime_emb_projrD   rF   r!   rG   r	   rH   rI   rJ   r   Finterpolater   
avg_pool2dr   r)   r    )rK   r.   r   r    r!   r"   r#   r$   rm   r%   r&   rn   r'   ro   r(   r)   r*   r+   r,   r-   rL   r{   rO   r=      s|   



zResnetBlock2D.__init__rP   rQ   rR   c           
      O   s  t |dks|dd d urd}tdd| |}| |}| |}| jd urA|jd dkr6| }| }| |}| |}n| jd urP| |}| |}| 	|}| j
d urq| jsb| |}| 
|d d d d d d f }| jdkr|d ur~|| }| |}n.| jdkr|d u rtd| j tj|d	d
d\}}| |}|d
|  | }n| |}| |}| |}| |}| jd ur| |}|| | j }	|	S )Nr   rS   rT   rU   rV   rl   rt   z9 `temb` should not be None when `time_embedding_norm` is r   r   )dim)rW   rX   r   r?   rH   rI   rY   rZ   rJ   rC   r   rn   r'   rD   r@   rE   chunkr!   rG   r    r(   )
rK   rP   rQ   r[   r\   r]   r^   
time_scale
time_shiftr_   rN   rN   rO   r`   @  sP   



















zResnetBlock2D.forward)ra   rb   rc   rd   re   r   rf   rg   rh   rE   ri   r=   r`   rj   rN   rN   rL   rO   rk      sx    "	
$drk   tensorrR   c                 C   s   t | jdkr| d d d d d f S t | jdkr'| d d d d d d d f S t | jdkr<| d d d d dd d f S tdt |  d)Nr   r0      r   z`len(tensor)`: z has to be 2, 3 or 4.)rW   rY   r@   )r   rN   rN   rO   rearrange_dimsy  s   r   c                       sb   e Zd ZdZ		ddededeeeeef f dedef
 fd	d
Zde	j
de	j
fddZ  ZS )Conv1dBlocka  
    Conv1d --> GroupNorm --> Mish

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        n_groups (`int`, default `8`): Number of groups to separate the channels into.
        activation (`str`, defaults to `mish`): Name of the activation function.
       mishinp_channelsr   r2   n_groups
activationc                    s>   t    tj||||d d| _t||| _t|| _d S )Nr   r4   )	r<   r=   rA   Conv1dconv1dr   
group_normr	   r   )rK   r   r   r2   r   r   rL   rN   rO   r=     s   
zConv1dBlock.__init__inputsrR   c                 C   s2   |  |}t|}| |}t|}| |}|S N)r   r   r   r   )rK   r   intermediate_reproutputrN   rN   rO   r`     s   


zConv1dBlock.forward)r   r   ra   rb   rc   rd   re   r   r   rh   r=   rE   ri   r`   rj   rN   rN   rL   rO   r     s     r   c                       sh   e Zd ZdZ		ddedededeeeeef f def
 fd	d
Zde	j
de	j
de	j
fddZ  ZS )ResidualTemporalBlock1Da  
    Residual 1D block with temporal convolutions.

    Parameters:
        inp_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        embed_dim (`int`): Embedding dimension.
        kernel_size (`int` or `tuple`): Size of the convolving kernel.
        activation (`str`, defaults `mish`): It is possible to choose the right activation function.
       r   r   r   	embed_dimr2   r   c                    sh   t    t|||| _t|||| _t|| _t||| _	||kr-t
||d| _d S t | _d S )Nr   )r<   r=   r   conv_inconv_outr	   time_emb_actrA   r   time_embr   Identityresidual_conv)rK   r   r   r   r2   r   rL   rN   rO   r=     s   

z ResidualTemporalBlock1D.__init__r   trR   c                 C   s>   |  |}| |}| |t| }| |}|| | S )z
        Args:
            inputs : [ batch_size x inp_channels x horizon ]
            t : [ batch_size x embed_dim ]

        returns:
            out : [ batch_size x out_channels x horizon ]
        )r   r   r   r   r   r   )rK   r   r   outrN   rN   rO   r`     s
   
	

zResidualTemporalBlock1D.forward)r   r   r   rN   rN   rL   rO   r     s     $r   c                	       sZ   e Zd ZdZ			ddedee dedef fd	d
Zddej	dedej	fddZ
  ZS )TemporalConvLayera  
    Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
    https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016

    Parameters:
        in_dim (`int`): Number of input channels.
        out_dim (`int`): Number of output channels.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    Nr   r   in_dimout_dimr!   norm_num_groupsc                    s
  t    |p|}|| _|| _tt||t tj||ddd| _	tt||t t
|tj||ddd| _tt||t t
|tj||ddd| _tt||t t
|tj||ddd| _tj| jd j tj| jd j d S )Nr0   r   r   )r   r   r   r   )r<   r=   r   r   rA   
Sequentialr   SiLUConv3drC   rF   rG   conv3conv4initzeros_weightr;   )rK   r   r   r!   r   rL   rN   rO   r=     s:   




zTemporalConvLayer.__init__r   r^   
num_framesrR   c                 C   s   |d d d f  d|f|jdd   ddddd}|}| |}| |}| |}| |}|| }|ddddd |jd |jd  df|jdd   }|S )Nr   r   r   r   r0   r   )reshaperY   permuterC   rG   r   r   )rK   r^   r   identityrN   rN   rO   r`     s   4



$zTemporalConvLayer.forward)Nr   r   )r   ra   rb   rc   rd   re   r   rg   r=   rE   ri   r`   rj   rN   rN   rL   rO   r     s    $)r   c                	       sZ   e Zd ZdZ			ddedee dedef fd	d
Zdej	dej	dej	fddZ
  ZS )TemporalResnetBlocka  
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    Nr   r   r.   r   r"   r%   c                    s   t    || _|d u r|n|}|| _d}dd |D }tjjd||dd| _tj|||d|d| _	|d ur>t
||| _nd | _tjjd||dd| _tjd	| _tj|||d|d| _td
| _| j|k| _d | _| jr|tj||dddd| _d S d S )Nr   c                 S   s   g | ]}|d  qS )r   rN   ).0krN   rN   rO   
<listcomp>7  s    z0TemporalResnetBlock.__init__.<locals>.<listcomp>r   Trp   r   r1   r   silur   )r<   r=   r.   r   rE   rA   r   r?   r   rC   r   r   rD   rF   r!   rG   r	   rH   r)   r    )rK   r.   r   r"   r%   r2   r4   rL   rN   rO   r=   *  sL   

zTemporalResnetBlock.__init__rP   rQ   rR   c                 C   s   |}|  |}| |}| |}| jd ur:| |}| |d d d d d d d d f }|ddddd}|| }| |}| |}| |}| |}| jd urX| |}|| }|S )Nr   r   r   r0   r   )	r?   rH   rC   r   r   rD   r!   rG   r    )rK   rP   rQ   r^   r_   rN   rN   rO   r`   `  s"   




$





zTemporalResnetBlock.forward)Nr   r   r   rN   rN   rL   rO   r     s    $6r   c                       s   e Zd ZdZ							dded	ee d
ededee dedef fddZ		dde	j
dee	j
 dee	j
 fddZ  ZS )SpatioTemporalResBlocka  
    A SpatioTemporal Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    Nr   r         ?learned_with_imagesFr.   r   r"   r%   temporal_epsmerge_factorswitch_spatial_to_temporal_mixc	           	         sf   t    t||||d| _t|d ur|n||d ur|n|||d ur$|n|d| _t|||d| _d S )N)r.   r   r"   r%   )alphamerge_strategyr   )r<   r=   rk   spatial_res_blockr   temporal_res_blockAlphaBlender
time_mixer)	rK   r.   r   r"   r%   r   r   r   r   rL   rN   rO   r=     s$   
zSpatioTemporalResBlock.__init__r^   rQ   image_only_indicatorc                 C   s   |j d }| ||}|j \}}}}|| }	|d d d f |	||||ddddd}
|d d d f |	||||ddddd}|d urM||	|d}| ||}| j|
||d}|ddddd||||}|S )Nr   r   r   r   r0   r   )	x_spatial
x_temporalr   )rY   r   r   r   r   r   )rK   r^   rQ   r   r   batch_frameschannelsheightwidth
batch_sizehidden_states_mixrN   rN   rO   r`     s$   
**zSpatioTemporalResBlock.forward)Nr   r   Nr   r   F)NN)ra   rb   rc   rd   re   r   rg   rf   r=   rE   ri   r`   rj   rN   rN   rL   rO   r   {  sB    	$r   c                	       s   e Zd ZdZg dZ		ddededef fdd	Zd
e	j
dede	j
fddZ	dde	j
de	j
d
ee	j
 de	j
fddZ  ZS )r   a  
    A module to blend spatial and temporal features.

    Parameters:
        alpha (`float`): The initial value of the blending factor.
        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
            The merge strategy to use for the temporal mixing.
        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
            If `True`, switch the spatial and temporal mixing.
    )learnedfixedr   r   Fr   r   r   c                    s   t    || _|| _|| jvrtd| j | jdkr)| dt|g d S | jdks3| jdkrC| 	dtj
t|g d S td| j )Nzmerge_strategy needs to be in r   
mix_factorr   r   zUnknown merge strategy )r<   r=   r   r   
strategiesr@   register_bufferrE   ri   register_parameterrA   	Parameter)rK   r   r   r   rL   rN   rO   r=     s   


 zAlphaBlender.__init__r   ndimsrR   c                 C   s   | j dkr
| j}|S | j dkrt| j}|S | j dkrg|d u r$tdt| tjdd|jdt| jd }|dkrM|d d d d d d d f }|S |d	kr_|	d
d d d d f }|S td| dt
)Nr   r   r   zMPlease provide image_only_indicator to use learned_with_images merge strategyr   )device).Nr   r0   r   zUnexpected ndims z. Dimensions should be 3 or 5)r   r   rE   sigmoidr@   whererf   onesr   r   NotImplementedError)rK   r   r   r   rN   rN   rO   	get_alpha  s,   



zAlphaBlender.get_alphaNr   r   c                 C   s@   |  ||j}||j}| jrd| }|| d| |  }|S )Nr   )r   ndimtodtyper   )rK   r   r   r   r   rz   rN   rN   rO   r`     s   zAlphaBlender.forward)r   Fr   )ra   rb   rc   rd   r   rg   rh   rf   r=   rE   ri   re   r   r   r`   rj   rN   rN   rL   rO   r     s.    #r   ),	functoolsr   typingr   r   r   rE   torch.nnrA   torch.nn.functional
functionalr   utilsr   activationsr	   attention_processorr
   downsamplingr   r   r   r   r   normalizationr   
upsamplingr   r   r   r   r   r   Moduler   rk   ri   r   r   r   r   r   r   r   rN   rN   rN   rO   <module>   s,    
  =$/G]T