o
    pi@                     @   s   d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZmZ eG dd deZG dd dee
ZdS )    )	dataclass)OptionalTupleUnionN   )ConfigMixinregister_to_config)
BaseOutput   )GaussianFourierProjectionTimestepEmbedding	Timesteps)
ModelMixin   )UNetMidBlock2Dget_down_blockget_up_blockc                   @   s   e Zd ZU dZejed< dS )UNet2DOutputz
    The output of [`UNet2DModel`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output from the last layer of the model.
    sampleN)__name__
__module____qualname____doc__torchTensor__annotations__ r   r   \/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/unets/unet_2d.pyr      s   
 r   c                7       sD  e Zd ZdZe											
																d8deeeeeef f  dedede	de
dede	dee
df dee
df deedf ded ed!ed"e
d#e
d$ed%e
d&ee d'ed(ee d)ed*e
d+e	d,ee
 d-ee d.ee f4 fd/d0Z		d9d1ejd2eejeef d3eej d4e	d5eeef f
d6d7Z  ZS ):UNet2DModela  
    A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) -
            1)`.
        in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
        time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
        freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding.
        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip sin to cos for Fourier time embedding.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
            Tuple of downsample block types.
        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`):
            Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
            Tuple of upsample block types.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
            Tuple of block output channels.
        layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
        mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
        downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
        downsample_type (`str`, *optional*, defaults to `conv`):
            The downsample type for downsampling layers. Choose between "conv" and "resnet"
        upsample_type (`str`, *optional*, defaults to `conv`):
            The upsample type for upsampling layers. Choose between "conv" and "resnet"
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
        norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
        attn_norm_num_groups (`int`, *optional*, defaults to `None`):
            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
            given number of groups. If left as `None`, the group norm layer will only be created if
            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
        norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
        class_embed_type (`str`, *optional*, defaults to `None`):
            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
            `"timestep"`, or `"identity"`.
        num_class_embeds (`int`, *optional*, defaults to `None`):
            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class
            conditioning with `class_embed_type` equal to `None`.
    Nr   F
positionalr   TDownBlock2DAttnDownBlock2Dr"   r"   AttnUpBlock2Dr$   r$   	UpBlock2D   i  i  i  r
   r   conv        silu       h㈵>defaultsample_sizein_channelsout_channelscenter_input_sampletime_embedding_type
freq_shiftflip_sin_to_cosdown_block_types.up_block_typesblock_out_channelslayers_per_blockmid_block_scale_factordownsample_paddingdownsample_typeupsample_typedropoutact_fnattention_head_dimnorm_num_groupsattn_norm_num_groupsnorm_epsresnet_time_scale_shiftadd_attentionclass_embed_typenum_class_embedsnum_train_timestepsc           (         s  t    || _|
d d }t|t|	kr!td| d|	 dt|
t|kr4td|
 d| dtj||
d dd	d
| _|dkrTt|
d dd| _	d|
d  }n#|dkrft
|
d ||| _	|
d }n|dkrwt||
d | _	|
d }t||| _|d u r|d urt||| _n|dkrt||| _n|dkrt||| _nd | _tg | _d | _tg | _|
d }t|D ]1\}}|} |
| }|t|
d k}!t||| |||! ||||d ur|n|||||d}"| j|" qt|
d |||||||d ur|n|
d |||d| _tt|
}#|#d }t|	D ]D\}}$|}%|#| }|#t|d t|
d  } |t|
d k}!t|$|d | ||%||! ||||d urM|n||||d}&| j|& |}%q|d ure|nt|
d d d}'tj|
d |'|d| _t | _tj|
d |ddd
| _ d S )Nr      z\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: r   )r   r   )kernel_sizepaddingfourier   )embedding_sizescaler
   r   learnedtimestepidentityr   )
num_layersr0   r1   temb_channelsadd_downsample
resnet_epsresnet_act_fnresnet_groupsr@   r;   rD   r<   r>   )r0   rU   r>   rW   rX   output_scale_factorrD   r@   rY   attn_groupsrE   )rT   r0   r1   prev_output_channelrU   add_upsamplerW   rX   rY   r@   rD   r=   r>   r,   )num_channels
num_groupseps)!super__init__r/   len
ValueErrornnConv2dconv_inr   	time_projr   	Embeddingr   time_embeddingclass_embeddingIdentity
ModuleListdown_blocks	mid_block	up_blocks	enumerater   appendr   listreversedminr   	GroupNormconv_norm_outSiLUconv_actconv_out)(selfr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   time_embed_dimtimestep_input_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsup_block_typer]   up_blocknum_groups_out	__class__r   r   rc   \   s   

 
zUNet2DModel.__init__r   rR   class_labelsreturn_dictreturnc                 C   sV  | j jr
d| d }|}t|stj|gtj|jd}nt|r1t|jdkr1|d 	|j}|tj
|jd |j|jd }| |}|j	| jd}| |}| jdurx|du r^td| j jdkri| |}| |j	| jd}|| }n| jdu r|durtd	|}	| |}|f}
| jD ]}t|d
r||||	d\}}}	n|||d\}}|
|7 }
q| ||}d}	| jD ]+}|
t|j d }|
dt|j  }
t|d
r|||||	\}}	q||||}q| |}| |}| |}|	dur||	7 }| j jdkr ||jd gdgt|jdd  R }|| }|s&|fS t|dS )a  
        The [`UNet2DModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.

        Returns:
            [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
                returned where the first element is the sample tensor.
        r
   g      ?)dtypedevicer   N)r   z=class_labels should be provided when doing class conditioningrR   zJclass_embedding needs to be initialized in order to use class conditioning	skip_conv)hidden_statestembskip_sample)r   r   rM   r   )r   )configr2   r   	is_tensortensorlongr   rd   shapetoonesr   ri   rk   rl   re   rF   rh   ro   hasattrrp   rq   resnetsrx   rz   r{   r3   reshaper   )r|   r   rR   r   r   	timestepst_embemb	class_embr   down_block_res_samplesdownsample_blockres_samplesupsample_blockr   r   r   forward   sb   














,
zUNet2DModel.forward)Nr   r   Fr   r   Tr    r#   r&   r
   r   r   r(   r(   r)   r*   r+   r,   Nr-   r.   TNNN)NT)r   r   r   r   r   r   r   intr   boolstrfloatrc   r   r   r   r   __classcell__r   r   r   r   r   (   s    3
	


 
r   )dataclassesr   typingr   r   r   r   torch.nnrf   configuration_utilsr   r   utilsr	   
embeddingsr   r   r   modeling_utilsr   unet_2d_blocksr   r   r   r   r   r   r   r   r   <module>   s   