o
    GiW                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlm	Z	 ddl
mZmZ ddlmZmZ ddlmZmZ ddlmZ d	d
lmZmZmZmZmZ eeZe jjG dd deZ eG dd dej!eeZ"dS )    N)
FrozenDict   )ConfigMixinflax_register_to_config)
BaseOutputlogging   )FlaxTimestepEmbeddingFlaxTimesteps)FlaxModelMixin   )FlaxCrossAttnDownBlock2DFlaxCrossAttnUpBlock2DFlaxDownBlock2DFlaxUNetMidBlock2DCrossAttnFlaxUpBlock2Dc                   @   s   e Zd ZU dZejed< dS )FlaxUNet2DConditionOutputa  
    The output of [`FlaxUNet2DConditionModel`].

    Args:
        sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    sampleN)__name__
__module____qualname____doc__jnpndarray__annotations__ r   r   a/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/unets/unet_2d_condition_flax.pyr   %   s   
 r   c                   @   s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
edf ed	< d
Ze
edf ed< dZedB ed< dZee
e B ed< dZe
edf ed< dZeed< dZee
edf B ed< dZee
edf B dB ed< dZeed< dZeed< dZeed< ejZejed< dZeed< d Zeed!< dZeed"< dZeed#< d$Zee
edf B ed%< dZ edB ed&< dZ!edB ed'< d(Z"eed)< dZ#edB ed*< d+e$j%d,e&fd-d.Z'd;d/d0Z(					d<d1ej)d2ej)eB eB d3ej)d4e*e&B dB d5e
ej)df dB d6ej)dB d7ed8ed,e+e
ej) B fd9d:Z,dS )=FlaxUNet2DConditionModela  
    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
    shaped output.

    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods
    implemented for all models (such as downloading or saving).

    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
    subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its
    general usage and behavior.

    Inherent JAX features such as the following are supported:
    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        sample_size (`int`, *optional*):
            The size of the input sample.
        in_channels (`int`, *optional*, defaults to 4):
            The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4):
            The number of channels in the output.
        down_block_types (`tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`):
            The tuple of downsample blocks to use.
        up_block_types (`tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer
            is skipped.
        block_out_channels (`tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        attention_head_dim (`int` or `tuple[int]`, *optional*, defaults to 8):
            The dimension of the attention heads.
        num_attention_heads (`int` or `tuple[int]`, *optional*):
            The number of attention heads.
        cross_attention_dim (`int`, *optional*, defaults to 768):
            The dimension of the cross attention features.
        dropout (`float`, *optional*, defaults to 0):
            Dropout probability for down, up and bottleneck blocks.
        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            Enable memory efficient attention as described [here](https://huggingface.co/papers/2112.05682).
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        sample_size   in_channelsout_channels)CrossAttnDownBlock2Dr#   r#   DownBlock2D.down_block_types)	UpBlock2DCrossAttnUpBlock2Dr'   r'   up_block_typesUNetMidBlock2DCrossAttnNmid_block_typeFonly_cross_attention)i@  i     r,   block_out_channelsr   layers_per_block   attention_head_dimnum_attention_headsr,   cross_attention_dimg        dropoutuse_linear_projectiondtypeTflip_sin_to_cosr   
freq_shiftuse_memory_efficient_attentionsplit_head_dimr   transformer_layers_per_blockaddition_embed_typeaddition_time_embed_dim@   addition_embed_type_num_heads%projection_class_embeddings_input_dimrngreturnc                 C   s  d| j | j| jf}tj|tjd}tjdtjd}tjdd| jftjd}tj	
|\}}||d}d }	| jdkrvd| jj | jj | jjk}
|
rKdnd}| jj|| jj  }| j| }|| j }tjd|ftjdtjd|ftjdd}	| |||||	d	 S )
Nr   r5   )r   )paramsr3   	text_time      )text_embedstime_idsrC   )r!   r   r   zerosfloat32onesint32r2   jaxrandomsplitr;   configr<   r?   init)selfr@   sample_shaper   	timestepsencoder_hidden_states
params_rngdropout_rngrngsadded_cond_kwargs
is_refinernum_micro_conditionstext_embeds_dimtime_ids_channelstime_ids_dimsr   r   r   init_weights   s,   




z%FlaxUNet2DConditionModel.init_weightsc                 C   s  t d | j}|d d }| jd urtd| jp| j}tj|d ddd| jd| _	t
|d | j| jjd	| _t|| jd
| _| j}t|trP|ft| j }t|tr]|ft| j }| j}t|trm|gt| j }| jd u rvd | _n/| jdkr| jd u rtd| j dt
| j| j| j| _t|| jd
| _n	td| j dg }|d }t| jD ]G\}}	|}
|| }|t|d k}|	dkrt|
|| j| j|| || | | j || | j!| j"| jd}nt#|
|| j| j| | jd}|$| q|| _%| jj&dkrt'|d | j|d |d | j | j!| j"| jd| _(n| jj&d u r&d | _(n	td| jj& g }t)t*|}t)t*|}t)t*|}|d }t)t*|}t| j+D ]^\}}|}|| }|t,|d t|d  }
|t|d k}|dkrt-|
||| jd || || | | j| j || | j!| j"| jd}nt.|
||| jd | | j| jd}|$| |}qR|| _/tj0ddd| _1tj| j2ddd| jd| _3d S )NzFlax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.r   r    a#  At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19.)r   r   r   r   )r`   r`   )kernel_sizestridespaddingr5   )r6   r7   rB   rD   zaddition_embed_type z2 requires `addition_time_embed_dim` to not be Nonezaddition_embed_type: z must be None or `text_time`.r   r#   )r!   r"   r3   
num_layersr:   r1   add_downsampler4   r+   r8   r9   r5   )r!   r"   r3   rd   re   r5   r)   )r!   r3   r1   r:   r4   r8   r9   r5   zUnexpected mid_block_type r'   )r!   r"   prev_output_channelrd   r:   r1   add_upsampler3   r4   r+   r8   r9   r5   )r!   r"   rg   rd   rh   r3   r5   r   gh㈵>)
num_groupsepsilon)4loggerwarningr-   r1   
ValueErrorr0   nnConvr5   conv_inr
   r6   rP   r7   	time_projr	   time_embeddingr+   
isinstanceboollenr%   intr:   r;   add_embeddingr<   add_time_proj	enumerater   r3   r.   r4   r8   r9   r   appenddown_blocksr*   r   	mid_blocklistreversedr(   minr   r   	up_blocks	GroupNormconv_norm_outr"   conv_out)rR   r-   time_embed_dimr1   r+   r:   r{   output_channelidown_block_typeinput_channelis_final_block
down_blockr   reversed_block_out_channelsreversed_num_attention_heads%reversed_transformer_layers_per_blockup_block_typerg   up_blockr   r   r   setup   s  

	





	




zFlaxUNet2DConditionModel.setupr   rT   rU   rY   down_block_additional_residualsmid_block_additional_residualreturn_dicttrainc	                 C   s  t |tjstj|gtjd}nt |tjr*t|jdkr*|jtjd}t	|d}| 
|}	| |	}	d}
| jdkr|du rLtd| j d| j d|d}|du r]t| j d	|d
}|du rnt| j d| t|}t||jd df}tj||gdd}| |}
|
dur|	|
 n|	}	t|d}| |}|f}| jD ]!}t |tr|||	|| d\}}n
|||	| d\}}||7 }q|durd}t||D ]\}}||7 }||f7 }q|}| jdur| j||	|| d}|dur||7 }| jD ]2}|| jd  d }|d| jd   }t |tr)|||	||| d}q|||	|| d}q| |}t |}| !|}t|d}|sO|fS t"|dS )a  
        Args:
            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
            timestep (`jnp.ndarray` or `float` or `int`): timesteps
            encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
            added_cond_kwargs: (`dict`, *optional*):
                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
                are passed along to the UNet blocks.
            down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
                A tuple of tensors that if specified are added to the residuals of down unet blocks.
            mid_block_additional_residual: (`torch.Tensor`, *optional*):
                A tensor that if specified is added to the residual of the middle unet block.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of
                a plain tuple.
            train (`bool`, *optional*, defaults to `False`):
                Use deterministic functions and disable dropout when not training.

        Returns:
            [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
            [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is the sample tensor.
        rB   r   NrD   z1Need to provide argument `added_cond_kwargs` for z! when using `addition_embed_type=`rG   z has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`rH   z has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`rf   )axis)r   r   r   r   )deterministicr   r   )tembrU   res_hidden_states_tupler   )r   r   r   )r   r   r   r   )r   )#rs   r   r   arrayrL   ru   shapeastyperJ   expand_dimsrq   rr   r;   rm   	__class__getrx   ravelreshapeconcatenaterw   	transposerp   r{   r   zipr|   r   r.   r   r   rn   silur   r   )rR   r   rT   rU   rY   r   r   r   r   t_embaug_embrG   rH   time_embeds
add_embedsdown_block_res_samplesr   res_samplesnew_down_block_res_samplesdown_block_res_sampledown_block_additional_residualr   r   r   r   __call__Q  s   #


















z!FlaxUNet2DConditionModel.__call__)rA   N)NNNTF)-r   r   r   r   r   rv   r   r!   r"   r%   tuplestrr(   r*   r+   rt   r-   r.   r0   r1   r2   r3   floatr4   r   rJ   r5   r6   r7   r8   r9   r:   r;   r<   r>   r?   rM   Arrayr   r_   r   r   dictr   r   r   r   r   r   r   2   sj   
 5
  0
	
r   )#flax
flax.linenlinenrn   rM   	jax.numpynumpyr   flax.core.frozen_dictr   configuration_utilsr   r   utilsr   r   embeddings_flaxr	   r
   modeling_flax_utilsr   unet_2d_blocks_flaxr   r   r   r   r   
get_loggerr   rk   struct	dataclassr   Moduler   r   r   r   r   <module>   s   
	