o
    piFA                     @   s   d dl mZmZmZ d dlZd dlmZ d dlZd dl	m
Z d dlmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ejjG dd deZG dd dejZ eG dd dejeeZ!dS )    )OptionalTupleUnionN)
FrozenDict   )ConfigMixinflax_register_to_config)
BaseOutput   )FlaxTimestepEmbeddingFlaxTimesteps)FlaxModelMixin)FlaxCrossAttnDownBlock2DFlaxDownBlock2DFlaxUNetMidBlock2DCrossAttnc                   @   s&   e Zd ZU dZejed< ejed< dS )FlaxControlNetOutputz
    The output of [`FlaxControlNetModel`].

    Args:
        down_block_res_samples (`jnp.ndarray`):
        mid_block_res_sample (`jnp.ndarray`):
    down_block_res_samplesmid_block_res_sampleN)__name__
__module____qualname____doc__jnpndarray__annotations__ r   r   ^/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/controlnet_flax.pyr   !   s   
 
r   c                   @   sZ   e Zd ZU eed< dZeedf ed< ejZ	ej	ed< ddd	Z
d
ejdejfddZdS )#FlaxControlNetConditioningEmbeddingconditioning_embedding_channels       `      .block_out_channelsdtypereturnNc                 C   s   t j| jd dd| jd| _g }tt| jd D ]-}| j| }| j|d  }t j|dd| jd}|| t j|ddd| jd}|| q|| _t j| j	ddt j
 t j
 | jd| _d S )	Nr      r(   r
   r
   r*   )kernel_sizepaddingr%   r
   )r   r   r+   stridesr,   r%   r+   r,   kernel_init	bias_initr%   )nnConvr$   r%   conv_inrangelenappendblocksr   initializers
zeros_initconv_out)selfr8   i
channel_inchannel_outconv1conv2r   r   r   setup4   sD   

z)FlaxControlNetConditioningEmbedding.setupconditioningc                 C   s@   |  |}t|}| jD ]}||}t|}q| |}|S )N)r4   r2   silur8   r;   )r<   rC   	embeddingblockr   r   r   __call__Z   s   



z,FlaxControlNetConditioningEmbedding.__call__r&   N)r   r   r   intr   r$   r   r   float32r%   rB   r   rG   r   r   r   r   r   /   s   
 
&r   c                   @   s  e Zd ZU dZdZeed< dZeed< dZe	e
df ed< d	Zeee	edf f ed
< dZe	edf ed< dZeed< dZeee	edf f ed< dZeeee	edf f  ed< dZeed< dZeed< d	Zeed< ejZejed< dZeed< dZeed< dZe
ed< dZe	edf ed < d!ejd"e fd#d$Z!d1d%d&Z"	'			d2d(ej#d)eej#eef d*ej#d+ej#d,ed-ed.ed"ee$e	e	ej#df ej#f f fd/d0Z%dS )3FlaxControlNetModelu
  
    A ControlNet model.

    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it’s generic methods
    implemented for all models (such as downloading or saving).

    This model is also a Flax Linen [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
    subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its
    general usage and behavior.

    Inherent JAX features such as the following are supported:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        sample_size (`int`, *optional*):
            The size of the input sample.
        in_channels (`int`, *optional*, defaults to 4):
            The number of channels in the input sample.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`):
            The tuple of downsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8):
            The dimension of the attention heads.
        num_attention_heads (`int` or `Tuple[int]`, *optional*):
            The number of attention heads.
        cross_attention_dim (`int`, *optional*, defaults to 768):
            The dimension of the cross attention features.
        dropout (`float`, *optional*, defaults to 0):
            Dropout probability for down, up and bottleneck blocks.
        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
        controlnet_conditioning_channel_order (`str`, *optional*, defaults to `rgb`):
            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
        conditioning_embedding_out_channels (`tuple`, *optional*, defaults to `(16, 32, 96, 256)`):
            The tuple of output channel for each block in the `conditioning_embedding` layer.
    r!   sample_size   in_channels)CrossAttnDownBlock2DrO   rO   DownBlock2D.down_block_typesFonly_cross_attention)i@  i     rS   r$   r   layers_per_block   attention_head_dimNnum_attention_headsrS   cross_attention_dimg        dropoutuse_linear_projectionr%   Tflip_sin_to_cosr   
freq_shiftrgb%controlnet_conditioning_channel_orderr   #conditioning_embedding_out_channelsrngr&   c                 C   s   d| j | j| jf}tj|tjd}tjdtjd}tjdd| jftjd}dd| jd | jd f}tj|tjd}tj	
|\}}	||	d}
| |
||||d S )Nr
   r%   )r
   r(   rU   )paramsrY   rb   )rN   rL   r   zerosrJ   onesint32rX   jaxrandomsplitinit)r<   r`   sample_shapesample	timestepsencoder_hidden_statescontrolnet_cond_shapecontrolnet_cond
params_rngdropout_rngrngsr   r   r   init_weights   s   
z FlaxControlNetModel.init_weightsc                 C   sH  | j }|d d }| jp| j}tj|d ddd| jd| _t|d | j| j	j
d| _t|| jd| _t|d | jd	| _| j}t|trL|ft| j }t|trY|ft| j }g }g }|d }tj|dd
tj tj | jd}|| t| jD ]x\}	}
|}||	 }|	t|d k}|
dkrt||| j| j||	 | | j||	 | jd	}nt||| j| j| | jd}|| t | jD ]}tj|dd
tj tj | jd}|| q|stj|dd
tj tj | jd}|| q}|| _!|| _"|d }t#|| j|d | j| jd| _$tj|dd
tj tj | jd| _%d S )Nr   rM   r'   r*   r)   r-   )r[   r\   ra   )r   r$   VALIDr/   r
   rO   )	rN   out_channelsrY   
num_layersrW   add_downsamplerZ   rR   r%   )rN   ru   rY   rv   rw   r%   )rN   rY   rW   rZ   r%   )&r$   rW   rV   r2   r3   r%   r4   r   r[   configr\   	time_projr   time_embeddingr   r_   controlnet_cond_embeddingrR   
isinstanceboolr6   rQ   rI   r9   r:   r7   	enumerater   rY   rT   rZ   r   r5   down_blockscontrolnet_down_blocksr   	mid_blockcontrolnet_mid_block)r<   r$   time_embed_dimrW   rR   r   r   output_channelcontrolnet_blockr=   down_block_typeinput_channelis_final_block
down_block_mid_block_channelr   r   r   rB      s   	



	
zFlaxControlNetModel.setup      ?rk   rl   rm   ro   conditioning_scalereturn_dicttrainc                    s  | j }|dkrtj|dd}t|tjstj|gtjd}nt|tjr8t|jdkr8|j	tj
d}t|d}| |}	| |	}	t|d}| |}t|d}| |}||7 }|f}
| jD ]!}t|tru|||	|| d\}}n
|||	| d\}}|
|7 }
qb| j||	|| d}d}t|
| jD ]\}}||}||f7 }q|}
| |} fd	d
|
D }
| 9 }|s|
|fS t|
|dS )a  
        Args:
            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
            timestep (`jnp.ndarray` or `float` or `int`): timesteps
            encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
            controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor
            conditioning_scale (`float`, *optional*, defaults to `1.0`): the scale factor for controlnet outputs
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of
                a plain tuple.
            train (`bool`, *optional*, defaults to `False`):
                Use deterministic functions and disable dropout when not training.

        Returns:
            [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
                [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise
                a `tuple`. When returning a tuple, the first element is the sample tensor.
        bgrr
   )axisra   r   )r   r   r(   r
   )deterministicr   c                    s   g | ]}|  qS r   r   ).0rk   r   r   r   
<listcomp>  s    z0FlaxControlNetModel.__call__.<locals>.<listcomp>)r   r   )r^   r   flipr}   r   arrayre   r6   shapeastyperJ   expand_dimsrz   r{   	transposer4   r|   r   r   r   zipr   r   r   )r<   rk   rl   rm   ro   r   r   r   channel_ordert_embr   r   res_samples!controlnet_down_block_res_samplesdown_block_res_sampler   r   r   r   r   rG   :  sF   







zFlaxControlNetModel.__call__rH   )r   TF)&r   r   r   r   rL   rI   r   rN   rQ   r   strrR   r   r~   r$   rT   rV   rW   r   rX   rY   floatrZ   r   rJ   r%   r[   r\   r^   r_   rf   Arrayr   rs   rB   r   r   rG   r   r   r   r   rK   g   sR   
 - 
 	rK   )"typingr   r   r   flax
flax.linenlinenr2   rf   	jax.numpynumpyr   flax.core.frozen_dictr   configuration_utilsr   r   utilsr	   embeddings_flaxr   r   modeling_flax_utilsr   unets.unet_2d_blocks_flaxr   r   r   struct	dataclassr   Moduler   rK   r   r   r   r   <module>   s    8