import functools
import math

import flax.linen as nn
import jax
import jax.numpy as jnp


def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
    """Multi-head dot product attention with a limited number of queries."""
    num_kv, num_heads, k_features = key.shape[-3:]
    v_features = value.shape[-1]
    key_chunk_size = min(key_chunk_size, num_kv)
    query = query / jnp.sqrt(k_features)

    @functools.partial(jax.checkpoint, prevent_cse=False)
    def summarize_chunk(query, key, value):
        attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision)

        max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
        max_score = jax.lax.stop_gradient(max_score)
        exp_weights = jnp.exp(attn_weights - max_score)

        exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision)
        max_score = jnp.einsum("...qhk->...qh", max_score)

        return (exp_values, exp_weights.sum(axis=-1), max_score)

    def chunk_scanner(chunk_idx):
        # slice out the next key chunk along the key_value_length axis
        key_chunk = jax.lax.dynamic_slice(
            operand=key,
            start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0],  # [...,k,h,d]
            slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features],  # [...,k,h,d]
        )

        # slice out the matching value chunk
        value_chunk = jax.lax.dynamic_slice(
            operand=value,
            start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0],  # [...,v,h,d]
            slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features],  # [...,v,h,d]
        )

        return summarize_chunk(query, key_chunk, value_chunk)

    chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size))

    # renormalize each chunk's partial sums against the global running maximum
    global_max = jnp.max(chunk_max, axis=0, keepdims=True)
    max_diffs = jnp.exp(chunk_max - global_max)

    chunk_values *= jnp.expand_dims(max_diffs, axis=-1)
    chunk_weights *= max_diffs

    all_values = chunk_values.sum(axis=0)
    all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0)

    return all_values / all_weights

def jax_memory_efficient_attention(
    query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
):
    r"""
    Flax memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
    https://github.com/AminRezaei0x443/memory-efficient-attention

    Args:
        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
            numerical precision for computation
        query_chunk_size (`int`, *optional*, defaults to 1024):
            chunk size to divide the query array; must divide query_length equally without remainder
        key_chunk_size (`int`, *optional*, defaults to 4096):
            chunk size to divide the key and value arrays; must divide key_value_length equally without remainder

    Returns:
        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
    """
    num_q, num_heads, q_features = query.shape[-3:]

    def chunk_scanner(chunk_idx, _):
        # slice out the next query chunk along the query_length axis
        query_chunk = jax.lax.dynamic_slice(
            operand=query,
            start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0],  # [...,q,h,d]
            slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features],  # [...,q,h,d]
        )

        return (
            chunk_idx + query_chunk_size,  # carry: offset of the next chunk
            _query_chunk_attention(
                query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
            ),
        )

    _, res = jax.lax.scan(
        f=chunk_scanner,
        init=0,  # start counter
        xs=None,
        length=math.ceil(num_q / query_chunk_size),  # stop counter
    )

    return jnp.concatenate(res, axis=-3)  # fuse the chunked result back together
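
# Usage sketch (illustrative only; the shapes below are assumptions, not from
# the original module). Both sequence lengths must be multiples of the chunk
# sizes:
#
#     q = jax.random.normal(jax.random.PRNGKey(0), (1, 1024, 8, 64))
#     kv = jax.random.normal(jax.random.PRNGKey(1), (1, 4096, 8, 64))
#     out = jax_memory_efficient_attention(q, kv, kv)
#     assert out.shape == (1, 1024, 8, 64)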
class FlaxAttention(nn.Module):
    r"""
    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762

    Parameters:
        query_dim (:obj:`int`):
            Input hidden states dimension
        heads (:obj:`int`, *optional*, defaults to 8):
            Number of heads
        dim_head (:obj:`int`, *optional*, defaults to 64):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`

    """

    query_dim: int
    heads: int = 8
    dim_head: int = 64
    dropout: float = 0.0
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        inner_dim = self.dim_head * self.heads
        self.scale = self.dim_head**-0.5

        # Weights were exported with old names {to_q, to_k, to_v, to_out}
        self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
        self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
        self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")

        self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def reshape_heads_to_batch_dim(self, tensor):
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
        tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
        return tensor

    def reshape_batch_dim_to_heads(self, tensor):
        batch_size, seq_len, dim = tensor.shape
        head_size = self.heads
        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
        tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

    def __call__(self, hidden_states, context=None, deterministic=True):
        context = hidden_states if context is None else context

        query_proj = self.query(hidden_states)
        key_proj = self.key(context)
        value_proj = self.value(context)

        if self.split_head_dim:
            b = hidden_states.shape[0]
            query_states = jnp.reshape(query_proj, (b, -1, self.heads, self.dim_head))
            key_states = jnp.reshape(key_proj, (b, -1, self.heads, self.dim_head))
            value_states = jnp.reshape(value_proj, (b, -1, self.heads, self.dim_head))
        else:
            query_states = self.reshape_heads_to_batch_dim(query_proj)
            key_states = self.reshape_heads_to_batch_dim(key_proj)
            value_states = self.reshape_heads_to_batch_dim(value_proj)

        if self.use_memory_efficient_attention:
            query_states = query_states.transpose(1, 0, 2)
            key_states = key_states.transpose(1, 0, 2)
            value_states = value_states.transpose(1, 0, 2)

            # this if statement creates a chunk size for each layer of the unet;
            # the chunk size is equal to the query_length dimension of the deepest layer of the unet
            flatten_latent_dim = query_states.shape[-3]
            if flatten_latent_dim % 64 == 0:
                query_chunk_size = int(flatten_latent_dim / 64)
            elif flatten_latent_dim % 16 == 0:
                query_chunk_size = int(flatten_latent_dim / 16)
            elif flatten_latent_dim % 4 == 0:
                query_chunk_size = int(flatten_latent_dim / 4)
            else:
                query_chunk_size = int(flatten_latent_dim)

            hidden_states = jax_memory_efficient_attention(
                query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
            )

            hidden_states = hidden_states.transpose(1, 0, 2)
        else:
            # compute attentions
            if self.split_head_dim:
                attention_scores = jnp.einsum("b t n h, b f n h -> b n f t", key_states, query_states)
            else:
                attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)

            attention_scores = attention_scores * self.scale
            attention_probs = nn.softmax(attention_scores, axis=-1 if self.split_head_dim else 2)

            # attend to values
            if self.split_head_dim:
                hidden_states = jnp.einsum("b n f t, b t n h -> b f n h", attention_probs, value_states)
                b = hidden_states.shape[0]
                hidden_states = jnp.reshape(hidden_states, (b, -1, self.heads * self.dim_head))
            else:
                hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
                hidden_states = self.reshape_batch_dim_to_heads(hidden_states)

        hidden_states = self.proj_attn(hidden_states)
        return self.dropout_layer(hidden_states, deterministic=deterministic)
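
# Usage sketch (illustrative only; the 320/8/40 sizes are assumptions):
#
#     attn = FlaxAttention(query_dim=320, heads=8, dim_head=40)
#     hidden = jnp.ones((1, 64, 320))
#     params = attn.init(jax.random.PRNGKey(0), hidden)
#     out = attn.apply(params, hidden)  # self-attention, shape (1, 64, 320)
#
# Passing `context` switches the same module to cross-attention, since the
# key/value projections are then applied to the context instead of the
# hidden states.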
class FlaxBasicTransformerBlock(nn.Module):
    r"""
    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
    https://arxiv.org/abs/1706.03762


    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    """

    dim: int
    n_heads: int
    d_head: int
    dropout: float = 0.0
    only_cross_attention: bool = False
    dtype: jnp.dtype = jnp.float32
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False

    def setup(self):
        # self attention (or cross-attention if only_cross_attention is True)
        self.attn1 = FlaxAttention(
            self.dim,
            self.n_heads,
            self.d_head,
            self.dropout,
            self.use_memory_efficient_attention,
            self.split_head_dim,
            dtype=self.dtype,
        )
        # cross attention
        self.attn2 = FlaxAttention(
            self.dim,
            self.n_heads,
            self.d_head,
            self.dropout,
            self.use_memory_efficient_attention,
            self.split_head_dim,
            dtype=self.dtype,
        )
        self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
        self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        # self attention
        residual = hidden_states
        if self.only_cross_attention:
            hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic)
        else:
            hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

        # cross attention
        residual = hidden_states
        hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic)
        hidden_states = hidden_states + residual

        # feed forward
        residual = hidden_states
        hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

        return self.dropout_layer(hidden_states, deterministic=deterministic)
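
# Usage sketch (illustrative only; sizes are assumptions). The block applies
# pre-LayerNorm self-attention, cross-attention, and a GEGLU feed-forward,
# each wrapped in a residual connection:
#
#     block = FlaxBasicTransformerBlock(dim=320, n_heads=8, d_head=40)
#     hidden = jnp.ones((1, 64, 320))
#     context = jnp.ones((1, 77, 768))
#     params = block.init(jax.random.PRNGKey(0), hidden, context)
#     out = block.apply(params, hidden, context)  # (1, 64, 320)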
class FlaxTransformer2DModel(nn.Module):
    r"""
    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
    https://arxiv.org/pdf/1506.02025.pdf


    Parameters:
        in_channels (:obj:`int`):
            Input number of channels
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        depth (:obj:`int`, *optional*, defaults to 1):
            Number of transformer blocks
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_linear_projection (`bool`, defaults to `False`):
            Whether to use a linear (`nn.Dense`) projection instead of a 1x1 convolution for the input and output
            projections
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://arxiv.org/abs/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    """

    in_channels: int
    n_heads: int
    d_head: int
    depth: int = 1
    dropout: float = 0.0
    use_linear_projection: bool = False
    only_cross_attention: bool = False
    dtype: jnp.dtype = jnp.float32
    use_memory_efficient_attention: bool = False
    split_head_dim: bool = False

    def setup(self):
        self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)

        inner_dim = self.n_heads * self.d_head
        if self.use_linear_projection:
            self.proj_in = nn.Dense(inner_dim, dtype=self.dtype)
        else:
            self.proj_in = nn.Conv(
                inner_dim,
                kernel_size=(1, 1),
                strides=(1, 1),
                padding="VALID",
                dtype=self.dtype,
            )

        self.transformer_blocks = [
            FlaxBasicTransformerBlock(
                inner_dim,
                self.n_heads,
                self.d_head,
                dropout=self.dropout,
                only_cross_attention=self.only_cross_attention,
                dtype=self.dtype,
                use_memory_efficient_attention=self.use_memory_efficient_attention,
                split_head_dim=self.split_head_dim,
            )
            for _ in range(self.depth)
        ]

        if self.use_linear_projection:
            self.proj_out = nn.Dense(inner_dim, dtype=self.dtype)
        else:
            self.proj_out = nn.Conv(
                inner_dim,
                kernel_size=(1, 1),
                strides=(1, 1),
                padding="VALID",
                dtype=self.dtype,
            )

        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        batch, height, width, channels = hidden_states.shape
        residual = hidden_states
        hidden_states = self.norm(hidden_states)
        if self.use_linear_projection:
            hidden_states = hidden_states.reshape(batch, height * width, channels)
            hidden_states = self.proj_in(hidden_states)
        else:
            hidden_states = self.proj_in(hidden_states)
            hidden_states = hidden_states.reshape(batch, height * width, channels)

        for transformer_block in self.transformer_blocks:
            hidden_states = transformer_block(hidden_states, context, deterministic=deterministic)

        if self.use_linear_projection:
            hidden_states = self.proj_out(hidden_states)
            hidden_states = hidden_states.reshape(batch, height, width, channels)
        else:
            hidden_states = hidden_states.reshape(batch, height, width, channels)
            hidden_states = self.proj_out(hidden_states)

        hidden_states = hidden_states + residual
        return self.dropout_layer(hidden_states, deterministic=deterministic)
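
# Usage sketch (illustrative only; sizes are assumptions). The model consumes
# NHWC feature maps; `in_channels` should equal `n_heads * d_head` so the
# residual addition lines up:
#
#     model = FlaxTransformer2DModel(in_channels=320, n_heads=8, d_head=40)
#     sample = jnp.ones((1, 32, 32, 320))
#     context = jnp.ones((1, 77, 768))
#     params = model.init(jax.random.PRNGKey(0), sample, context)
#     out = model.apply(params, sample, context)  # (1, 32, 32, 320)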




zFlaxTransformer2DModel.__call__Nr   )r   r   r   r   r   r   r   rW   r   r   r   r   r   r   rZ   rX   rY   rl   r   r   r   r   r   r   H  s   
class FlaxFeedForward(nn.Module):
    r"""
    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
    [`FeedForward`] class, with the following simplifications:
    - The activation function is currently hardcoded to a gated linear unit from:
      https://arxiv.org/abs/2002.05202
    - `dim_out` is equal to `dim`.
    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGEGLU`].

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    dim: int
    dropout: float = 0.0
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # The second linear layer needs to be called net_2 for now to match the index of the Sequential layer
        self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype)
        self.net_2 = nn.Dense(self.dim, dtype=self.dtype)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.net_0(hidden_states, deterministic=deterministic)
        hidden_states = self.net_2(hidden_states)
        return hidden_states
class FlaxGEGLU(nn.Module):
    r"""
    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
    https://arxiv.org/abs/2002.05202.

    Parameters:
        dim (:obj:`int`):
            Input hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    """

    dim: int
    dropout: float = 0.0
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        inner_dim = self.dim * 4
        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.proj(hidden_states)
        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
        return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)