o
    GiT                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ ddl	m
Z
 e
eZddefddZejjjddfd	edefd
dZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )    N   )logging   key_chunk_sizec                    s   j dd \} j d t|t  tjtjddfdd fdd	}tjj	|t
d
|d\}}}	tj|	d
dd}
t|	|
 }|tj|dd9 }||9 }|jd
d}t|djd
d}|| S )zBMulti-head dot product attention with a limited number of queries.NF)prevent_csec                    sl   t jd| | d}t j|ddd}tj|}t || }t jd|| d}t d|}||jdd|fS )	Nz...qhd,...khd->...qhk	precisionr   Taxiskeepdimsz...vhf,...qhv->...qhfz...qhk->...qhr   )jnpeinsummaxjaxlaxstop_gradientexpsum)querykeyvalueattn_weights	max_scoreexp_weights
exp_valuesr	    S/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/attention_flax.pysummarize_chunk#   s   z/_query_chunk_attention.<locals>.summarize_chunkc                    s   t jjdgjd  | ddg tjd d  g d}t jjdgjd  | ddg tjd d g d}||S )Nr      r   operandstart_indicesslice_sizes)r   r   dynamic_slicendimlistshape)	chunk_idx	key_chunkvalue_chunk)
k_featuresr   r   	num_headsr   r    
v_featuresr   r   r   chunk_scanner0   s   z-_query_chunk_attention.<locals>.chunk_scannerr   )fxsTr   r   )r)   minr   sqrt	functoolspartialr   
checkpointr   maparanger   r   expand_dimsr   )r   r   r   r
   r   num_kvr0   chunk_valueschunk_weights	chunk_max
global_max	max_diffs
all_valuesall_weightsr   )	r-   r   r   r.   r
   r   r    r/   r   r   _query_chunk_attention   s   

 rC   i   query_chunk_sizec           	   	      s`   j dd \ f	dd}tjj|ddt d\}}tj|ddS )a  
    Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
    https://github.com/AminRezaei0x443/memory-efficient-attention

    Args:
        query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
        key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
        value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
        precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
            numerical precision for computation
        query_chunk_size (`int`, *optional*, defaults to 1024):
            chunk size to divide query array value must divide query_length equally without remainder
        key_chunk_size (`int`, *optional*, defaults to 4096):
            chunk size to divide key and value array value must divide key_value_length equally without remainder

    Returns:
        (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
    r   Nc                    s`   t jjdgjd  | ddg tjd d tg d}|  t| dfS )Nr   r!   r   r"   )r   r   r   r
   r   )r   r   r&   r'   r(   r)   r3   rC   )r*   _query_chunk	r   r   r.   num_qr
   
q_featuresr   rD   r   r   r   r0   f   s    
z5jax_memory_efficient_attention.<locals>.chunk_scannerr   )r1   initr2   lengthr   )r)   r   r   scanmathceilr   concatenate)	r   r   r   r
   rD   r   r0   rE   resr   rG   r   jax_memory_efficient_attentionO   s   
rQ   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZe	ed< d	Z
eed
< d	Zeed< ejZejed< dd Zdd Zdd ZdddZdS )FlaxAttentiona  
    A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762

    Parameters:
        query_dim (:obj:`int`):
            Input hidden states dimension
        heads (:obj:`int`, *optional*, defaults to 8):
            Number of heads
        dim_head (:obj:`int`, *optional*, defaults to 64):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`

    	query_dim   heads@   dim_head        dropoutFuse_memory_efficient_attentionsplit_head_dimdtypec                 C   s   t d | j| j }| jd | _tj|d| jdd| _tj|d| jdd| _	tj|d| jdd| _
tj| j| jdd	| _tj| jd
| _d S )NFlax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.g      Fto_q)use_biasr\   nameto_kto_vto_out_0)r\   r`   rate)loggerwarningrW   rU   scalennDenser\   r   r   r   rS   	proj_attnDropoutrY   dropout_layerself	inner_dimr   r   r   setup   s   zFlaxAttention.setupc                 C   sL   |j \}}}| j}|||||| }t|d}||| ||| }|S N)r   r      r!   r)   rU   reshaper   	transposero   tensor
batch_sizeseq_lendim	head_sizer   r   r   reshape_heads_to_batch_dim      z(FlaxAttention.reshape_heads_to_batch_dimc                 C   sL   |j \}}}| j}||| |||}t|d}||| ||| }|S rr   rt   rw   r   r   r   reshape_batch_dim_to_heads   r~   z(FlaxAttention.reshape_batch_dim_to_headsNTc                 C   s  |d u r|n|}|  |}| |}| |}| jrD|jd }t||d| j| jf}t||d| j| jf}	t||d| j| jf}
n| 	|}| 	|}	| 	|}
| j
r|ddd}|	ddd}	|
ddd}
|jd }|d dkr}t|d }n|d dkrt|d }n|d dkrt|d }nt|}t||	|
|d	d
}|ddd}| |}nL| jrtd|	|}ntd||	}|| j }tj|| jrdndd}| jrtd||
}|jd }t||d| j| j f}ntd||
}| |}| |}| j||dS )Nr   r   rs   r   r   rV         i @  )rD   r   zb t n h, b f n h -> b n f tzb i d, b j d->b i jr   zb n f t, b t n h -> b f n hzb i j, b j d -> b i ddeterministic)r   r   r   r[   r)   r   ru   rU   rW   r}   rZ   rv   intrQ   r   r   rh   ri   softmaxrk   rm   )ro   hidden_statescontextr   
query_projkey_proj
value_projbquery_states
key_statesvalue_statesflatten_latent_dimrD   attention_scoresattention_probsr   r   r   __call__   sT   












zFlaxAttention.__call__)NT)__name__
__module____qualname____doc__r   __annotations__rU   rW   rY   floatrZ   boolr[   r   float32r\   rq   r}   r   r   r   r   r   r   rR      s   
 rR   c                   @   s|   e Zd ZU dZeed< eed< eed< dZeed< dZe	ed< e
jZe
jed	< dZe	ed
< dZe	ed< dd ZdddZdS )FlaxBasicTransformerBlockau  
    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
    https://huggingface.co/papers/1706.03762


    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        only_cross_attention (`bool`, defaults to `False`):
            Whether to only apply cross attention.
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    r{   n_headsd_headrX   rY   Fonly_cross_attentionr\   rZ   r[   c              	   C   s   t d t| j| j| j| j| j| j| j	d| _
t| j| j| j| j| j| j| j	d| _t| j| j| j	d| _tjd| j	d| _tjd| j	d| _tjd| j	d| _tj| jd| _d S )Nr]   r\   )r{   rY   r\   h㈵>)epsilonr\   rd   )rf   rg   rR   r{   r   r   rY   rZ   r[   r\   attn1attn2FlaxFeedForwardffri   	LayerNormnorm1norm2norm3rl   rm   ro   r   r   r   rq   !  s4   
	zFlaxBasicTransformerBlock.setupTc                 C   s   |}| j r| j| |||d}n
| j| ||d}|| }|}| j| |||d}|| }|}| j| ||d}|| }| j||dS Nr   )r   r   r   r   r   r   r   rm   )ro   r   r   r   residualr   r   r   r   A  s   z"FlaxBasicTransformerBlock.__call__NT)r   r   r   r   r   r   rY   r   r   r   r   r   r\   rZ   r[   rq   r   r   r   r   r   r      s   
  r   c                   @   s   e Zd ZU dZeed< eed< eed< dZeed< dZeed< d	Z	e
ed
< d	Ze
ed< ejZejed< d	Ze
ed< d	Ze
ed< dd ZdddZdS )FlaxTransformer2DModela  
    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
    https://huggingface.co/papers/1506.02025


    Parameters:
        in_channels (:obj:`int`):
            Input number of channels
        n_heads (:obj:`int`):
            Number of heads
        d_head (:obj:`int`):
            Hidden states dimension inside each head
        depth (:obj:`int`, *optional*, defaults to 1):
            Number of transformers block
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        use_linear_projection (`bool`, defaults to `False`): tbd
        only_cross_attention (`bool`, defaults to `False`): tbd
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
            enable memory efficient attention https://huggingface.co/papers/2112.05682
        split_head_dim (`bool`, *optional*, defaults to `False`):
            Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
            enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    in_channelsr   r   rs   depthrX   rY   Fuse_linear_projectionr   r\   rZ   r[   c                    s   t d tjddd_jj  jr tj j	d_
ntj dddj	d_
 fd	d
tjD _jrGtj j	d_ntj dddj	d_tjjd_d S )Nr]       r   )
num_groupsr   r   )rs   rs   VALID)kernel_sizestridespaddingr\   c                    s2   g | ]}t  jjjjjjjd qS ))rY   r   r\   rZ   r[   )r   r   r   rY   r   r\   rZ   r[   ).0rE   rp   ro   r   r   
<listcomp>  s    z0FlaxTransformer2DModel.setup.<locals>.<listcomp>rd   )rf   rg   ri   	GroupNormnormr   r   r   rj   r\   proj_inConvranger   transformer_blocksproj_outrl   rY   rm   r   r   r   r   rq   ~  s6   zFlaxTransformer2DModel.setupTc           
      C   s   |j \}}}}|}| |}| jr |||| |}| |}n| |}|||| |}| jD ]	}	|	|||d}q1| jrL| |}|||||}n|||||}| |}|| }| j||dS r   )r)   r   r   ru   r   r   r   rm   )
ro   r   r   r   batchheightwidthchannelsr   transformer_blockr   r   r   r     s"   




zFlaxTransformer2DModel.__call__Nr   )r   r   r   r   r   r   r   rY   r   r   r   r   r   r   r\   rZ   r[   rq   r   r   r   r   r   r   W  s   
 /r   c                   @   H   e Zd ZU dZeed< dZeed< ej	Z
ej
ed< dd Zdd	d
ZdS )r   a  
    Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
    [`FeedForward`] class, with the following simplifications:
    - The activation function is currently hardcoded to a gated linear unit from:
    https://huggingface.co/papers/2002.05202
    - `dim_out` is equal to `dim`.
    - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].

    Parameters:
        dim (:obj:`int`):
            Inner hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    r{   rX   rY   r\   c                 C   s6   t d t| j| j| j| _tj| j| jd| _	d S )Nr]   r   )
rf   rg   	FlaxGEGLUr{   rY   r\   net_0ri   rj   net_2r   r   r   r   rq     s
   zFlaxFeedForward.setupTc                 C   s   | j ||d}| |}|S r   )r   r   )ro   r   r   r   r   r   r     s   
zFlaxFeedForward.__call__Nr   r   r   r   r   r   r   rY   r   r   r   r\   rq   r   r   r   r   r   r     s   
 r   c                   @   r   )r   a  
    Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
    https://huggingface.co/papers/2002.05202.

    Parameters:
        dim (:obj:`int`):
            Input hidden states dimension
        dropout (:obj:`float`, *optional*, defaults to 0.0):
            Dropout rate
        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
            Parameters `dtype`
    r{   rX   rY   r\   c                 C   s>   t d | jd }tj|d | jd| _tj| jd| _	d S )Nr]   r   r   r   rd   )
rf   rg   r{   ri   rj   r\   projrl   rY   rm   rn   r   r   r   rq     s   
zFlaxGEGLU.setupTc                 C   s6   |  |}tj|ddd\}}| j|t| |dS )Nr   r   r   )r   r   splitrm   ri   gelu)ro   r   r   hidden_linearhidden_gelur   r   r   r   	  s   
zFlaxGEGLU.__call__Nr   r   r   r   r   r   r     s   
 
r   )r   )r5   rM   
flax.linenlinenri   r   	jax.numpynumpyr   utilsr   
get_loggerr   rf   r   rC   r   	PrecisionHIGHESTrQ   ModulerR   r   r   r   r   r   r   r   r   <module>   s&   
4
0Yo'