o
    NiU                     @  sr  d dl mZ d dlmZ d dlmZmZ d dlZd dlm	Z	m
Z
 d dlmZmZmZmZ d dlm  mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZ eG dd dZdd Zdd Z dd Z!dd Z"ej#j$d6ddZ%dd Z&dd Z'd d! Z(e(e)Z*d7d#d$Z+		%d8d&d'Z,d(d) Z-d9d+d,Z.	*	-d:d.d/Z/d0d1 Z0d2d3 Z1G d4d5 d5e	Z2dS );    )annotations)partial)TupleCallableN)Module	Parameter)catnneinsumTensor)
namedtuplewraps)version)	dataclass)	rearrangerepeatpackunpackc                   @  sj   e Zd ZU dZded< dZded< dZded< dZded< dZded< dZ	d	ed
< dZ
ded< dd ZdS )IntermediatesNzTensor | Noneqk_similaritiespre_softmax_attnpost_softmax_attnvaluesztuple[Tensor, Tensor] | None	cached_kvz
str | None
layer_typehybrid_hiddenc                 C  s   | j | j| jfS Nr   r   r   )self r    I/home/ubuntu/.local/lib/python3.10/site-packages/x_transformers/attend.pyto_tuple      zIntermediates.to_tuple)__name__
__module____qualname__r   __annotations__r   r   r   r   r   r   r"   r    r    r    r!   r      s   
 r   c                 C  s   | d uS r   r    )valr    r    r!   exists#   s   r)   c                 C  s   t | r| S |S r   )r)   )r(   dr    r    r!   default&   r#   r+   c                  G  s   t g tt| dkS )N   )summapint)boolsr    r    r!   at_most_one_of)   s   r1   c                 C  s   g t t| S r   )filterr)   )arrr    r    r!   compact,   s   r4   tr   valuefloatc                 C  s   | |   | S r   )tanh)r5   r6   r    r    r!   	softclamp/   s   r9   c                 C  s   t | g|S r   )r   )r5   patternr    r    r!   pack_one3   s   r;   c                 C  s   t | ||d S )Nr   )r   )r5   psr:   r    r    r!   
unpack_one6   r#   r=   c                   s   d t  fdd}|S )NFc                   s    rd S d | S NTr    )xcalledfnr    r!   inner;   s   zonce.<locals>.innerr   )rB   rC   r    r@   r!   once9   s   rD   	intermedsc                 C  s4   | j jdd}| jjddd}|d|}t|dS )NdimTrH   keepdimzb h i 1 -> b h i)r   log_softmaxr   argmaxgatherr   )rE   	log_probsone_hotlog_probr    r    r!   log_prob_from_hard_attendH   s   
rQ   Tc                 C  s   g | j dd  | jR \}}}t|| d d df }t|}|r-| }d|d| f< tj||d}||krDtj||| dfdd}d| | }tj|ddd}|j	dd	}| t
|d
 S )Nr           .device      ?r6   )r   r   r,   rF   rG   zb i j -> b 1 i j)shaperU   r+   Freluclonetorcheyepadcumsumr   )simsim_head_gateno_mask_sosijrU   gater]   r    r    r!   selective_attnT   s    
rf   c                 C  sV   |j dkrt|d| jd d}t| d\} }t|d\}}t| |d }t||dS )	N   zb j d -> b h j dr,   hz* i dz* j d   z* i j)ndimr   rX   r;   r\   cdistr=   )qkpacked_shape_l2_dist_squaredr    r    r!   qk_l2_dist_squaredo   s   
rr   rV   c                 C  sB   | j ddd}t| d|d}| | jdd}|| |  S )NrF   TrI   rV   rG   )rL   r\   
zeros_likescattersoftmaxdetach)logitstemperatureone_hot_indicesrO   	soft_attnr    r    r!   one_hot_straight_through{   s   r{   Fc                 C  s   | }t | jj }| j|dd\}}| |ddd f k| |k@ }| | |} | jdd}	|s2|	S || jdd}
|	 |
 |
  S )NrF   rG   .)r\   finfodtypemaxtopkmasked_fillru   rv   )rw   sparse_topkrx   straight_throughorig_logits
mask_value
top_valuesrp   sparse_topk_mask	topk_attnrz   r    r    r!   sparse_topk_attn   s   r   c                 C  s$   t j| |f|t jd||  d S )N)rU   r}   r,   )r\   onesbooltriu)rc   rd   rU   r    r    r!   create_causal_mask   s   $r   c                 C  s>   t j| |d}t|dt|dk }tj|||  dfdd}|S )NrT   zi -> i 1zj -> 1 jr   FrW   )r\   aranger   rY   r^   )rc   rd   rU   rcausal_maskr    r    r!   onnx_create_causal_mask   s   r   c                      s   e Zd Zddddddddddddddddddddddddddeddddddd fddZ			dddZ				dddZ  ZS )AttendrS   FNrV   Tg      I@)enable_flashenable_mathenable_mem_efficient)dropoutcausalheadspre_talking_headspost_talking_headspre_scale_post_talking_headsr   sparse_topk_straight_throughscaleqk_norml2_distancesigmoidgumbel_softmaxgumbel_softmax_tempgumbel_softmax_hard
cog_signedcustom_attn_fnflashsoftclamp_logitslogit_softclamp_valueadd_zero_kvhead_learned_sink	selectivehardcopeonnxable
sdp_kwargsflash_pack_seqr   Callable | Noner   dictc          %        s  t    |	| _|| _|rtnt| _t|}|r|rJ d|r&|r&J d|r.|r.J dt|||||s8J t|r@|| _n8|rGt	j
| _n1|rMt| _n+|rXtt||d| _n |rett	jd||d| _ntt	jdd}|
sut|tjdn|| _|| _t|| _|r|s|s|rJ d	|rtj||d
ddnd | _|rtj||d
ddnd | _|rtj||d
ddnd | _t| jrtj| jj t| jrtj| jj t| jrtj| jj |r|rJ d|r|sJ d|| _|r|rJ d|| _|| _|| _ |r|rJ d|| _!|rt"t#|nd | _$|r-|r&J d|dks-J || _%|| _&|| _'|| _(|| _)t*+tj,}|rQ|t*+dk rQJ d| j(r| j)rzddl-m.}  | | _.W n t/yp   t/dw tj01 \}!}"|!dksJ d|! |" d|t*+dkrddl2m3}# t4|#j5|#j6|#j7|#j8d  fdd|9 D }$ttjj:j;|$| _<d S ttj=j0j>fi || _<d S d S ) Nz)sigmoid attention not available for flashz&hard attention not available for flashz&topk attention not available for flash)r   r   rF   )rH   taur   rG   )r}   z1talking heads not compatible with flash attentionr,   F)biasz2selective attention cannot work on flash attentionz2selective attention is designed for autoregressivez%cog attention not available for flashz%not supported for flash attention yetz=flash attention not compatible with logit softclamp value yetrS   z2.0.0zGin order to use flash attention, you must be using pytorch 2.0 or abover   )flash_attn_varlen_funcztblock masking with Flash Attention requires the flash-attn package. Please install it with `pip install flash-attn`.   z]block masking with Flash Attention requires SM80+ (Ampere or newer) GPUs, but your GPU has SM.z2.3)
SDPBackend)r   r   r   enable_cudnnc                   s   g | ]
\}}|r | qS r    r    ).0
enable_strenablestr_to_backendr    r!   
<listcomp>I  s    z#Attend.__init__.<locals>.<listcomp>)?super__init__r   r   r   r   r)   r1   attn_fnrY   r   r{   r   r   r   ru   r\   float32r   r	   Dropoutattn_dropoutConv2dpre_softmax_talking_headspost_softmax_talking_headsr   initdirac_weightr   r   r   r   r   r   zeroshead_attn_sinkr   r   r   r   r   r   parse__version__
flash_attnr   ImportErrorcudaget_device_capabilitytorch.nn.attentionr   r   FLASH_ATTENTIONEFFICIENT_ATTENTIONMATHCUDNN_ATTENTIONitems	attentionsdpa_kernelsdp_context_managerbackends
sdp_kernel)%r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   is_sparse_topk_attn
softmax_fntorch_versionr   majorminorr   sdpa_backends	__class__r   r!   r      s   
$




zAttend.__init__c              	   C  s  g |j |j d |j|jR \}}}	}
}}}|jdkr't|d|j d d}|jdkr6t|d|j d d}| jrm|jdddd	 }tj|d
dd}t	||fdd}|jdddd	 }t	d	| |fdd}tj|d
dd}t
| jr|j d d }|| j|  }| j}|	dkr|rd}t
|r|jdksJ ||||	|}||	kr|r| j|	||d}t
|s| }n|| @ }d}t
|r|r| j|	||d}|| @ }d}d }t
|r|jdd }t
|r|||dd}t|jj }t
|r|| |d	 }n|r| j|	||d}|||d	 }d}|}| jrt
|s%J d|dd }|dd }|j d dkrI|j d dkrI|j d dksYJ d|j  d|j  d|j  t
|sht
|rht
|slJ d|j |j kr|jdkr| s| s| dk r| dk sJ d|r||k sJ d| jd"t|dt|dt|d|| jr| jndd|}t|d}n'|   tj||||| jr| jnd|d }W d    n	1 sw   Y  t
|r| r||d! d}|t  fS )#NrR   rg   zb ... -> b h ...r,   rh   rF   TrI   rj   )r   r,   g      rW   rG         F   rT   zGflash_pack_seq_kwargs must be provided when self.flash_pack_seq is Truecu_seqlens_kcu_seqlens_qr   z4batch size must be 1 for block masking. Shape was q=z, k=z, v=z7mask cannot be passed with cu_seqlens for block maskingzUcu_seqlens_q/k should be same-length 1D cumulative sequence lengths for block maskingzDcausal attention with different cu_seqlens for q and k not supportedz1 h t d ->t h drS   )rm   rn   vr   	dropout_pzt h d -> 1 h t d)	attn_maskr   	is_causal.Nr    )!rX   is_cudarU   rk   r   r   normrY   r^   r   r)   r   r   expandr   anyr\   r|   r}   r~   r   r   getis_floating_pointdiffallr   r   trainingr   r   scaled_dot_product_attentionr   )r   rm   rn   r   mask	attn_biasflash_pack_seq_kwargsbatchr   q_lenrp   k_lenr   rU   	k_norm_sq	q_norm_sqdefault_scaler   r   row_is_entirely_maskedr   r   r   attoutr    r    r!   r   O  s   0






P&Z


zAttend.flash_attnc                   s  |j d |j d |j d |jf\} }	t| j|j d d }
| j}t|r0|jdkr0t|d}|dkr8|r8d}dkrJtdd	 ||fD \}}n k r^t fd
d	||fD \}}| j	rtdd	 ||fD \}}t|rzt
j|ddd}t|rt
j|ddd}| jrt|rJ d| j||||||dS |jdkrdnd}| jstd| d||}nt|| }||
 }t|r|| }| }t| jr| |}t| jr|| | }t|r|| }| jrt|| j}| jr| }| }g |j dd |jR \}}}t|jj }t|r|| |}|r0| j|||	d}|||}d}t|r>|j dd }t| j!rL|| !|| }| j"rTt#|}| j$rt%| j&d|j d |j d d}| jr|| | }}t'||fdd}t'||fdd}|}| (|}|)|}| jr|| }|}| j$r|dddf }| *|}t| j+r| +|}t| jr|| }td| d||}t,|||d }t|r|  r||d! d}||fS )"z
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        rR   r,   rF   r   rj   zb j -> b 1 1 jFc                 s  s    | ]}t |d V  qdS )zb 1 n d -> b n dN)r   r   r5   r    r    r!   	<genexpr>  s    z!Attend.forward.<locals>.<genexpr>c                 3  s"    | ]}t |d   dV  qdS )zb kvh n d -> b (r kvh) n d)r   N)r   r  r   kv_headsr    r!   r    s     c                 s  s     | ]}t j|d ddV  qdS ))r   r   r,   r   rS   rW   N)rY   r^   r  r    r    r!   r    s    )r,   r   TrW   rS   z6residual attention not compatible with flash attention)r   r   r   rg   zb j dzb h j dz	b h i d, z -> b h i jNrT   rG   zh -> b h i 1r   )brc   .z	b h i j, z -> b h i dr   r   )-rX   rU   r+   r   r   r)   rk   r   tupler   rY   r^   r   r   r   r
   rr   r[   r   r   r   r9   r   r   signabsr}   r\   r|   r~   r   r   r   r   r   rf   r   r   r   r   r   typer   r   r   )r   rm   rn   r   r   r   	prev_attnr   nrU   r   r   kv_einsum_eqr`   r   pre_to_post_scalesim_signrc   rd   r}   r   r   r  	attn_sinkattn_sink_signr   attnr   r  intermediatesr    r  r!   forward  s   (
 


 





zAttend.forward)r   r   r   r   )NNN)NNNN)r$   r%   r&   r   r   r   r  __classcell__r    r    r   r!   r      sZ     +
 
r   )r5   r   r6   r7   )rE   r   r>   )rV   )rV   F)3
__future__r   	functoolsr   typingr   r   r\   torch.nnr   r   r   r	   r
   r   torch.nn.functional
functionalrY   collectionsr   r   	packagingr   dataclassesr   einopsr   r   r   r   r   r)   r+   r1   r4   jitscriptr9   r;   r=   rD   print
print_oncerQ   rf   rr   r{   r   r   r   r   r    r    r    r!   <module>   sH    



