o
    پi$<                     @   s  d dl Z d dlmZ d dlZd dlm  mZ d dlm	Z	m
Z
 G dd dejjZejZG dd dejjZejZddd	Zd
d ZdddZ							dddZ		 				dddZ				d ddZ													 	 					d!deej fddZdS )"    N)Optional)	rearrangerepeatc                   @   $   e Zd Zedd Zedd ZdS )IndexFirstAxisc              	   C   sh   |  | |jdksJ |jd |jdd  | _}| }tt|ddt|d|dj	dg|R  S )N   r      b ... -> b (...)z -> z dd)
save_for_backwardndimshapefirst_axis_dimnumeltorchgatherr   r   reshape)ctxinputindicesother_shape
second_dim r   R/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/testing.pyforward
   s   
zIndexFirstAxis.forwardc                 C   s   | j \}|jdksJ |jdd  }t|d}tj| j|jd g|j|jd}|	dt
|d|jd d| |j| jg|R  d fS )Nr   r   r	   devicedtyper   r
   r   )saved_tensorsr   r   r   r   zerosr   r   r    scatter_r   r   )r   grad_outputr   r   
grad_inputr   r   r   backward   s   
zIndexFirstAxis.backwardN__name__
__module____qualname__staticmethodr   r&   r   r   r   r   r   	   s
    
r   c                   @   r   )IndexPutFirstAxisc                 C   sZ   |  | |jdksJ |jdksJ tj|g|jdd  R |j|jd}|||< |S )Nr   r   r   )r   r   r   r"   r   r   r    )r   valuesr   r   outputr   r   r   r   )   s   
zIndexPutFirstAxis.forwardc                 C   s   | j \}|| }|d d fS N)r!   )r   r$   r   grad_valuesr   r   r   r&   4   s   
zIndexPutFirstAxis.backwardNr'   r   r   r   r   r,   (   s
    

r,   c           	      C   s   |d ur|| n|}|j dtjd}|j dtjd}tj| dd }|  }ttj	|dtjdd}t
t| d|||||fS )Nr   )dimr    F)as_tupler   )r   r   b s ... -> (b s) ...)sumr   int32nonzeroflattenmaxitemFpadcumsumindex_first_axisr   )	hidden_statesattention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr   max_seqlen_in_batch
cu_seqlensr   r   r   unpad_input>   s   rF   c                 C   s   t | ||| }t|d|dS )Nz(b s) ... -> b s ...b)index_put_first_axisr   )r>   r   batchseqlenr.   r   r   r   	pad_inputN   s   rL   randomFc                 C   s   |dv sJ |dkrt j|df| |t jd}n1|dkr2t jt|r"dnd| d | d |df|d}nt jt|r9dnd| d	 | d |df|d}|r_t|D ]}|d
 dkrZd||< qNd|d< tt j| |dd|d|k }|S )N)fullrM   thirdrN   r   r   rM   r      )r         r   zs -> b srG   )r   rN   r5   randintr8   ranger   arange)
max_seqlen
batch_sizer   modezero_lengthslengthsipadding_maskr   r   r   generate_random_padding_maskS   s2   r]   c
           "         sN  |r|rJ | j \ }
}|j d }|j \}}}|j  ||fks$J |j  ||fks/J |d us7|	d ur?|r;J |r?J |d urdt| ||\}}}} fdd}|d urat|d nd }n*t| d}tjd d  tj|jd}d }} fd	d}|d urt|dnd }|d urt|||	\}}}}t|||	^}}nt|d}t|d}tjd d  tj|jd}d }}|r||k sJ |
|ksJ tj|||gdd
}tj| ||gdd
}|d ur fdd}n fdd}|	 
 |||	 
 ||fS |rTtj||gdd
}tj||gdd
}|}|d ur1 fdd} n fdd} |	 
 |	 
 ||||| 	 
 |	 
 ||| fS |}|d urd fdd}!n fdd}!|	 
 |	 
 |	 
 |d ur|	 nd ||||||| 	 
 |	 
 |	 
 |d ur|	 nd |||!fS )Nr   c                       t |  S r/   rL   output_unpadrW   	indices_qseqlen_qr   r   <lambda>   s    zgenerate_qkv.<locals>.<lambda>r3   zb s h d -> (b s) h dr   r   )stepr    r   c                       t | d dS Nz(b s) h d -> b s h drG   r   r`   rW   r   r   re          r1   r   c                    r^   r/   r_   
dqkv_unpadrb   r   r   re          c                    rg   Nz(b s) t h d -> b s t h drG   ri   rm   rj   r   r   re      rk   c                    r^   r/   r_   	dkv_unpadrW   	indices_kseqlen_kr   r   re      ro   c                    rg   rp   ri   rq   rj   r   r   re      rk   c                    r^   r/   r_   dk_unpadrs   r   r   re      ro   c                    rg   rh   ri   rv   rj   r   r   re      ro   )r   rF   r   r   rU   r5   r   allstackdetachrequires_grad_)"qkvquery_padding_maskkey_padding_maskqvkvpacked	qkvpackedquery_unused_maskkey_unused_masknheadsr   d_v_nheads_kq_unpadcu_seqlens_qmax_seqlen_q	seqused_qoutput_pad_fnqv_unpadk_unpadcu_seqlens_kmax_seqlen_k	seqused_kv_unpad	qkv_unpadqkvdqkv_pad_fnkv_unpadkv	dq_pad_fn
dkv_pad_fn	dk_pad_fnr   )rW   rt   rc   ru   rd   r   generate_qkvq   s   

















r   NNc              	   C   s<  t tj| |tjdd}tj||tjd}	|d ur3t |d}t|	d|jd d}	t|	|k|	| d}	|d u r9|nt |dd}
|d u rG| nt |dd}|d d u ra|	||
 | |d	  kS |d u rkt|	|n|
}
|d	 d u rx|	|
k}n|	t	||
 | |d	  |
k}t
|t|	||
 | |d  k |	|kS )
Nr   s -> s 1b -> b 1 1 1s -> b 1 1 sr   rG           r   r   )r   r   rU   longr   r   wherer4   	full_likeminimum
logical_orlogical_and)rd   ru   window_sizesink_token_lengthr   r   key_leftpadr   row_idxcol_idxsksqlocal_mask_leftr   r   r   construct_local_mask   s4   


 r   c                 C   s   t tj| |tjdd}tj||tjd}|d ur3t |d}t|d|jd d}t||k|| d}|d u r9|nt |dd}	|d u rG| nt |dd}
|d u rYt||n|	}	||	 |
 ||	 |
 |  }t	||k ||| kS )	Nr   r   r   r   r   rG   r   r   )
r   r   rU   r   r   r   r   r4   r   r   )rd   ru   attention_chunkr   r   r   r   r   r   r   r   col_limit_left_chunkr   r   r   construct_chunk_mask  s&   	
r           Tlearnable_sinkc           (   
   C   s  |	r|d df}| j }|r%|  | | } }}|
d ur#|
 nd }
|d urRt|d| jd |jd  d}|  | | j } |
d urP|
 | |
j nd }
|d urd| t|d j|j d}|d urv| t|d j|j d}| jd |jd }}t|d| jd |jd  d}t|d| jd |jd  d}| jd	 }|jd	 }d
t|
d u r|n||  }|st	d| | |}n	t	d| || }|
d ur|t	d|
| | }|dkrt
|| | }|d ur|t| dtd d }|d d us	|d d urt|||||||| jd}|dkr4t||||||| jd}|d ur2t||n|}|d urA||td |d urJ|| }|d u r[tj|d	d|j }n6|tj} tj| d	dd}!t|d}t||!}"t| |" }#|#jd	ddt||"  }$|#|$ |j }|d ur|t| dd}|d ur|t| dd}|d ur|tj|d	ddd}d
d|  }%|d ur|| d}&n|}&|d ur|&||&j }&t	d|&||% }'|d ur|'t| dd |'j|d|j|dfS )Nr   zb h -> b 1 (h g) 1r   )gzb h -> b 1 h 1)r    r   zb s h d -> b s (h g) dr   g      ?zbthd,bshd->bhtszb s -> b 1 1 sz-inf)r   r   rl   T)r1   keepdimz
h -> h 1 1zb s -> b 1 s 1r   zbhts,bshd->bthdzb s -> b s 1 1)r    floatr   r   tor   mathsqrtr   einsumtanhmasked_fill_r   r   r   r   softmaxfloat32amaxmaximumexpr4   masked_fillrx   )(r|   r}   r~   r   r   r   	attn_bias	dropout_pdropout_maskcausalr   	q_descale	k_descale	v_descaler   r   r   r   softcapupcastreorder_opsintermediate_dtypedtype_ogrd   ru   r   dvsoftmax_scalescores
local_mask
chunk_mask	attentionscores_fp32
logits_maxlogits_or_sinks_maxunnormalized_scores
normalizerdropout_scalingattention_dropr.   r   r   r   attention_ref8  s    














r   r/   )rM   F)NNNFFNN)r   r   NNNN)NNNN)NNNNr   NFNNNNr   r   r   Nr   TFN)r   typingr   r   torch.nn.functionalnn
functionalr:   einopsr   r   autogradFunctionr   applyr=   r,   rI   rF   rL   r]   r   r   r   Tensorr   r   r   r   r   <module>   sn    

"
 
.
$