o
    پi                     @   s8   d dl Z d dlmZ d dlm  mZ G dd dZdS )    Nc                   @   s  e Zd Zdeje deje deje deje fddZej	dej
dej
d	ejd
ejdeje deje fddZej	dej
dej
dejd	ejd
ejdejfddZej	dej
dej
dejd	ejd
ejdejfddZej	dej
dej
dejd	ejd
ejdejfddZdS )PackGQAm_block_sizehead_dim_paddedcheck_hdim_oobqhead_per_kvheadc                 C   s   || _ || _|| _|| _d S )N)r   r   r   r   )selfr   r   r   r    r   S/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/pack_gqa.py__init__   s   
zPackGQA.__init__tensorcRowstidxblockthreads_per_rownum_threadsc                 C   s   t t ||}t |tj}t|D ].}	|	| |||  d  }
|| j |
 }|| j }||| j  }t	
|||ff ||	< q|S Nr   )cuteceil_divsizemake_fragmentcutlassInt64range_constexprr   r   utilselem_pointertoint)r   r   r   r   r   r   r   num_ptr_per_threadtPrPtrirowidxm_idxh_idxr   r   r	   compute_ptr   s   

zPackGQA.compute_ptrmQsQgmem_tiled_copyseqlenc              
   C   s  | |}t| j| jf}||}	||}
| d|}tj|
|j	d d}|
d }|j
j	d d }tjj| dksBJ d|j}| |d |||||}tt|	j	d D ]}tj|||  || |d}tj|j|tjjdd	}|d|df d || j || j  |d d  k rt|| jf}t|	j	d d }t||f}tt|	j	d
 D ],}|
dd|f d | }tj||d |f |	d ||f t| jr|d ||f nd d qq\d S Nr      )limit)r   Nr   %threads_per_row must divide WARP_SIZEr   width   assumed_align   )pred)	get_slicer   make_identity_tensorr   r   partition_Dpartition_Sr   predicate_kshapelayout_tv_tiledarch	WARP_SIZEr   r#   r   r   shuffle_syncmake_ptrelement_typeAddressSpacegmemr   make_tensortiled_dividecopy
const_exprr   )r   r$   r%   r&   r   r   r'   gmem_thr_copycQtQsQtQcQt0QcQtQpQtQcQ_rowr   r   tPrQPtrm	q_ptr_i64
q_gmem_ptrmQ_curelems_per_loadmQ_cur_copykkir   r   r	   load_Q+   sD   




zPackGQA.load_QmLSEtLSErLSE	tiled_mmac                 C   s:  | |}t| j| jf}||}	t|	d }
t|t|
ks&J |j	j
d d }tjj| dks:J dt||ksCJ |j}| ||
||||}tt|D ]B}tj|||  || |d}tj|j|tjjdd}|| j |
| d  }|	d d dkr||| j k rt|d}|| |d< qXd S )	Nr   r   r+   r,      r/   r)   )r)   )r3   r   r4   r   r   partition_Cr   make_acc_tensor_mn_viewr   tv_layout_Cr8   r:   r;   r#   r   r   r<   r=   r>   r?   r@   r   rA   )r   rV   rW   rX   r   r   r'   thr_mmacaccOtaccOcOtaccOcO_rowr   r   	tPrLSEPtrrM   lse_ptr_i64lse_gmem_ptrr   	mLSE_copyr   r   r	   	store_LSEX   s2   



zPackGQA.store_LSEmOtOrOc              
   C   s  | |}t| j| jf}||}	| d|}
tj|	|jd d}|	d }|j	jd d }tj
j| dks=J d|j}| |d |||||}tt|jd D ]}tj|||  || |d}tj|j|tjjdd	}|
d|df d || j || j  |d d  k rt|| jf}t|jd d }t||f}tt|jd
 D ],}|	dd|f d | }tj||d ||f |d |f t| jr|d ||f nd d qqWd S r(   )r3   r   r4   r   r   r6   r   r7   r8   r9   r:   r;   r   r#   r   r   r<   r=   r>   r?   r@   r   rA   rB   rC   rD   r   )r   rf   rg   r&   r   r   r'   rE   cOtOcOt0OcOtOpOtOcO_rowr   r   tPrOPtrrM   	o_ptr_i64
o_gmem_ptrmO_currQ   mO_cur_copyrS   rT   r   r   r	   store_O{   sB   



zPackGQA.store_ON)__name__
__module____qualname__r   	Constexprintboolr
   r   jitTensorInt32r#   	TiledCopyrU   TiledMmare   rr   r   r   r   r	   r   
   s    
,"r   )r   cutlass.cuter   flash_attn_origin.cute.utilsr   r   r   r   r   r	   <module>   s   