o
    پi1                     @   s  d dl Z d dlZd dlmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
mZmZ d dlmZ ej	d#dejdedejdejd	ejf
d
dZej		d$dejdedejdejdejdee d	ejfddZej		d%dejdedejdeej dee dejd	ejfddZej					d&dejejB dejdeje deej deej dee dejdee d	ejfddZej					d'dejdeje deej deej dee dee ded	e
e
eej gfddZej	d(dejdeded ed	ejf
d!d"ZdS ))    N)CallableOptional)Int32Int64Float32Boolean
const_expr        valopreduction_bufferinit_valreturnc           
      C   s|   t j t j }}t |jd }|| || }}|dkr&| |||f< t j  |}	||k r7|||f }	t j|	|S )zDreduction_buffer has shape (num_warps / warp_per_row, warps_per_row)   r   )cutearchlane_idxwarp_idxsizeshapebarrierwarp_reduction)
r
   r   r   r   r   r   warps_per_rowrow_idxcol_idxblock_reduce_val r   @/home/ubuntu/.local/lib/python3.10/site-packages/quack/reduce.pyblock_reduce   s   
r   mbar_ptrphasec                 C   sJ  t j }t j t j }}|j\}	\}
}||
 ||
 }}|dkrLt j  |	|
 }t j||| |jj	 d  W d   n1 sGw   Y  ||k rat
j| t
||||ff||d t jj||durk|ndd |}t |
| t jj}t|D ]}||t jj  }|t j|dgdk r|||||f }qt j||S )zRreduction_buffer has shape (num_warps / warps_per_row, (warps_per_row, cluster_n))r      Npeer_cta_rank_in_clusterr    r   mode)r   r   block_idx_in_clusterr   r   r   	elect_onembarrier_arrive_and_expect_txelement_typewidthutilsstore_shared_remoteelem_pointermbarrier_waitceil_div	WARP_SIZEcutlassrange_constexprr   r   )r
   r   r   r   r   r    cta_rank_in_clusterr   r   rows_per_blockr   	cluster_nr   r   	num_warpsr   num_iteriidxr   r   r   cluster_reduce   s8   

r;   c                 C   s0   t |du rt| |||dS t| |||||dS )zPPerform either block or cluster reduction based on whether mbar_ptr is provided.N)r   r    r   )r   r   r;   )r
   r   r   r   r    r   r   r   r   block_or_cluster_reduceE   s   
r=   xthreads_per_rowhook_fnc                 C   s   t t| tjr| j||dd}n| }tjjtjtjj	t | j
tkr&tjjnttjjttjjtji| }	tjj||	t|tjjd}t |durK|  t |durx|jd \}
}|dksd|dusdJ dt |
dkpl|dkrxt||	||||d}|S )zXreduction_buffer must have shape (num_warps / warps_per_row, (warps_per_row, cluster_n))r   r   reduction_profilethreads_in_groupNr   /mbar_ptr must be provided for cluster reductionr<   )r   
isinstancer   	TensorSSAreduceReductionOpADDoperatoraddMAXdtyper   r   fmaxmaxMINminMULmulr   r1   r   r=   )r>   r   r?   r   r   r    r   r@   r
   warp_opr   r6   r   r   r   
row_reduceU   s6   

rV   Freturn_exp_xc                 C   s  | j tks	J d	 tjj| jtjjtj ddtjj	t
|tjjd}ttj}tjj| | ||  dd}	tjj|	jtjjdddtjt
|tjjd}
t|d urX|  t|d ur|j\}\}}|dksr|d usrJ d	t|dkpz|dkr|jtksJ d
tj tj }}|| || }}t|d u r|dkrt||
|||f< tj  tj }d}
||k rt|||f \}}
tj|tjj	}|
tjj|| dd9 }
tj|
tj}
t|r|	tjj|| dd9 }	|}ntj }|dkr+tj  || }tj ||| |jj! d  W d    n	1 s&w   Y  ||k rEtj"t||
t#||||ff||d tjj$||d urP|ndd t%|| tjj}t&|t}|'tj  t&|t}|'d t()|D ]%}||tjj  }|tj*|dgdk rt|||f \||< ||< q{|+ jtjjtj dd}tj|tjj	}d}
t()|D ]}|
|| tjj|| | dd 7 }
qtj|
tj}
t|r|	tjj|| dd9 }	|}||
t|r|	fS d fS )Nzx must be of type Float32r   rA   rC   T)fastmathr	   r   rE   z+reduction_buffer must be of type cute.Int64r!   r"   r$   r%   ),rN   r   r   r   r   rH   rI   rM   infrO   rR   r1   mathlog2eexp2rJ   rK   rL   r   r   r*   r   r   r   r,   f32x2_to_i64r   i64_to_f32x2expr'   r(   r)   r+   r-   r.   r/   r0   make_fragmentfillr2   r3   r   load)r>   r?   r   r   r@   r    rW   max_xlog2_eexp_x	sum_exp_xr5   r   r6   r   r   r   r   max_x_single_warpmax_x_finalr4   r7   r8   sum_exp_x_single_warpr9   r:   r   r   r   online_softmax_reduce~   s   









rk   r       Xelem_per_lanesubwarp_size	warp_sizec              	   C   s  |dkr|dkr|dt t|> ksJ |dkr*|| dkr*|dt t|> ks,J tj | }t| t|} tj| dgd}|d| ksLJ t	j
t t|t t|| ddD ]%}t	j
t| ddD ]}tjj| | d|> | d}| | | | |< qlq`t	t ttj| dgdd ddD ]_}	d|	> }
t	j
|
ddD ]Q}| d	|f }| d	||
 f }t||
@  }t	j
t|ddD ]/}|| || }}|r|n|||< |r|n|||< tjj|| |
| d}|| | ||< qqq| d
 S )aX  
    For warp reduction, we use Swap Shuffle
    The normal way to reduction among threads:
    use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads.
    After each step of reduction, a half of threads won't work in the following steps.
    That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually change from 1/2, 1/4 to 1/32 (the worst case).
    To overcome this shortcoming, for a NxN matrix to be reduced among N threads as a 1XN vectors,
    we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads.
    After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step.
    We can recursively do this until the problem size is 1.
    r   rl   r   r%   T)unroll_full)offsetN)Nr   )intrZ   r[   r   r   r   logical_dividemake_layoutr   r2   rangeshuffle_sync_bflyr3   r   )rm   rn   ro   rp   r   numvecr9   vshfl_vallogmmrfrg_Afrg_Bshould_swaplowerupperr   r   r   sum_swap_shuffle   s@   
,r   )r	   )r	   N)Nr	   )NNNr	   N)NNNNF)r   r   rl   )rZ   rK   typingr   r   r2   cutlass.cuter   r   r   r   r   r   quack.utilsr,   jitNumericTensorr   Pointerr;   r=   rG   rI   	Constexprrt   rV   boolrk   r   r   r   r   r   <module>   s   %
	(b