o
    پiW                     @   s  d dl Z d dlmZ d dlmZmZ d dlZd dlm  m	Z
 d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d dZ ej!j"dddhddej#de$de%dej#dej#ddfddZ&i e&_'d'dej#de$de%fddZ(G dd deZ)ej!j"ddhddej#deej# dej#de$de%dej#ddfddZ*i e*_'	d'dej#deej# dej#d e$de%dej#fd!d"Z+G d#d$ d$ej,j-Z.d'dej#de$de%fd%d&Z/dS )(    N)partial)TypeOptional)Int32Float32
const_expr)make_fake_tensor)ReductionBase)
row_reduce)torch2cute_dtype_map)bitonic_topkc                   @   s   e Zd Zddeej dededefddZdd	 Z	d
d Z
ejdejdejdejdejfddZejdejdejdejdejdejdeje fddZdS )TopKFdtypeNksoftmaxc                 C   s|   || _ || _d|j | _|| _|| _|dtt| ks!J d|dtt| ks0J d|dks6J |dks<J d S )N      zN must be a power of 2i   )	r   r   widthvecsizer   r   intmathlog2selfr   r   r   r    r   >/home/ubuntu/.local/lib/python3.10/site-packages/quack/topk.py__init__   s   zTopK.__init__c                 C   s&   | j }tt|| j d|d d}|S )N    @      )r   maxminr   )r   r   num_threads_per_rowr   r   r   _threads_per_row$   s   zTopK._threads_per_rowc           	      C   sp   | j }| j}|dkrdnd}|  }|| }tt|d| |}||| | f}tj| j|||d}|||fS )N @  r      num_copy_elems)	r   r   r$   cuteceil_divr"   
copy_utilstiled_copy_2dr   	r   r   r   num_threadsthreads_per_rowcols_per_blocknum_blocks_Ntiler_mn
tiled_copyr   r   r   _get_tiled_copy+   s   

zTopK._get_tiled_copymXmValuesmIndicesstreamc           	      C   s   |j | jksJ |j | jksJ |j tksJ |  \}}}|j}| ||||||jt|j	d |d ddg|ddg|d d S )Nr   r    gridblockr8   )
element_typer   r   r4   sizekernellaunchr)   r*   shape)	r   r5   r6   r7   r8   r3   r2   r/   r.   r   r   r   __call__8   s   
zTopK.__call__r2   r3   r/   c           1         sf  t j \}}}t j \ }}|j}	|j}
t |
} fdd||fD \}}||}||}||d }t 	|}t
|
d d k}|rMd ntj|||
d d}ttj|d}|d d |
d k ro||| t |jt}|| t tt| j}d|> d }t
t |	jd }t |t}tjt |dd	D ]*}t|||  d ||  }|| dkr| n|}||@ }|| | @ |B ||< qt
| rt|||jj   t!|| j"|d
}t
t#| j"|d|jj$ }| j"| dksJ t
t %| j"|| }t jj&| } | d> t jj&d B }!t ||ft}"tjt %| j"|dd	D ]Q}|| || k}#tj|dd	D ]>}$t
|dkrp|| |$ | j"k rnt jj'||| |$  d|!d}%|#rn|%|"|$|| f< qA||| |$  |"|$|| f< qAq0t |"t}&t |&jt}'tjt |&dd	D ]&}|&| |@ }|&| | @ |&|< || dkr| n|}t||@ |'|< qt
| j(r6tjt j|"dgddd	D ]&}|| ||  }(|(| j"| krtj|dd	D ]}$tj  |"|$|f< qqt jj'|d d|!d})ttj)}*t jj*|" |* |)|*  dd}+t jj+|+j,t j-j.ddd|d},|"|+t j/|,  t 	|"|j}-|-|" |j |d d }.d dks\|.|
d k rt 0||.d f |f}/t 0||.d f |f}0tjt |-jdgdd	D ]1}|| ||  }(|(| j"| k rt 1|-d |f |/d |(f  t 1|'d |f |0d |(f  qd S d S )Nc                       g | ]}t | d fqS r   r)   
local_tile.0mTbidxr2   r   r   
<listcomp>\       zTopK.kernel.<locals>.<listcomp>)r   NNNr    limitpredr   Tunroll_full)
warp_widthr      )offsetmask_and_clamp)mode)fastmathg        )init_valreduction_profile)threads_in_group)2r)   arch
thread_idx	block_idxlayout_tv_tiledr@   make_identity_tensor	get_slicepartition_Smake_fragment_liker   r+   predicate_kr   copymake_fragmentr   storeloadtor   r   r   r   r=   recast_tensorr   cutlassrangeutilsfill_oobr<   infr   r   r"   r   r*   	WARP_SIZEshuffle_syncr   eexp2warp_reduction_sumreduceReductionOpADD
rcp_approxtiled_divideautovec_copy)1r   r5   r6   r7   r2   r3   r/   tidx_	tv_layoutr@   idXgXcXthr_copytXgXtXcXtXrX	is_even_NtXpXrf   tXrX_f32log_Nidx_maskr   tXrX_i32icol_idxencoded_idx	topk_valsvecsize_outnvec_per_threadmaskrW   topk_vals_splitshould_receivevvaltopk_vals_i32topk_indicescolmax_vallog2_eexp_xdenomtopk_vals_outrowmValues_storemIndices_storer   rI   r   r>   K   s   






 zTopK.kernelNF)__name__
__module____qualname__r   rl   Numericr   boolr   r$   r4   r)   jitTensorcudaCUstreamrA   r>   Shape	TiledCopy	Constexprr   r   r   r   r      s8    "r   zquack::_topk_fwdvaluesindices)mutates_argsxr   r   returnc                 C   s  |   dks
J d| jsJ d| jtjtjtjfv s J d|dkr+|| jd ks/J d| d}t	| j }||||f}|t
jvrt }td|j |}	t|||f|	}
t|||f|	}tt||f|	}t||||d	}tj||
||tjjd
dddt
j|< t
j| | || dS )a  Top-k forward pass.
    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        softmax: Whether to apply softmax to the top-k values
    Returns:
        Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
    r   zInput must be 2DzTensor must be on CUDA deviceUnsupported dtyper   r    zk must be positive and <= Nr   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptionsN)dimis_cudar   torchfloat16bfloat16float32r@   r=   r   	_topk_fwdcompile_cacher)   sym_intr   gcdr   fake_tensorr   r   compileruntimemake_fake_stream)r   r   r   r   r   r   r   compile_key	batch_symdivx_cutevalues_cuteindices_cutetopk_opr   r   r   r      s.   


r   Fc                 C   sR   |  d}tj||f| j| jd}tj||ftj| jd}t| |||| ||fS )  Top-k operation.

    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        softmax: Whether to apply softmax to the top-k values

    Returns:
        Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
    r   r   device)r=   r   emptyr   r   int32r   )r   r   r   Mr   r   r   r   r   topk_fwd  s
   
r   c                       s   e Zd Zddeej dededef fddZdd	 Z	ddede
e fddZejdejde
ej dejdejdejf
ddZejdejde
ej dejdejdejdejdeje fddZ  ZS )TopKBackwardFr   r   r   r   c                    sH   t  j||dtd || _|| _|| _|| _||ksJ |dks"J d S )Nr    )stagereduction_dtypei   )superr   r   r   r   r   r   r   	__class__r   r   r     s   zTopKBackward.__init__c                 C   s   | j dkrdS dS )Nr%   r   r&   )r   )r   r   r   r   _num_threads  s   zTopKBackward._num_threadsNr   c           	      C   s   |d u rt |d| jj }|| dksJ d| d| |  }t || |}|| }t|| |}||| | f}tj| j|||d}|||fS )Nr   r   zInput N z! is not divisible by vector size r'   )r"   r   r   r   r)   r*   r+   r,   r-   r   r   r   r4   !  s    

zTopKBackward._get_tiled_copymdValuesr6   r7   mdXr8   c              	   C   s   |j | jksJ t|d ur|j | jksJ |j tksJ |   ttdd ||||fD  }t| jd| }| j	| j|d\}}	}
|j
}| |||||	||
jt|jd |	d ddg|ddg|d d S )Nc                 s   s     | ]}|d ur|j jV  qd S N)r<   r   )rG   tr   r   r   	<genexpr>?  s    z(TopKBackward.__call__.<locals>.<genexpr>r   )r   r   r    r9   )r<   r   r   r   _set_cluster_nr!   r   r   r   r4   r=   r>   r?   r)   r*   r@   )r   r   r6   r7   r   r8   largest_dtype_widthr   r3   r2   r/   r.   r   r   r   rA   /  s6   	
zTopKBackward.__call__r2   r3   r/   c           2   
      s  t j \}}	}	t j \ }	}	|j}
|j}t |}t |j} fdd||fD \}} fdd||||fD \}}}}tj	 }|j
|jt jdddd}| ||
\}}||}||}t|d urq||nd }||}t |}t|d urt |nd }t |}||jj t|d ur||jj |d ||}||} ||d	 }!t | }"t|d
 d
 k}#tj|||jd
 d}$|#rd ntj|||d
 d}%ttj|$d}&ttj|%d}'|!d d }(tt j d d  })tj|d |jjd |(|d k r-|&|| t|d ur(|&|| |&|| t j  | t}*t| jrZ| t}+t |*|+ t j!j"||d },|+|*|,  }-n|*}-t #|j|j}.|.$|-|j |(|d k rtj%|jd d
 ddD ]?}/tj%|jd ddD ]1}0|$|/d|0f rtj%|jd d ddD ]}1|.|1|/fd|0f ||(|) ||1|/fd|0f f< qqqt j  t &||" |(|d k r|'|"|  d S d S )Nc                    rB   rC   rD   rF   rI   r   r   rK   f  rL   z'TopKBackward.kernel.<locals>.<listcomp>c                    s*   g | ]}|d urt | dfnd qS )Nr   rD   rF   rI   r   r   rK   g  s    )r    r   )order   )byte_alignmentr   rM   r    rN   rP   )
fill_value)NNr   TrR   r   )'r)   r]   r^   r_   r`   r@   ra   rl   rn   SmemAllocatorallocate_tensorr<   make_ordered_layout#_allocate_reduction_buffer_and_mbarrb   rc   r   rd   fillzeropartition_Dr+   re   r   rf   r   ro   barrierri   rj   r   r   r
   rw   rx   rg   rh   rm   r{   )2r   r   r6   r7   r   r2   r3   r/   r|   r}   r~   r@   r   idTopKgdXr   gdValsgValsgIdxcTopKsmemsdXreduction_buffermbar_ptrr   tXgdVtXgVtXgItXrdVtXrVtXrItXsdXtXgdXr   tXrdXr   tXpVr   copy_kcopy_dxr   tile_row_start	dvals_f32vals_f32dotgradsgrad_cvtrest_vnr   r   rI   r   r>   S  s   















"
zTopKBackward.kernelr   r   )r   r   r   r   rl   r   r   r   r   r   r   r4   r)   r   r   r   r   rA   r>   r   r   r   __classcell__r   r   r   r   r     s@    &	#r   zquack::_topk_bwddxdvaluesc              	   C   sz  |   dks
J d|dur|  dksJ d|  dks"J d| jr(|js,J d| jtjtjtjfv s;J d|d}t| j }|durNt|j n|}t|j }	|||	|||f}
|
t	j
vrt }td	|j |}t|||f|}|durt|||f|nd}tt||f|}t|	||f|}t||||d
}tj|||||tjjddddt	j
|
< t	j
|
 | ||| dS )ai  Top-k backward pass.
    Args:
        dvalues: Upstream gradients tensor of shape (M, k)
        values: Forward top-k values tensor of shape (M, k)
        indices: Indices tensor of shape (M, k) from forward pass
        k: Number of top elements
        softmax: Whether softmax was applied in forward
        dx: Output gradient tensor of shape (M, N)
    r   zdvalues must be 2DNzvalues must be 2Dzindices must be 2DzTensors must be on CUDA devicer   r    r   r   Tr   r   r   )r   r   r   r   r   r   r   r=   r   	_topk_bwdr   r)   r   r   r   r   r   r   r   r   r   r   )r  r   r   r   r   r  r   r   	val_dtypedx_dtyper   r   r   dvalues_cuter   r   dx_cutetopk_bwd_opr   r   r   r    s:   



	r  r   c                 C   s8   | j \}}tj||f| j| jd}t| ||||| |S )a  Top-k backward pass.

    Args:
        dvalues: Upstream gradients tensor of shape (M, k)
        values: Forward top-k values tensor of shape (M, k), required if softmax=True
        indices: Indices tensor of shape (M, k) from forward pass
        N: Size of the original input dimension
        softmax: Whether softmax was applied in forward

    Returns:
        Input gradients tensor of shape (M, N)
    r   )r@   r   zerosr   r   r  )r  r   r   r   r   r   r   r  r   r   r   topk_bwd  s   
r  c                   @   sJ   e Zd ZeddejdedefddZeddejd	e	ej fd
dZ
dS )TopKFunctionFr   r   r   c                 C   sZ   t |||d\}}| |r|nd | || _|jd | _|| _| | | d ||fS )Nr   r    F)r   save_for_backwardr   r@   r   r   mark_non_differentiableset_materialize_grads)ctxr   r   r   r   r   r   r   r   forward
  s   

zTopKFunction.forwardNr  	dindices_c                 C   s*   | j \}}t|||| j| jd}|d d fS )N)r   r   )saved_tensorsr  r   r   )r  r  r   r   r   r  r   r   r   backward  s   

zTopKFunction.backwardr   r   )r   r   r   staticmethodr   r   r   r   r  r   r"  r   r   r   r   r  	  s
    
"r  c                 C   s   t | ||S )r   )r  apply)r   r   r   r   r   r   topk  s   r%  r   )0r   	functoolsr   typingr   r   r   cuda.bindings.driverbindingsdriverr   rl   cutlass.cuter)   r   r   r   quack.utilsrn   quack.copy_utilsr+   quack.compile_utilsr   r   quack.reduction_baser	   quack.reducer
   quack.cute_dsl_utilsr   quack.sort.bitonic_sortr   r   library	custom_opr   r   r   r   r   r   r   r  r  autogradFunctionr  r%  r   r   r   r   <module>   s    B% (1
