o
    c۷i]                     @   sp  d dl Z d dlmZmZ d dlmZmZ d dlZd dlm	  m
Z d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lm Z  d d
l!m"Z" G dd dZ#ej$j%dddhddej&de'de(dej&dej&ddfddZ)e)j*dej&de'de(dej&dej&ddfddZ+edddd Z,d1dej&de'de(fddZ-G dd  d eZ.ej$j%d!d"hdd#ej&deej& dej&de'de(d"ej&ddfd$d%Z/e/j*d#ej&deej& dej&de'de(d"ej&ddfd&d'Z0eddd(d) Z1	d1d#ej&deej& dej&d*e'de(dej&fd+d,Z2G d-d. d.ej3j4Z5d1dej&de'de(fd/d0Z6dS )2    N)	lru_cachepartial)TypeOptional)Int32Float32
const_expr)make_fake_tensor)ReductionBase)
row_reducecompile_and_cache)torch2cute_dtype_map)bitonic_topkc                   @   s   e Zd Zddeej dededefddZdd	 Z	d
d Z
ejdejdejdejdejfddZejdejdejdejdejdejdeje fddZdS )TopKFdtypeNksoftmaxc                 C   s|   || _ || _d|j | _|| _|| _|dtt| ks!J d|dtt| ks0J d|dks6J |dks<J d S )N      zN must be a power of 2i   )	r   r   widthvecsizer   r   intmathlog2selfr   r   r   r    r   @/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/topk.py__init__   s   zTopK.__init__c                 C   s&   | j }tt|| j d|d d}|S )N    @      )r   maxminr   )r   r   num_threads_per_rowr   r   r   _threads_per_row%   s   zTopK._threads_per_rowc           	      C   sp   | j }| j}|dkrdnd}|  }|| }tt|d| |}||| | f}tj| j|||d}|||fS )N @  r      num_copy_elems)	r   r   r'   cuteceil_divr%   
copy_utilstiled_copy_2dr   	r   r   r   num_threadsthreads_per_rowcols_per_blocknum_blocks_Ntiler_mn
tiled_copyr   r   r   _get_tiled_copy,   s   

zTopK._get_tiled_copymXmValuesmIndicesstreamc           	      C   s   |j | jksJ |j | jksJ |j tksJ |  \}}}|j}| ||||||jt|j	d |d ddg|ddg|d d S )Nr   r#   gridblockr;   )
element_typer   r   r7   sizekernellaunchr,   r-   shape)	r   r8   r9   r:   r;   r6   r5   r2   r1   r   r   r   __call__9   s   
zTopK.__call__r5   r6   r2   c           1         sf  t j \}}}t j \ }}|j}	|j}
t |
} fdd||fD \}}||}||}||d }t 	|}t
|
d d k}|rMd ntj|||
d d}ttj|d}|d d |
d k ro||| t |jt}|| t tt| j}d|> d }t
t |	jd }t |t}tjt |dd	D ]*}t|||  d ||  }|| dkr| n|}||@ }|| | @ |B ||< qt
| rt|||jj   t!|| j"|d
}t
t#| j"|d|jj$ }| j"| dksJ t
t %| j"|| }t jj&| } | d> t jj&d B }!t ||ft}"tjt %| j"|dd	D ]Q}|| || k}#tj|dd	D ]>}$t
|dkrp|| |$ | j"k rnt jj'||| |$  d|!d}%|#rn|%|"|$|| f< qA||| |$  |"|$|| f< qAq0t |"t}&t |&jt}'tjt |&dd	D ]&}|&| |@ }|&| | @ |&|< || dkr| n|}t||@ |'|< qt
| j(r6tjt j|"dgddd	D ]&}|| ||  }(|(| j"| krtj|dd	D ]}$tj  |"|$|f< qqt jj'|d d|!d})ttj)}*t jj*|" |* |)|*  dd}+t jj+|+j,t j-j.ddd|d},|"|+t j/|,  t 	|"|j}-|-|" |j |d d }.d dks\|.|
d k rt 0||.d f |f}/t 0||.d f |f}0tjt |-jdgdd	D ]1}|| ||  }(|(| j"| k rt 1|-d |f |/d |(f  t 1|'d |f |0d |(f  qd S d S )Nc                       g | ]}t | d fqS r   r,   
local_tile.0mTbidxr5   r   r   
<listcomp>]       zTopK.kernel.<locals>.<listcomp>)r   NNNr#   limitpredr   Tunroll_full)
warp_widthr      )offsetmask_and_clamp)mode)fastmathg        )init_valreduction_profile)threads_in_group)2r,   arch
thread_idx	block_idxlayout_tv_tiledrC   make_identity_tensor	get_slicepartition_Smake_fragment_liker   r.   predicate_kr   copymake_rmem_tensorr   storeloadtor   r   r   r   r@   recast_tensorr   cutlassrangeutilsfill_oobr?   infr   r   r%   r   r-   	WARP_SIZEshuffle_syncr   eexp2warp_reduction_sumreduceReductionOpADD
rcp_approxtiled_divideautovec_copy)1r   r8   r9   r:   r5   r6   r2   tidx_	tv_layoutrC   idXgXcXthr_copytXgXtXcXtXrX	is_even_NtXpXri   tXrX_f32log_Nidx_maskr   tXrX_i32icol_idxencoded_idx	topk_valsvecsize_outnvec_per_threadmaskrZ   topk_vals_splitshould_receivevvaltopk_vals_i32topk_indicescolmax_vallog2_eexp_xdenomtopk_vals_outrowmValues_storemIndices_storer   rL   r   rA   L   s   






 zTopK.kernelNF)__name__
__module____qualname__r   ro   Numericr   boolr    r'   r7   r,   jitTensorcudaCUstreamrD   rA   Shape	TiledCopy	Constexprr   r   r   r   r      s8    "r   zquack::_topk_fwdvaluesindices)mutates_argsxr   r   returnc                 C   s   |   dks
J d| jsJ d| jtjtjtjfv s J d|dkr+|| jd ks/J d| d}t	| j }t
||||| || dS )	a  Top-k forward pass.
    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        softmax: Whether to apply softmax to the top-k values
    Returns:
        Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
    r   zInput must be 2DzTensor must be on CUDA deviceUnsupported dtyper   r#   zk must be positive and <= NN)dimis_cudar   torchfloat16bfloat16float32rC   r@   r   _compile_topk_fwd)r   r   r   r   r   r   r   r   r   r   	_topk_fwd   s   

r   c           
      C   s|   ddl m} t| dtjpt|tj}|r:|s<| d}t| j }t| j }	t|||| t	|||	||| d S d S d S Nr   )COMPILE_ONLYr#   )
quack.cache_utilsr   
isinstancer@   r   SymIntr   r   r   _compile_topk_bwd)
r   r   r   r   r   r   
has_symintr   r   dx_dtyper   r   r   _topk_fwd_fake   s   


r   )maxsizec                    s*   d f} fdd}t ||S )Ntopk_fwdc                     sz   t  } tdj  }t|  f|}t| f|}tt| f|}t d}t j||||t j	j
ddddS Nr   r   T)use_tvm_ffi_env_streamz--enable-tvm-ffi)options)r,   sym_intr   gcdr   fake_tensorr   r   compileruntimemake_fake_stream)	batch_symdivx_cutevalues_cuteindices_cutetopk_opr   r   r   r   r   r   _compile  s   z#_compile_topk_fwd.<locals>._compiler   )r   r   r   r   keyr   r   r   r   r      s   
r   Fc                 C   sR   |  d}tj||f| j| jd}tj||ftj| jd}t| |||| ||fS )  Top-k operation.

    Args:
        x: Input tensor of shape (M, N)
        k: Number of top elements to return
        softmax: Whether to apply softmax to the top-k values

    Returns:
        Tuple of (values tensor of shape (M, k), indices tensor of shape (M, k))
    r   r   device)r@   r   emptyr   r   int32r   )r   r   r   Mr   r   r   r   r   r     s
   
r   c                       s   e Zd Zddeej dededef fddZdd	 Z	ddede
e fddZejdejde
ej dejdejdejf
ddZejdejde
ej dejdejdejdejdeje fddZ  ZS )TopKBackwardFr   r   r   r   c                    sH   t  j||dtd || _|| _|| _|| _||ksJ |dks"J d S )Nr#   )stagereduction_dtypei   )superr    r   r   r   r   r   r   	__class__r   r   r    *  s   zTopKBackward.__init__c                 C   s   | j dkrdS dS )Nr(   r   r)   )r   )r   r   r   r   _num_threads3  s   zTopKBackward._num_threadsNr   c           	      C   s   |d u rt |d| jj }|| dksJ d| d| |  }t || |}|| }t|| |}||| | f}tj| j|||d}|||fS )Nr   r   zInput N z! is not divisible by vector size r*   )r%   r   r   r   r,   r-   r.   r/   r0   r   r   r   r7   6  s    

zTopKBackward._get_tiled_copymdValuesr9   r:   mdXr;   c              	   C   s   |j | jksJ t|d ur|j | jksJ |j tksJ |   ttdd ||||fD  }t| jd| }| j	| j|d\}}	}
|j
}| |||||	||
jt|jd |	d ddg|ddg|d d S )Nc                 s   s     | ]}|d ur|j jV  qd S N)r?   r   )rJ   tr   r   r   	<genexpr>T  s    z(TopKBackward.__call__.<locals>.<genexpr>r   )r   r   r#   r<   )r?   r   r   r   _set_cluster_nr$   r   r   r   r7   r@   rA   rB   r,   r-   rC   )r   r   r9   r:   r   r;   largest_dtype_widthr   r6   r5   r2   r1   r   r   r   rD   D  s6   	
zTopKBackward.__call__r5   r6   r2   c           2   
      s  t j \}}	}	t j \ }	}	|j}
|j}t |}t |j} fdd||fD \}} fdd||||fD \}}}}tj	 }|j
|jt jdddd}| ||
\}}||}||}t|d urq||nd }||}t |}t|d urt |nd }t |}||jj t|d ur||jj |d ||}||} ||d	 }!t | }"t|d
 d
 k}#tj|||jd
 d}$|#rd ntj|||d
 d}%ttj|$d}&ttj|%d}'|!d d }(tt j d d  })tj|d |jjd |(|d k r-|&|| t|d ur(|&|| |&|| t j  | t}*t| jrZ| t}+t |*|+ t j!j"||d },|+|*|,  }-n|*}-t #|j|j}.|.$|-|j |(|d k rtj%|jd d
 ddD ]?}/tj%|jd ddD ]1}0|$|/d|0f rtj%|jd d ddD ]}1|.|1|/fd|0f ||(|) ||1|/fd|0f f< qqqt j  t &||" |(|d k r|'|"|  d S d S )Nc                    rE   rF   rG   rI   rL   r   r   rN   {  rO   z'TopKBackward.kernel.<locals>.<listcomp>c                    s*   g | ]}|d urt | dfnd qS )Nr   rG   rI   rL   r   r   rN   |  s    )r#   r   )order   )byte_alignmentr   rP   r#   rQ   rS   )
fill_value)NNr   TrU   r   )'r,   r`   ra   rb   rc   rC   rd   ro   rq   SmemAllocatorallocate_tensorr?   make_ordered_layout#_allocate_reduction_buffer_and_mbarre   rf   r   rg   fillzeropartition_Dr.   rh   r   ri   r   rr   barrierrl   rm   r   r   r   rz   r{   rj   rk   rp   r~   )2r   r   r9   r:   r   r5   r6   r2   r   r   r   rC   r   idTopKgdXr   gdValsgValsgIdxcTopKsmemsdXreduction_buffermbar_ptrr   tXgdVtXgVtXgItXrdVtXrVtXrItXsdXtXgdXr   tXrdXr   tXpVr   copy_kcopy_dxr   tile_row_start	dvals_f32vals_f32dotgradsgrad_cvtrest_vnr   r   rL   r   rA   h  s   















"
zTopKBackward.kernelr   r   )r   r   r   r   ro   r   r   r   r    r   r   r7   r,   r   r   r   r   rD   rA   r   r   r   __classcell__r   r   r   r   r   )  s@    &	#r   zquack::_topk_bwddxdvaluesc           
      C   s   |   dks
J d|dur|  dksJ d|  dks"J d| jr(|js,J d| jtjtjtjfv s;J d|d}t| j }|durNt|j nd}t|j }	t	|||	|||| ||| dS )	ai  Top-k backward pass.
    Args:
        dvalues: Upstream gradients tensor of shape (M, k)
        values: Forward top-k values tensor of shape (M, k)
        indices: Indices tensor of shape (M, k) from forward pass
        k: Number of top elements
        softmax: Whether softmax was applied in forward
        dx: Output gradient tensor of shape (M, N)
    r   zdvalues must be 2DNzvalues must be 2Dzindices must be 2DzTensors must be on CUDA devicer   r#   )
r   r   r   r   r   r   r   r@   r   r   )
r  r   r   r   r   r  r   r   	val_dtyper   r   r   r   	_topk_bwd  s   


 r   c                 C   st   ddl m} |r6t|dtjs8|d}t| j }|d ur$t|j nd }	t|j }
t||	|
||| d S d S d S r   )	r   r   r   r@   r   r   r   r   r   )r  r   r   r   r   r  r   r   r   r  r   r   r   r   _topk_bwd_fake  s   



r!  c                    s2   d f} fdd}t ||S )Ntopk_bwdc               	      s   t  } tdj  }t| f|}d ur!t| f|nd }tt| f|}t|  f|}t d}t j|||||t j	j
ddddS r   )r,   r   r   r   r   r   r   r   r   r   r   )r   r   dvalues_cuter   r   dx_cutetopk_bwd_opr   r   r   r   r   r  r   r   r     s    z#_compile_topk_bwd.<locals>._compiler   )r   r  r   r   r   r   r   r   r   r&  r   r     s   
r   r   c                 C   s8   | j \}}tj||f| j| jd}t| ||||| |S )a  Top-k backward pass.

    Args:
        dvalues: Upstream gradients tensor of shape (M, k)
        values: Forward top-k values tensor of shape (M, k), required if softmax=True
        indices: Indices tensor of shape (M, k) from forward pass
        N: Size of the original input dimension
        softmax: Whether softmax was applied in forward

    Returns:
        Input gradients tensor of shape (M, N)
    r   )rC   r   zerosr   r   r   )r  r   r   r   r   r   r   r  r   r   r   r"    s   
r"  c                   @   sJ   e Zd ZeddejdedefddZeddejd	e	ej fd
dZ
dS )TopKFunctionFr   r   r   c                 C   sZ   t |||d\}}| |r|nd | || _|jd | _|| _| | | d ||fS )Nr   r#   F)r   save_for_backwardr   rC   r   r   mark_non_differentiableset_materialize_grads)ctxr   r   r   r   r   r   r   r   forward7  s   

zTopKFunction.forwardNr  	dindices_c                 C   s*   | j \}}t|||| j| jd}|d d fS )N)r   r   )saved_tensorsr"  r   r   )r,  r  r.  r   r   r  r   r   r   backwardB  s   

zTopKFunction.backwardr   r   )r   r   r   staticmethodr   r   r   r   r-  r   r0  r   r   r   r   r(  6  s
    
"r(  c                 C   s   t | ||S )r   )r(  apply)r   r   r   r   r   r   topkI  s   r3  r   )7r   	functoolsr   r   typingr   r   r   cuda.bindings.driverbindingsdriverr   ro   cutlass.cuter,   r   r   r   quack.utilsrq   quack.copy_utilsr.   quack.compile_utilsr	   r   quack.reduction_baser
   quack.reducer   r   r   quack.cute_dsl_utilsr   quack.sort.bitonic_sortr   r   library	custom_opr   r   r   r   register_faker   r   r   r   r   r!  r   r"  autogradFunctionr(  r3  r   r   r   r   <module>   s    B
 (

