o
    c۷i >                  	   @   s  d dl Z d dlmZ d dlmZmZ d dlZd dlm  m	Z
 d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lm Z  G d
d deZ!edddd Z"ej#j$ddhddej%dej%ddfddZ&e&j'dej%dej%ddfddZ(dej%dej%fddZ)G dd deZ*edddd Z+ej#j$ddhdd ej%d!ej%dej%ddfd"d#Z,e,j'd ej%d!ej%dej%ddfd$d%Z-d ej%d!ej%dej%fd&d'Z.G d(d) d)ej/j0Z1dej%dej%fd*d+Z2dS ),    N)Type)	lru_cachepartial)Int64Float32
const_expr)make_fake_tensor)
row_reduceonline_softmax_reduce)ReductionBasecompile_and_cache)torch2cute_dtype_mapc                       s   e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdejdejfddZejdejdejdejdejdeje f
ddZ  ZS )SoftmaxTdtypeNonline_softmaxc                    s.   t  j|||s	dnd|stntd || _d S )N      stagereduction_dtype)super__init__r   r   r   )selfr   r   r   	__class__ C/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/softmax.pyr      s   


zSoftmax.__init__c                 C   (   | j }dD ]\}}||kr|  S qdS )N)@            i       i   r!   ) @  r$      r   r   r   limitthreadsr   r   r   _threads_per_row#      zSoftmax._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr%   )r)   r   )   r   )      )   r"   ))r2   r   )r3   r   )r5   r4   )i   r"   r   r   r   width	cluster_nr   r   
thresholdsr-   clusterr   r   r   _set_cluster_n*      

zSoftmax._set_cluster_nmXmOstreamc           	      C   s   |j | jksJ |   ttdd ||fD }| jd| d\}}}|j}| |||||jt	
|jd |d | jdg|ddgt| jdkrOd| jdgnd |d d S )Nc                 s       | ]}|j jV  qd S Nelement_typer7   .0tr   r   r   	<genexpr>?       z#Softmax.__call__.<locals>.<genexpr>r$   vecsizer   r   gridblockr;   r@   rD   r   r<   r   max_get_tiled_copysizekernellaunchcuteceil_divshaper8   )	r   r>   r?   r@   largest_dtype_width
tiled_copytiler_mnthreads_per_rownum_threadsr   r   r   __call__6   s   
zSoftmax.__call__rZ   rY   r[   c           #   
      s  |j }tj \}}}tj \ }}t| jdkrtdntj d |j}	t|	}
 fdd|||
fD \}}}t	j
 }|j|jtjdddd}| ||\}}||}||}||}||}||d	 }d
d ||fD \}}t|	d d | j k}|rd ntj|||	d d}ttj|d}t|tjj }| ||| |d d |	d k r|||dd tj  tjd t| rt
|||jj  t|| |  tj!}t| j" rKt#|tj$j%||d t| jdkr|d nd t!j t| jdkrtjj&nd d}t'(t'j)}tj'j*|| ||  dd} t#| tj$j+||d t| jdkrE|d nd dd}!nt,|||d |t| jdkr^tjj&nd dd\}}!} | tj-|! }"|.|" |j |d d |	d k r||| d S d S )Nr   r   c                       g | ]}t | fqS r   rU   
local_tilerF   mTbidx	cluster_yrZ   r   r   
<listcomp>]   s    z"Softmax.kernel.<locals>.<listcomp>r   r   orderr%   byte_alignment)r   NNNc                 S      g | ]}t |qS r   rU   make_fragment_likerF   thrr   r   r   rf   k       r-   predTis_asyncNNr   init_valhook_fn)fastmath)NNr           )rz   )r{   return_exp_x)/layout_tv_tiledrU   arch
thread_idx	block_idxr   r8   rW   make_identity_tensorcutlassutilsSmemAllocatorallocate_tensorrD   make_ordered_layout#_allocate_reduction_buffer_and_mbar	get_slicepartition_Spartition_D
copy_utilspredicate_kr   copyrR   	WARP_SIZE_initialize_clustercp_async_commit_groupcp_async_wait_groupfill_oobinfautovec_copyloadtor   r   r	   ReductionOpMAXcluster_waitmathlog2eexp2ADDr
   
rcp_approxstore)#r   r>   r?   rZ   rY   r[   	tv_layouttidx_rW   idXgXgOcXsmemsXreduction_buffermbar_ptr
thr_copy_XtXgXtXsXtXgOtXcXtXrXtXrO	is_even_NtXpXr   	num_warpsxmax_xlog2_eexp_xdenomyr   rc   r   rS   K   s   	$
"






			zSoftmax.kernel)T)__name__
__module____qualname__r   r   Numericintboolr   r/   r<   rU   jitTensorcudaCUstreamr]   rS   Shape	TiledCopy	Constexpr__classcell__r   r   r   r   r      s0    "
r   )maxsizec                    s&   d f} fdd}t ||S )Nsoftmax_fwdc                     s`   t   tdj  fddfD \} }t}t j|| |t jjddddS )Nr$   c                       g | ]
}t | fqS r   fake_tensorrF   dtr   	batch_symdivr   r   rf      s    z:_compile_softmax_fwd.<locals>._compile.<locals>.<listcomp>Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)	rU   sym_intr   gcdr7   r   compileruntimemake_fake_stream)x_cuteout_cute
softmax_opr   r   	out_dtyper   r   r   _compile   s   
z&_compile_softmax_fwd.<locals>._compiler   )r   r   r   keyr   r   r   r   _compile_softmax_fwd   s   
r   zquack::_softmax_fwdout)mutates_argsr   returnc                 C   sv   |   dks
J d| jsJ d| jtjtjtjfv s J d| d}dd | |fD \}}t|||| | dS )	zSoftmax forward pass.
    Args:
        x: Input tensor of shape (M, N)
    Returns:
        Softmax output tensor of same shape as x
    r   zInput must be 2DzTensor must be on CUDA deviceUnsupported dtyper   c                 S      g | ]}t |j qS r   r   r   rE   r   r   r   rf      rr   z _softmax_fwd.<locals>.<listcomp>N)	dimis_cudar   torchfloat16bfloat16float32rR   r   )r   r   r   r   r   r   r   r   _softmax_fwd   s   
r   c                 C   sh   ddl m} |r0t| dtjs2| d}dd | |fD \}}t||| t|||| d S d S d S )Nr   COMPILE_ONLYr   c                 S   r   r   r   rE   r   r   r   rf      rr   z%_softmax_fwd_fake.<locals>.<listcomp>)quack.cache_utilsr   
isinstancerR   r   SymIntr   _compile_softmax_backward)r   r   r   r   r   r   r   r   r   _softmax_fwd_fake   s   
r   c                 C   s   t | }t| | |S rB   )r   
empty_liker   )r   r   r   r   r   r      s   

r   c                       s   e Zd Zdeej def fddZdd Zdd Z	d	d
 Z
ejdejdejdejdejfddZejdejdejdejdejdejdeje fddZ  ZS )SoftmaxBackwardr   r   c                    s   t  j||dtd d S )Nr   r   )r   r   r   )r   r   r   r   r   r   r      s   zSoftmaxBackward.__init__c                 C   r   )N)r    r#   r&   r(   )    r$   r*   r+   r,   r   r   r   r/      r0   z SoftmaxBackward._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr%   r1   r6   r9   r   r   r   r<      r=   zSoftmaxBackward._set_cluster_nc                 C   s   | j dkrdS dS )Nr   r$   r*   r+   )r   r   r   r   _num_threads   s   zSoftmaxBackward._num_threadsmdYmYmdXr@   c           
      C   s   |j | jksJ |   ttdd |||fD }| jd| d\}}}|j}	| ||||||jt	
|jd |d | jdg|	ddgt| jdkrQd| jdgnd |d d S )Nc                 s   rA   rB   rC   rE   r   r   r   rH     rI   z+SoftmaxBackward.__call__.<locals>.<genexpr>r$   rJ   r   r   rL   rO   )
r   r  r  r  r@   rX   rY   rZ   r[   r\   r   r   r   r]      s   
zSoftmaxBackward.__call__rZ   rY   r[   c           '   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |j}	|j}
t |
} fdd||||fD \}}}}t	j
 }|j|jt jdddd}|j|jt jdddd}| ||	\}}||}||}||}||}||}||}||d	 }d
d |||fD \}}}t|
d d | j k}|rd ntj|||
d d} ttj| d}!t |t jj }"| |||" |d d |
d k r|!||dd |!||dd t j  t jd t || t || | t j}#| t j}$t |#|$ t j!j"||d t| jdkr#|nd dt| jdkr1t jj#nd d}%|$|#|%  }&|$|&|j |d d |
d k rV|!|| d S d S )Nr   r   c                    r^   r   r_   ra   rc   r   r   rf   %  s    z*SoftmaxBackward.kernel.<locals>.<listcomp>rg   rh   r%   rj   rl   c                 S   rm   r   rn   rp   r   r   r   rf   :  rr   rs   rt   Trv   rx   r}   ry   )%rU   r   r   r   r   r8   r   rW   r   r   r   r   r   rD   r   r   r   r   r   r   r   r   r   rR   r   r   r   r   r   r   r   r   r	   r   r   r   r   )'r   r  r  r  rZ   rY   r[   r   r   r   rW   r   gdYgYgdXr   r   sdYsYr   r   thr_copytdYgdYtdYsdYtYgYtYsYtdXgdXr   tdYrdYtYrYtdXrdXr   r   r   r   dyr   dotdxr   rc   r   rS     sl   
$









zSoftmaxBackward.kernel)r   r   r   r   r   r   r   r   r/   r<   r   rU   r   r   r   r   r]   rS   r   r   r   r   r   r   r   r   r      s:    r   c                    s*   d f} fdd}t ||S )Nsoftmax_bwdc                     sf   t   tdj  fddfD \} }}t}t j|| ||t jjddddS )Nr$   c                    r   r   r   r   r   r   r   rf   k  s    z?_compile_softmax_backward.<locals>._compile.<locals>.<listcomp>Tr   r   r   )	rU   r   r   r   r7   r   r   r   r   )dy_cutey_cutedx_cutesoftmax_backward_opr   r   dx_dtypey_dtyper   r   r   h  s   
z+_compile_softmax_backward.<locals>._compiler   )r   r  r  r   r   r   r   r  r   r   d  s   
r   zquack::_softmax_backwardr  r  r   c                 C   s   |   dks
J d|  dksJ d| j|jksJ d| jr$|js(J d| jtjtjtjfv s7J d|j| jksAJ d| d}d	d
 | ||fD \}}}t	||||| || dS )zSoftmax backward pass.
    Args:
        dy: Upstream gradients tensor of shape (M, N)
        y: Softmax output tensor of shape (M, N)
    Returns:
        Input gradients tensor of same shape as dy and y
    r   zdy must be 2Dzy must be 2Dzdy and y must have same shapezTensors must be on CUDA devicer   zdy and y must have same dtyper   c                 S   r   r   r   rE   r   r   r   rf     rr   z%_softmax_backward.<locals>.<listcomp>N)
r   rW   r   r   r   r   r   r   rR   r   )r  r   r  r   r   r  r  r   r   r   _softmax_backward{  s   	
r  c                 C   s`   ddl m} |r,t| dtjs.| d}dd | ||fD \}}}t|||| d S d S d S )Nr   r   r   c                 S   r   r   r   rE   r   r   r   rf     rr   z*_softmax_backward_fake.<locals>.<listcomp>)r   r   r   rR   r   r   r   )r  r   r  r   r   r   r  r  r   r   r   _softmax_backward_fake  s   
r  c                 C   s   t | }t| || |S rB   )r   r   r  )r  r   r  r   r   r   r    s   
r  c                   @   s$   e Zd Zedd Zedd ZdS )SoftmaxFunctionc                 C   s   t |}| | |S rB   )r   save_for_backward)ctxr   r   r   r   r   forward     
zSoftmaxFunction.forwardc                 C   s   | j \}t||}|S rB   )saved_tensorsr  )r!  r  r   r  r   r   r   backward  r#  zSoftmaxFunction.backwardN)r   r   r   staticmethodr"  r%  r   r   r   r   r    s
    
r  c                 C   s
   t | S )zSoftmax forward pass with automatic differentiation support.

    Args:
        x: Input tensor of shape (M, N)

    Returns:
        Softmax output tensor of same shape as x
    )r  apply)r   r   r   r   softmax  s   
	r(  )3r   typingr   	functoolsr   r   r   cuda.bindings.driverbindingsdriverr   r   cutlass.cuterU   r   r   r   quack.utilsr   quack.copy_utilsr   quack.compile_utilsr   r   quack.reducer	   r
   quack.reduction_baser   r   r   quack.cute_dsl_utilsr   r   r   library	custom_opr   r   register_faker   r   r   r   r  r  r  autogradFunctionr  r(  r   r   r   r   <module>   sF    
 
""
