o
    پi8                  	   @   s  d dl Z d dlmZ d dlmZ d dlZd dlm  mZ	 d dl
Z
d dlmZ d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ G d	d
 d
eZejj ddhddej!dej!ddfddZ"i e"_#dej!dej!fddZ$G dd deZ%ejj ddhddej!dej!dej!ddfddZ&i e&_#dej!dej!dej!fddZ'G dd dej(j)Z*dej!dej!fd d!Z+dS )"    N)Type)partial)Int64Float32
const_expr)make_fake_tensor)
row_reduceonline_softmax_reduce)ReductionBase)torch2cute_dtype_mapc                       s   e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdejdejfddZejdejdejdejdejdeje f
ddZ  ZS )SoftmaxTdtypeNonline_softmaxc                    s.   t  j|||s	dnd|stntd || _d S )N      stagereduction_dtype)super__init__r   r   r   )selfr   r   r   	__class__ A/home/ubuntu/.local/lib/python3.10/site-packages/quack/softmax.pyr      s   


zSoftmax.__init__c                 C   (   | j }dD ]\}}||kr|  S qdS )N)@            i       i   r   ) @  r!      r   r   r   limitthreadsr   r   r   _threads_per_row"      zSoftmax._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr"   )r&   r   )   r   )      )   r   ))r/   r   )r0   r   )r2   r1   )i   r   r   r   r   width	cluster_nr   r   
thresholdsr*   clusterr   r   r   _set_cluster_n)      

zSoftmax._set_cluster_nmXmOstreamc           	      C   s   |j | jksJ |   ttdd ||fD }| jd| d\}}}|j}| |||||jt	
|jd |d | jdg|ddgt| jdkrOd| jdgnd |d d S )Nc                 s       | ]}|j jV  qd S Nelement_typer4   .0tr   r   r   	<genexpr>>       z#Softmax.__call__.<locals>.<genexpr>r!   vecsizer   r   gridblockr8   r=   rA   r   r9   r   max_get_tiled_copysizekernellaunchcuteceil_divshaper5   )	r   r;   r<   r=   largest_dtype_width
tiled_copytiler_mnthreads_per_rownum_threadsr   r   r   __call__5   s   
zSoftmax.__call__rW   rV   rX   c           #   
      s  |j }tj \}}}tj \ }}t| jdkrtdntj d |j}	t|	}
 fdd|||
fD \}}}t	j
 }|j|jtjdddd}| ||\}}||}||}||}||}||d	 }d
d ||fD \}}t|	d d | j k}|rd ntj|||	d d}ttj|d}t|tjj }| ||| |d d |	d k r|||dd tj  tjd t| rt
|||jj  t|| |  tj!}t| j" rKt#|tj$j%||d t| jdkr|d nd t!j t| jdkrtjj&nd d}t'(t'j)}tj'j*|| ||  dd} t#| tj$j+||d t| jdkrE|d nd dd}!nt,|||d |t| jdkr^tjj&nd dd\}}!} | tj-|! }"|.|" |j |d d |	d k r||| d S d S )Nr   r   c                       g | ]}t | fqS r   rR   
local_tilerC   mTbidx	cluster_yrW   r   r   
<listcomp>\   s    z"Softmax.kernel.<locals>.<listcomp>r   r   orderr"   byte_alignment)r   NNNc                 S      g | ]}t |qS r   rR   make_fragment_likerC   thrr   r   r   rc   j       r*   predTis_asyncNNr   init_valhook_fn)fastmath)NNr           )rw   )rx   return_exp_x)/layout_tv_tiledrR   arch
thread_idx	block_idxr   r5   rT   make_identity_tensorcutlassutilsSmemAllocatorallocate_tensorrA   make_ordered_layout#_allocate_reduction_buffer_and_mbar	get_slicepartition_Spartition_D
copy_utilspredicate_kr   copyrO   	WARP_SIZE_initialize_clustercp_async_commit_groupcp_async_wait_groupfill_oobinfautovec_copyloadtor   r   r   ReductionOpMAXcluster_waitmathlog2eexp2ADDr	   
rcp_approxstore)#r   r;   r<   rW   rV   rX   	tv_layouttidx_rT   idXgXgOcXsmemsXreduction_buffermbar_ptr
thr_copy_XtXgXtXsXtXgOtXcXtXrXtXrO	is_even_NtXpXr   	num_warpsxmax_xlog2_eexp_xdenomyr   r`   r   rP   J   s   	$
"






			zSoftmax.kernel)T)__name__
__module____qualname__r   r   Numericintboolr   r,   r9   rR   jitTensorcudaCUstreamrZ   rP   Shape	TiledCopy	Constexpr__classcell__r   r   r   r   r      s0    "
r   zquack::_softmax_fwdout)mutates_argsr   returnc                    s   |   dks
J d| jsJ d| jtjtjtjfv s J d| d dd | |fD \}}|| f}|tj	vrmt
 td|j   fd	d||fD \}}t| }t
j|||t
jjd
dddtj	|< tj	| | | dS )zSoftmax forward pass.
    Args:
        x: Input tensor of shape (M, N)
    Returns:
        Softmax output tensor of same shape as x
    r   zInput must be 2DzTensor must be on CUDA deviceUnsupported dtyper   c                 S      g | ]}t |j qS r   r   r   rB   r   r   r   rc      ro   z _softmax_fwd.<locals>.<listcomp>r!   c                       g | ]
}t | fqS r   fake_tensorrC   dtr   	batch_symdivr   r   rc      s    Tuse_tvm_ffi_env_stream--enable-tvm-ffioptionsN)dimis_cudar   torchfloat16bfloat16float32rO   _softmax_fwdcompile_cacherR   sym_intr   gcdr4   r   compileruntimemake_fake_stream)r   r   r   	out_dtypecompile_keyx_cuteout_cute
softmax_opr   r   r   r      s&   



r   c                 C   s   t | }t| | |S r?   )r   
empty_liker   )r   r   r   r   r   softmax_fwd   s   

r   c                       s   e Zd Zdeej def fddZdd Zdd Z	d	d
 Z
ejdejdejdejdejfddZejdejdejdejdejdejdeje fddZ  ZS )SoftmaxBackwardr   r   c                    s   t  j||dtd d S )Nr   r   )r   r   r   )r   r   r   r   r   r   r      s   zSoftmaxBackward.__init__c                 C   r   )N)r   r    r#   r%   )    r!   r'   r(   r)   r   r   r   r,      r-   z SoftmaxBackward._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr"   r.   r3   r6   r   r   r   r9      r:   zSoftmaxBackward._set_cluster_nc                 C   s   | j dkrdS dS )Nr   r!   r'   r(   )r   r   r   r   _num_threads   s   zSoftmaxBackward._num_threadsmdYmYmdXr=   c           
      C   s   |j | jksJ |   ttdd |||fD }| jd| d\}}}|j}	| ||||||jt	
|jd |d | jdg|	ddgt| jdkrQd| jdgnd |d d S )Nc                 s   r>   r?   r@   rB   r   r   r   rE      rF   z+SoftmaxBackward.__call__.<locals>.<genexpr>r!   rG   r   r   rI   rL   )
r   r   r   r   r=   rU   rV   rW   rX   rY   r   r   r   rZ      s   
zSoftmaxBackward.__call__rW   rV   rX   c           '   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |j}	|j}
t |
} fdd||||fD \}}}}t	j
 }|j|jt jdddd}|j|jt jdddd}| ||	\}}||}||}||}||}||}||}||d	 }d
d |||fD \}}}t|
d d | j k}|rd ntj|||
d d} ttj| d}!t |t jj }"| |||" |d d |
d k r|!||dd |!||dd t j  t jd t || t || | t j}#| t j}$t |#|$ t j!j"||d t| jdkr#|nd dt| jdkr1t jj#nd d}%|$|#|%  }&|$|&|j |d d |
d k rV|!|| d S d S )Nr   r   c                    r[   r   r\   r^   r`   r   r   rc     s    z*SoftmaxBackward.kernel.<locals>.<listcomp>rd   re   r"   rg   ri   c                 S   rj   r   rk   rm   r   r   r   rc   $  ro   rp   rq   Trs   ru   rz   rv   )%rR   r}   r~   r   r   r5   r|   rT   r   r   r   r   r   rA   r   r   r   r   r   r   r   r   r   rO   r   r   r   r   r   r   r   r   r   r   r   r   r   )'r   r   r   r   rW   rV   rX   r   r   r   rT   r   gdYgYgdXr   r   sdYsYr   r   thr_copytdYgdYtdYsdYtYgYtYsYtdXgdXr   tdYrdYtYrYtdXrdXr   r   r   r   dyr   dotdxr   r`   r   rP      sl   
$









zSoftmaxBackward.kernel)r   r   r   r   r   r   r   r   r,   r9   r   rR   r   r   r   r   rZ   rP   r   r   r   r   r   r   r   r   r      s:    r   zquack::_softmax_backwardr  r  r   c                    s>  |   dks
J d|  dksJ d| j|jksJ d| jr$|js(J d| jtjtjtjfv s7J d|j| jksAJ d| d d	d
 | ||fD \}}}||| f}|t	j
vrt td|j   fdd
|||fD \}}}	t| }
tj|
|||	tjjddddt	j
|< t	j
| | || dS )zSoftmax backward pass.
    Args:
        dy: Upstream gradients tensor of shape (M, N)
        y: Softmax output tensor of shape (M, N)
    Returns:
        Input gradients tensor of same shape as dy and y
    r   zdy must be 2Dzy must be 2Dzdy and y must have same shapezTensors must be on CUDA devicer   zdy and y must have same dtyper   c                 S   r   r   r   rB   r   r   r   rc   _  ro   z%_softmax_backward.<locals>.<listcomp>r!   c                    r   r   r   r   r   r   r   rc   d  s    Tr   r   r   N)r   rT   r   r   r   r   r   r   rO   _softmax_backwardr   rR   r   r   r   r4   r   r   r   r   )r  r   r  r   y_dtypedx_dtyper   dy_cutey_cutedx_cutesoftmax_backward_opr   r   r   r  N  s2   	


r  c                 C   s   t | }t| || |S r?   )r   r   r  )r  r   r  r   r   r   softmax_bwdv  s   
r  c                   @   s$   e Zd Zedd Zedd ZdS )SoftmaxFunctionc                 C   s   t |}| | |S r?   )r   save_for_backward)ctxr   r   r   r   r   forward}     
zSoftmaxFunction.forwardc                 C   s   | j \}t||}|S r?   )saved_tensorsr  )r  r  r   r  r   r   r   backward  r  zSoftmaxFunction.backwardN)r   r   r   staticmethodr  r  r   r   r   r   r  |  s
    
r  c                 C   s
   t | S )zSoftmax forward pass with automatic differentiation support.

    Args:
        x: Input tensor of shape (M, N)

    Returns:
        Softmax output tensor of same shape as x
    )r  apply)r   r   r   r   softmax  s   
	r  ),r   typingr   	functoolsr   r   cuda.bindings.driverbindingsdriverr   r   cutlass.cuterR   r   r   r   quack.utilsr   quack.copy_utilsr   quack.compile_utilsr   r   quack.reducer   r	   quack.reduction_baser
   quack.cute_dsl_utilsr   r   library	custom_opr   r   r   r   r   r  r  autogradFunctionr  r  r   r   r   r   <module>   s8     "$