o
     ii                     @   sX  U d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZddlmZmZmZmZ eG dd deZeG dd	 d	eZeG d
d deZeG dd deZdeeeef fddZe Zddd eD ZeG dd deZeG dd deZdefddZ 	dkddZ!dldej"fddZ#dldej"fdd Z$dldej"fd!d"Z%dldej"fd#d$Z&dldej"fd%d&Z'dej"fd'd(Z(e j)d)d* Z*d+d, Z+i ej,j-j.e+ej,j-j/e+ej,j-j0e(ej,j-j1e!ej,j-j2e!ej,j-j3e!ej,j-j4ee!d-d.ej,j-j5e!ej,j-j6ee!d/d.ej,j-j7ee!d-d.ej,j-j8ee!d/d.ej,j-j9e#ej,j-j:e#ej,j-j;e&ej,j-j<e'ej,j-j=e%Z>ej,j-j.e+ej,j-j/e+ej,j-j0e(ej,j-j;e&ej,j-j<e'ej,j-j9e#ej,j-j:e#ej,j-j?e$ej,j-j=e%i	Z@G d0d1 d1ej"ZAG d2d3 d3eAZBi ZCeeeeeeDejEef ef eFd4< ejGHd5d6d6kZId7e	e d8ej"d9ej"d:e
ej" d;edefd<d=ZJejKjLd>dd?gd@d7e	e d8ej"d9ej"d:e
ej" d;edej"fdAdBZMejKNd>d7e	e d8ej"d9ej"d:e
ej" d;edej"fdCdDZOG dEdF dFeAZPejQReP ejQReB dGZSdHZTdIZUdJZVdKZWdLZXdMej"dNeDdOeDdeAfdPdQZYG dRdS dSejZj[Z\G dTdU dUejZj[Z]G dVdW dWejZj[Z^edXedYef dZZ_d[e_de_fd\d]ZReRd^eSeVfdMej"dNeDd_eDdOeDdeAf
d`daZ`eRd^eVdbdbfdMej"dNeDdOeDdceaddeadeAfdedfZbeReSd^dfdMej"dgej"d_eDdOeDdhe
e deAfdidjZcdS )m    N)partial)AnyCallableDictListOptionalTupleTypeVarcast   )BaseOperatorget_operatorget_xformers_operatorregister_operatorc                   @      e Zd ZedZdZdZdS )SparsifyBothWayssparse24_sparsify_both_wayssp24N__name__
__module____qualname__r   OPERATOROPERATOR_CATEGORYNAME r   r   E/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/sp24.pyr          r   c                   @   r   )SparsifyApplysparse24_applyr   Nr   r   r   r   r   r      r   r   c                   @   r   )SparsifyApplyDenseOutputsparse24_apply_dense_outputr   Nr   r   r   r   r   r       r   r    c                   @   r   )Sp24Gemm_sparse24_gemmr   Nr   r   r   r   r   r"   &   r   r"   returnc                  C   sF   t jj sdS t jj } | du rdS | d d | d d | d fS )zJ
    Returns the version of the cusparselt.so library used by pytorch
    )r   r   r   Ni'  d   )torchbackends
cusparseltis_availableversion)r*   r   r   r   _get_cusparselt_torch_version-   s   r+   .c                 c   s    | ]}t |V  qd S N)str).0vr   r   r   	<genexpr>:   s    r1   c                   @   $   e Zd ZeddZdZde ZdS )Sp24GemmCuspltSearchaten_cslt_sparse_mm_searchr   z_cslt_sparse_mm_search@Nr   r   r   r   r   r   _cusplt_version_strr   r   r   r   r   r3   =       
r3   c                   @   r2   )Sp24GemmCuspltr4   _cslt_sparse_mmr   z_cslt_sparse_mm@Nr6   r   r   r   r   r9   D   r8   r9   c                  C   sB   t dk} | sdS d}tj rtjd}t dk r|dkrdS | S )N)r      r   F)r   r   cuda)   r   r   )	   r   )_cusplt_versionr&   r<   r)   get_device_capability)	availablecompute_capabilityr   r   r   _has_cusparseLtK   s   
rC   r   c           	      C   s8  d }|D ]	}t |tr|}q|d usJ g }t|D ]]\}}t |tjrrt |tsG||v r3t||}ntd| j d| j d| dt	| |j
d u se|j
d u se|j
 |j
 kse|j
 |j
 krrtd| j d| j d|| qt |tsJ dt|j| dd |D  |j| d	d |D  |j|j
S )
Nz
Operation r,   zL on Sparse24Tensor requires all operands to be Sparse24Tensors, but operand z is a z] on Sparse24Tensor requires all operands to be Sparse24Tensors with the same sparsity patternz$Only implemented for CUTLASS tensorsc                 S       g | ]}t |tr|jn|qS r   )
isinstanceSparse24Tensorpackedr/   xr   r   r   
<listcomp>~   s     z)sparse24_pointwise_op.<locals>.<listcomp>c                 S   rD   r   )rE   rF   packed_trH   r   r   r   rJ      s    )rE   rF   	enumerater&   Tensorsparsify24_like
ValueErrorr   r   typethreads_masksdata_ptrstrideappendSparse24TensorCutlassshapemetameta_t)	functypesargskwargsallow_sparsify_args_listselftensorargs_updatedir   r   r   sparse24_pointwise_opZ   s\   



rb   c                 C   st   t |dksJ |\}}|jdks|jdkrtdt|tr$||S | }t|ts/J |j| dd S )N   8`Sparse24Tensor` matmul: Broadcasting is not implementedT)prefer_col_major_output)lenndimNotImplementedErrorrE   rF   _mmt)rY   rZ   r[   r\   ABB_tr   r   r   sparse24_mm   s   

rn   c                 C   s   t |dksJ |\}}}|jdks|jdkrtd|jdkr(td|j t|tr1td| }t|ts<J |j| |dd S )	N   rc   rd   r   z:`Sparse24Tensor` matmul: only bias dim=1 supported. Shape=z@`Sparse24Tensor` matmul: only operand B of `addmm` can be sparseTbiasre   )rf   rg   rh   rV   rE   rF   rj   ri   )rY   rZ   r[   r\   rq   rk   rl   rm   r   r   r   sparse24_addmm   s"   



rr   c                 C   sd   t |dv sJ |d d \}}t |dkr|d nd }|d u r&||  S td d ||| gdS )N)rc   ro   rc   ro   )rY   rZ   r[   )rf   rj   rr   )rY   rZ   r[   r\   rk   rl   rq   r   r   r   sparse24_linear   s   rs   c              
   C   sp   t |dksJ |d }t|tsJ t |jdksJ |j|jd |jd f|j|j|j|j|j	
dddS )Nr   r   rc   rG   rW   rK   rX   rQ   )rf   rE   rF   rV   	__class__rK   rX   rG   rW   rQ   	transposerY   rZ   r[   r\   r^   r   r   r   
sparse24_t   s   ry   c                 C   s:   t |dksJ |\}}t||jkrtd| d|S )Nrc   zO`view` is not implemented for Sparse24Tensor, except for the dummy case (shape=))rf   tuplerV   rh   )rY   rZ   r[   r\   r^   rV   r   r   r   sparse24_view   s   
r|   c              	   C   s<   t |dksJ |d }|j|j|j|j|j|j|jddS )Nr   r   F)rV   rG   rW   rK   rX   rQ   requires_grad)rf   rv   rV   rG   rW   rK   rX   rQ   rx   r   r   r   sparse24_detach   s   r~   c                  c   s     t j } zd V  W ~ d S ~ w r-   )r&   _C_DisableTorchDispatch)guardr   r   r   no_dispatch   s
   

r   c                 C   s2   t   | | W  d    S 1 sw   Y  d S r-   )r   )rY   rZ   r[   r\   r   r   r   fallback_dispatcher   s   $r   )r   r   )r]   )r   c                   @   s   e Zd ZU ejed< ejed< ejed< ejed< ejed< g dZedddejdejdejdejdejf
d	d
Zdd Z	dejfddZ
ddddejdedeej dejfddZejjZdd Zedd ZdS )rF   rG   rW   rK   rX   rQ   ru   Fr}   c          	      C   sL   t |tjsJ tjj| ||j|j|d}||_||_||_||_	||_
|S )N)devicedtyper}   )rE   r&   rM   _make_wrapper_subclassr   r   rG   rW   rK   rX   rQ   )	clsrV   rG   rW   rK   rX   rQ   r}   r_   r   r   r   __new__+  s   zSparse24Tensor.__new__c                 C   s   | j j d| j dS )Nz(shape=rz   )rv   r   rV   r^   r   r   r   __repr__F  s   zSparse24Tensor.__repr__r$   c                 C   s*   t j| jd | jd | j| jd}| | S )Nr   )r   r   )r&   eyerV   r   r   )r^   er   r   r   _sp24_to_denseI  s   zSparse24Tensor._sp24_to_denseNre   rq   rl   re   rq   c                C   s   t  r-   )rh   )r^   rl   re   rq   r   r   r   ri   Q  s   zSparse24Tensor._mmc                 C   s   | j | j| jffS r-   )	__slots__rV   r}   r   r   r   r   __tensor_flatten__\     z!Sparse24Tensor.__tensor_flatten__c                 C   s    |\}}| |fi |d|iS )Nr}   r   )r   inner_tensorsflatten_spec
outer_sizeouter_striderV   r}   r   r   r   __tensor_unflatten___  s   z#Sparse24Tensor.__tensor_unflatten__)r   r   r   r&   rM   __annotations__r   staticmethodr   r   r   boolr   ri   r   _disabled_torch_function_impl__torch_function__r   classmethodr   r   r   r   r   rF   !  sJ   
 






	rF   c                	   @   sF   e Zd Zddddejdeej dedejfdd	ZedddZ	dS )rU   NFrp   rl   rq   re   r$   c                C   s   t |tr	td|d urtdt dt d| jdks"|jdkr,td| jj d| j	d |j	d	 krXtd| jj d
| j	d	  d| j	d  d|j	d	  d|j	d  dt
| j|| jd | j	d	  S )NB`Sparse24Tensor @ Sparse24Tensor` is not supported by the hardwarez`Sparse24Tensor` with backend='zF' does not support matmul with bias. Remove the bias, or use backend=''rc   `)` matmul: Broadcasting is not implementedr   r   ` matmul: invalid shapes     (, ) @ (rz   )rE   rF   rO   rh   BACKEND_CUTLASSBACKEND_CUSPARSELTrg   rv   r   rV   r"   r   rG   rW   )r^   rl   rq   re   r   r   r   ri   l  s6   
 zSparse24TensorCutlass._mmr   c                 C   6   |j tvrt| j d|j dt|j  ||||S NzI only supports a specific set of operations, can't perform requested op (rz   )_overloadpacketSPARSE24_DISPATCH_CUTLASSrh   r   r   rY   rZ   r[   r\   r   r   r   __torch_dispatch__     
z(Sparse24TensorCutlass.__torch_dispatch__r   N)
r   r   r   r&   rM   r   r   ri   r   r   r   r   r   r   rU   k  s    
rU   _CUSPLT_ALG_CACHEXFORMERS_CUSPARSELT_TUNE1rV   rG   rl   rq   transpose_resultc              
   C   s.  t sdS | \}}|jd }d}||ddkrdnd7 }||r!dnd7 }|||||j|duf}	|	tv r7t|	 S d}
g }tdD ]G}d	}t|
D ](}ztj|||||d
 W n tya   d}Y  nw |dkrot	j
  t }qG|rt nt	j
  t | }|||f q?|  |d d t|	< t|	 S )a  
    cuSPARSELt has multiple algorithms (that correspond to different kernels)
    to run a given GEMM, because the optimal kernel depends on the GEMM dimensions.
    This function attempts to find the most efficient one by benchmarking all
    of them.
    NOTE: cuSPARSELt also provides a function to search the best algorithm
    (exposed via `aten:_cslt_sparse_mm_search`) but it often fails to find the best
    algorithm, so we need this workaround.
    r   r   rrt   cN
   F   Frq   r   alg_idT)_CUSPLT_TUNErV   rS   r   r   ranger9   r   RuntimeErrorr&   r<   synchronizetime	monotonicrT   sort)rV   rG   rl   rq   r   MKNfmthREPEAT	TIME_ALGOalgo	has_errorra   rj   dtr   r   r   _cusplt_find_alg  sF   




r   zxformers::_cusplt_mmr<   )mutates_argsdevice_typesc                 C   s&   t | ||||d}tj|||||dS )z
    This operator wraps find_algo + gemm. This is because we don't want find_algo
    to be visible by torch compile, otherwise it will remove it from the graph.
    rq   r   r   )r   r9   r   )rV   rG   rl   rq   r   r   r   r   r   
_cusplt_mm  s   

r   c                 C   sF   | \}}|j d }|rtj||g|j|jdS tj||g|j|jdS )Nr   )r   r   )rV   r&   emptyr   r   )rV   rG   rl   rq   r   r   r   r   r   r   r   _cusplt_mm_meta  s
   
r   c                	   @   sF   e Zd Zddddejdedeej dejfdd	ZedddZ	dS )Sparse24TensorCuSparseLtFNr   rl   re   rq   r$   c                C   s  t |tr	td| jdks|jdkrtd| jj d| jd |jd krItd| jj d| jd  d| jd  d	|jd  d|jd  d
|jd d dkrhtd| jj dt| j dt|j d|j	| j	krtd| jj dt| j dt|j d| j	 d|j	 d|d ur|j	| j	krtd| jj dt| j dt|j d| j	 d|j	 dt
 sJ tjjj| j| j|||d}|r| }|d | jd  S )Nr   rc   r   r   r   r   r   r   r   rz      z` matmul: trying to do `A=z @ B=zD`. The dense matrix B should have the second dimension aligned to 8.z`, with A.dtype=z and B.dtype=zH. This operation is only supported when A and B have the same data type.z + C`, with A.dtype=B.dtype=z and C.dtype=zK. This operation is only supported when A, B and C have the same data type.r   )rE   rF   rO   rg   rh   rv   r   rV   r{   r   rC   r&   opsxformersr   rG   rj   )r^   rl   re   rq   outr   r   r   ri     sf   
&$$
zSparse24TensorCuSparseLt._mmr   c                 C   r   r   )r   SPARSE24_DISPATCH_CUSPARSELTrh   r   r   r   r   r   r      r   z+Sparse24TensorCuSparseLt.__torch_dispatch__r   )
r   r   r   r&   rM   r   r   ri   r   r   r   r   r   r   r     s    
1r   24sparse24densestecutlassr(   denserI   r   backendc          	   	   C   s|   |t tfv sJ d| t| tr| jd u rtd| S tj| ||d\}}}}}|t kr0tnt	}|| j
|||||ddS )NInvalid backend: z'Input to `sparsify24` is already sparse)	algorithmr   F)rG   rW   rK   rX   rQ   r}   )r   r   rE   rF   rQ   rO   r   r   rU   r   rV   )	rI   r   r   rG   rW   rK   rX   rQ   r   r   r   r   _sparsify24_forward8  s4   

r   c                	   @   s@   e Zd ZedejdededefddZedejfdd	Zd
S )_Sparsify24FuncrI   r   gradientr   c              	   C   sf   |t ttfvrtd| dt  dt dt t|||d}|j| _|j| _|j| _|j| _|| _	|S )NzInvalid gradient type: 'z'. Expected 'z' or 'r   r   )
GRADIENT_SP24GRADIENT_DENSEGRADIENT_STErO   r   rQ   rW   rX   r   r   )ctxrI   r   r   r   r   r   r   r   forwardV  s"   z_Sparsify24Func.forwardgrad_outc              	   C   s   t |ts
| jtkr|d d d fS t |trJ |j| jksJ | jtkr@t|| j\}}}}t	|j
|| j|| j| j|jd}n| jtkrT| j sLJ t|| j}nJ d| j |d d d fS )Nr   FzUnsupported gradient type: )rE   rF   r   r   r   r   r   r   rQ   rU   rV   rW   rX   r}   r   is_contiguousr    )r   r   rG   _rK   grad_inr   r   r   backwarde  s0   

	z_Sparsify24Func.backwardN)	r   r   r   r   r&   rM   r.   r   r   r   r   r   r   r   U  s
    r   c                   @   sD   e Zd Zedejdedededef
ddZedejfd	d
Z	dS )_Sparsify24STEFuncrI   r   r   bw_mul0bw_mul1c                 C   s&   t |||d}|j| _|| _|| _|S )Nr   )r   rQ   r   r   )r   rI   r   r   r   r   r   r   r   r   r     s
   	z_Sparsify24STEFunc.forwardr   c                 C   sN   t |trJ | jdkr| jdkr|}ntj|| j| j| jd}|d d d d fS )N      ?)mul0mul1)rE   rF   r   r   r    r   rQ   )r   r   r   r   r   r   r     s   z_Sparsify24STEFunc.backwardN)
r   r   r   r   r&   rM   r.   floatr   r   r   r   r   r   r     s    r   c                	   @   s@   e Zd ZedejdededefddZedejfdd	Z	d
S )_Sparsify24LikeFuncrI   patternr   r   c           	   	   C   s  t |ts	td|j std|tttfvr!td| d|j| _|j	| _	|j
| _
|j| _|| _|tkrF| j s?J t|| jS tj|| j|d\}}}}|tkrgt|j|| j	|| j
| j|jdS |tksrJ d| ||j	 ||j
 t|j||||| j|jdS )Nz4`sparsify24_like`: `pattern` must be a sparse tensorzA`sparsify24_like` is not implemented when `pattern` is transposedz*`sparsify24_like`: invalid gradient type ""r   r   r   )rE   rF   rh   rQ   r   r   r   r   rO   rW   rX   r   r   BACKEND_DENSEr    r   r   r   rU   rV   r}   r   copy_r   )	r   rI   r   r   r   rG   rW   rK   rX   r   r   r   r     sV   

	z_Sparsify24LikeFunc.forwardr   c              	   C   s   | j tks
t|tr|d d d fS t|trJ |j| jksJ | j tkr6| j s+J t	|| jd d d fS | j t
ks=J tj	|| jtd\}}}}t|j|| j|| j| j|jdd d d fS )Nr   r   )r   r   rE   rF   r   r   rQ   r   r    r   r   r   r   rU   rV   rW   rX   r}   )r   r   rG   r   rK   r   r   r   r     s8   
	z_Sparsify24LikeFunc.backwardN)
r   r   r   r   r&   rM   rF   r.   r   r   r   r   r   r   r     s
    -r   F.)boundrY   c                 C   s   t ttj| S r-   )r
   r   r&   _dynamoallow_in_graph)rY   r   r   r   r     r   r    r   c                 C   s   t | |||S r-   )r   apply)rI   r   r   r   r   r   r   
sparsify24  s   r   r   r   r   c                 C   s   t | ||||S )a  
    2:4 sparsification, with Straight Through Estimator for the
    backward pass (eg the gradient is *not* sparsified).
    Optionally, `bw_mul[0-1]` provide the option to rescale the gradient
    differently for pruned (`bw_mul0`) and kept values (`bw_mul1`).
    )r   r   )rI   r   r   r   r   r   r   r   sparsify24_ste  s   r  r   	out_densec                 C   s|   |d ur|rt }|dkrt|trtnt}t|ts#tdt| |j	 s6t
|  | || S t
| |||S )Nr   z/`pattern` must be a `Sparse24Tensor` but got a )r   rE   r   r   r   rF   rO   rP   rQ   r   r   r   rj   )rI   r   r   r   r  r   r   r   rN     s   

rN   )r   Nr   r   )d
contextlibosr   	functoolsr   typingr   r   r   r   r   r   r	   r
   r&   commonr   r   r   r   r   r   r    r"   intr+   r?   joinr7   r3   r9   r   rC   rb   rM   rn   rr   rs   ry   r|   r~   contextmanagerr   r   r   r4   is_same_sizedetach_detachrelugelusilumuladdgelu_backwardsilu_backwardthreshold_backwardmmmatmulrj   viewlinearr   addmmr   rF   rU   r   r.   r   r   environgetr   r   library	custom_opr   register_faker   r   r   r   r   r   r   r   r   r   r   autogradFunctionr   r   r   r   r   r   r  rN   r   r   r   r   <module>   s  
(
2

















 
$







J&(
4
>."V	