o
    TÃi´€  ã                   @   sv   d dl Z d dlZd dlad dlmZ d dlm  mZ d dl	m
Z
 tjdd„ ƒZG dd„ dejjƒZG dd„ dƒZdS )	é    N)Úget_acceleratorc           [      K   sX  |d }|d }|d }|d }|d }t  d¡}t  d¡}t  d¡}|d	 r¬|| }t  d|¡| }t  d|¡| } |||  d
 }!| d
 }"||||  ||  d
  }#t  |#d ¡}$t  |#d |! ¡}%t  |#d |" ¡}&|| }'t  |dkdd¡}(||' })||' }*d}+d},d}-d}.|}/d}0|$}1|$}2|%| t  d|¡|  }3|&| t  d|¡|  }4n¸||d  }#t  |#d ¡}5t  |#d ¡}'t  |#d ¡}6t  |#d ¡}7t  |#d
 ¡}(t  |#d ¡}/||5 }8|7}0|d r|| },|6| }+d}9|| }:t  |8¡}*t  |*d¡}*d}.d};d})t  |8d ¡}-t  |-d¡}-|-| | }-d}1|7}2n6|| }+|6| },d}9|| };t  |8¡})t  |)d¡})d}-d}:d}*t  |8d ¡}.t  |.d¡}.|.| | }.|7}1d}2|;t  d|¡ }3|:t  d|¡ }4|)t  d|¡ }<|*t  d|¡ }=| ||  |1|  |- |3d d …d f |  |<d d d …f |  }>|||  |2|  |. |4d d d …f |
  |=d d …d f |	  }?|d rÄ|3d d …d f |k }@n|'dk}@|d rØ|4d d d …f |k }An|'dk}At j|>|@dd}Bt j|?|Add}Ct j||ft jd}Dt|'d| ƒD ]…}E|Dt  	|B|C¡7 }D|d	 r|| }F||	 }Gn|8d7 }8|d r:t  |8¡}Gt  |8d ¡}Ft  |Gd¡}Gt  |Fd¡}F|G|	 }G|d r[t  |8¡}Ft  |8d ¡}Gt  |Fd¡}Ft  |Gd¡}G|F| }F|>|F7 }>|?|G7 }?|E|k}H|E|k}I|@|H@ }J|A|I@ }Kt j|>|Jd}Bt j|?|Kd}Cqý|D 
|jj¡}L|d	 rÝd}Mt  d|¡| }Nt  d|¡| }O|N||  d
 }P|Od
 }Qd|Pd d …d f  |Qd d d …f  }Rt  |#|R ¡}S|S| | }9t  d|¡| }Tt  d|¡| }Un|+t  d|¡ }T|,t  d|¡ }U|d rü|Ud d d …f |k }M|d r|Td d …d f |k }M||9 |0|  ||  |Td d …d f |  |Ud d d …f |  }V|(dkr:t j|V|L|Md d S |t  d¡| t  d¡  t  d¡|  |( d }W|Wt  d¡t  d¡ |  }Xt  |Wdd¡dkrv	 t  |Wdd¡dkskt  |X¡}Y|Ydkr‰t j|V|L|Md nt j|V|Md}Zt j|V|Z|L |Md t  |X|Yd |/ ¡ t  |Wd¡ d S )NÚTMÚTNÚTKÚTZÚBLOCKr   é   é   ÚSDDé   é   é   é   ÚDSDé   ÚDDSg        )ÚmaskÚother©Údtype)r   T)ÚtlÚ
program_idÚarangeÚloadÚwhereÚmultiple_ofÚzerosÚfloat32ÚrangeÚdotÚtor   Ú
element_tyÚstoreÚnum_programsÚ
atomic_casÚatomic_xchg)[ÚAÚBÚCÚ	stride_zaÚ	stride_haÚ	stride_maÚ	stride_kaÚ	stride_zbÚ	stride_hbÚ	stride_kbÚ	stride_nbÚ	stride_zcÚ	stride_hcÚ	stride_mcÚ	stride_ncÚDS0ÚDS1ÚSDD_KÚSDD_off_widthÚlutÚlocksÚnlocksÚmetar   r   r   r   r   Úpid0Úpid1ÚpidzÚblockidmÚblockidnÚofflutmÚofflutnÚheaderÚzÚiÚjÚAS1ÚlockidÚoffkaÚoffkbÚoffmcÚoffncÚoffpaÚoffpbÚmaxidÚoffhcÚoffhaÚoffhbÚramÚrbnÚoffsetÚcolumnÚdepthÚpincÚoffpcÚoffnbÚoffmaÚrkaÚrkbÚpaÚpbÚcheckamÚcheckbnÚaÚbÚaccÚkÚinc_aÚinc_bÚcheckakÚcheckbkÚcheckaÚcheckbÚcÚcheckcÚrr_blockidmÚrr_blockidnÚ
rr_offlutmÚ
rr_offlutnÚoff_bkidÚbkidÚrcmÚrcnÚpcÚplockÚpcountÚcountÚd© r|   úY/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/sparse_attention/matmul.pyÚ_kernel   s   





@@








$

@
2ÿ

r~   c                   @   s¶   e Zd Zeƒ Zeƒ Zeƒ Zeƒ Zedd„ ƒZ	edd„ ƒZ
edd„ ƒZedd„ ƒZed	d
„ fdd„ƒZedd„ ƒZedd„ ƒZe e¡e e¡e e¡dœZedd„ ƒZedd„ ƒZdS )Ú_sparse_matmulc                 C   s¶  |   ¡ }| | dk  ¡ }|}t t |d¡dƒ}| | }| | }|| |k  ¡  ||k ¡  }| ¡ }	tj|	| jd}
t 	|
¡}t 
|
¡}t 
|
¡}d}d}d}tt| ƒƒD ]k}|| || }}| | |k }|| ||k | }||||…< |dks|dkr“||kr“|d7 }||||…< || |||…< ||
||| …< ||k r­|s­|
|| d   |7  < ||ks³|r¹||
|| < |}|d7 }qTt 
|
¡}tj|
d d… dd|dd …< |
||||fS )Nr   r   r   r   éÿÿÿÿ©Údim)ÚmaxÚminÚtritonÚcdivÚlongÚsumÚtorchÚemptyr   Ú
empty_likeÚ
zeros_liker   ÚlenÚcumsum)ÚsizesÚblockÚmax_sizeÚmin_sizeÚseg_maxÚseg_minÚdivÚremÚpacksÚwidthÚsegmentsrW   rI   rP   r;   ÚcurrentÚcol_idxrF   r{   ÚrÚisemptyÚlastÚoffsetsr|   r|   r}   Úload_balanceÏ   sD   




z_sparse_matmul.load_balancec                 C   s@   |t jvs| t j|  d¡krtj| tj|dt j|< t j| S )Nr   ©r   Údevice)r   r:   Úsizer‰   r   Úint32)r£   Údevr|   r|   r}   Ú	get_locks  s   

z_sparse_matmul.get_locksc                 C   sÄ   |dkrdnd| }|   tj¡} t |  ¡ | jd | jd | jd |¡}g g g }}}|D ]-\}	}
	 |
 dd¡}
|
jd |	|	  }| t 	|
¡  tj¡ 
|¡¡ | |¡ | |	¡ q.|d ||fS )	Né   é€   é    r   r   r	   r€   r   )Útyper‰   r¤   Ú	libtritonÚ
superblockÚdata_ptrÚshapeÚreshapeÚappendÚ
from_numpyr    )Úlayoutr   r   r¢   Ústart_widthÚ	segmentedÚlutsÚwidthsr—   r£   Únnzr˜   r|   r|   r}   Úmake_sdd_lut  s   "ÿ	
z_sparse_matmul.make_sdd_lutc           $         s¤  |r|| } }| | }}|   d¡}|rdnd}|rdnd}| j| |j| }}||kr?td|› d|› d|› d|› d	ƒ‚|d dkrItd	ƒ‚|   d¡‰ |   |rTd
nd¡}| j}|d dk}|d dk}|d dk}|srtd	ƒ‚| j}tdd„ t|	|
ƒD ƒƒ}tjˆ |||f|| jd}t||	|
ƒD ]»\}‰}ddg}dg}||r¦dgng 7 }||r¯dgng 7 }tj	|tj
|i| }d}|| || ||d dddddœ} t dˆ | | | j¡}!d‰|râdnd }"tdˆˆƒD ]d‰‡ ‡‡‡fdd„}#t|# | |||  d¡|  d¡|  |r
d
nd¡|  |rdnd
¡| d¡| d¡| |r$d
nd¡| |r-dnd
¡| d¡| d¡| d¡| d
¡|||ˆ||!|fddi| ¤Ž qêq”|S )Nr   éþÿÿÿr€   zSize of tensor A along the z dim (z() must match size of tensor B along the ú)r§   z/Reduction size for SDD must be a multiple of 16r   r	   r©   é@   c                 S   s   g | ]
\}}|| | ‘qS r|   r|   )Ú.0r˜   Úpackr|   r|   r}   Ú
<listcomp>C  s    z._sparse_matmul._sdd_matmul.<locals>.<listcomp>r¡   r   r   TF)r   r   r   r   r   r
   r   r   i À  c                    s   | d t ˆˆˆ ƒˆ gS )Nr   )r„   ©r<   ©Ú
batch_sizeÚ	max_widthÚ	off_widthr˜   r|   r}   Ú<lambda>^  s    z,_sparse_matmul._sdd_matmul.<locals>.<lambda>Ú	num_warpsr   )r£   r®   Ú
ValueErrorr   r¢   rˆ   Úzipr‰   rŠ   r   Úfloat16r   r¦   r   r~   Ústride)$rc   rd   Útrans_aÚtrans_bÚtrans_cÚspdimsr   rµ   Ú	num_locksr¶   r—   ÚbenchÚtimeÚAS0Úa_dimÚb_dimÚa_innerÚb_innerÚa_outerr   Úis_16_multipleÚis_32_multipleÚis_64_multipler¢   Útotal_widthrm   r9   r½   ÚF32TKÚF16TKr   Únum_lockr<   r:   ÚtotalÚgridr|   rÀ   r}   Ú_sdd_matmul)  s˜   

ÿ
ÿ
øëêéþz_sparse_matmul._sdd_matmulc                 C   s   | S ©Nr|   )Úidxr|   r|   r}   rÄ     s    z_sparse_matmul.<lambda>c           %      C   sš  t jg t j| jd}| ¡ }| ¡ }| ¡ }	| ¡ }
| ¡ }| ¡ }d}d}t|  d¡ƒD ]~}|rAt  | |d d …d d …f d¡}nt  | |d d …d d …f d¡}t 	||¡\}}}}}|t  
|¡ }||dk  |7  < | ¡ }t  ||f¡}t  ||f¡}t  |	|f¡}	t  ||f¡}t  ||| f¡}t  |
|f¡}
|| |d d …d d …f  ¡ 7 }q-||9 }|r·|  ¡ }n|  dd¡ ¡ }| d¡}t  ||d t  
|¡ ¡}||d d …df | ƒ}| ¡ }|dd …  |d d… 8  < || }| dd¡ d|¡}||d d …dd …f< |d d …df  |d | 8  < |||dk  |||dk df< | d¡}|r5t  |¡}nB| ¡ }d}t|  d¡ƒD ]4}| |d d …d d …f  ¡ }| ¡ }dt  |¡ ||dk< t  |||j|jdk  d f¡}||7 }qB|}|| | }|dd …  |d d… | | 8  < | dd¡ d|¡}|r¹||d d …dd …f< |d d …df  |d | 8  < n|| |d d …dd …f< |d d …df  |d | | 8  < |||dk  |||dk df< | d¡}|d| 9 }||9 }| d¡} |d|  7 }t j||||	|
|fdd d¡ ¡ }!t j||fdd d¡ ¡ }"t  |"t jd|"j|"jdf¡}"t  |!|"f¡}#|# t j¡ |¡}#td|
 ¡ ƒ}$|#|$| d fS )	Nr¡   r   r   r	   r€   r   r   )r¢   r   )r‰   ÚtensorÚint64r¢   Úcloner   r£   rˆ   r   r    Ú	ones_likerƒ   ÚcatÚnonzeroÚ	transposer„   ÚviewÚrepeatr   ÚTÚstackÚ
contiguousr   r   rª   r¤   r    )%r²   r   ÚstepÚtransr¢   Ú	transformÚ_emptyr™   rW   rX   rI   rP   rŸ   Úcurrent_offsetÚcurrent_maxidrE   r   Ú
z_segmentsÚz_columnÚz_lockidÚz_maxidÚ	z_offsetsÚz_depthr·   Ú
num_blocksrâ   Úxincsr•   ÚwidxÚlayoutwÚmsumÚwincsr˜   rD   Úincsr9   rÎ   r|   r|   r}   Úmake_dxx_lut€  sŽ     

  
"$"$ 

$z_sparse_matmul.make_dxx_lutc                    s¼  t d u r	t d¡a |  d¡‰ |  d¡}|  |rdnd¡‰|  |r"dnd¡}|d }|||r/dnd  }|||r9dnd  }| j}|dd|dddd	d
œ}ˆ }|}|rS|nˆ}|rYˆn|}t dˆ  ˆ d | | j¡}tj	||||f|| jd}‡ ‡‡fdd„}t
| | |||  d¡|  d¡|  |r“dnd¡|  |r›dnd¡| d¡| d¡| |r«dnd¡| |r³dnd¡| d¡| d¡| |rÃdnd¡| |rËdnd¡ˆ|dd|||fddi|¤Ž |S )Nr…   r   r   r   r	   r¨   r§   FT)r   r   r   r   r   r
   r   r   r©   r¡   c                    ó   ˆt  ˆ| d ¡ˆ gS )Nr   ©r…   r†   r¿   ©rÑ   ÚAS2r˜   r|   r}   rÄ   ð  ó    z,_sparse_matmul._dds_matmul.<locals>.<lambda>rÅ   r   ©r…   Ú	importlibÚimport_moduler£   r   r   r¦   r¢   r‰   rŠ   r~   rÉ   )rc   rd   rÊ   rË   rÌ   rÍ   r   r9   rÎ   r˜   r—   rÏ   rÐ   rH   ÚAS3ÚBS0ÚBS1ÚBS2r   r<   ÚCS0ÚCS1ÚCS2ÚCS3r:   rm   rß   r|   r  r}   Ú_dds_matmulØ  s\   


ëêéz_sparse_matmul._dds_matmulc                    s¬  t d u r	t d¡a |d }|||rdnd  }|||rdnd  }| d¡‰ | d¡}| |r1dnd¡}| |r:dnd¡‰| j}|dd|ddd	dd
œ}ˆ }|}|rSˆn|}|rY|nˆ}t dˆ  ˆ d | | j¡}tj	||||f|| jd}‡ ‡‡fdd„}t
| | |||  d¡|  d¡|  |r“dnd¡|  |r›dnd¡| d¡| d¡| |r«dnd¡| |r³dnd¡| d¡| d¡| d¡| d¡ˆ|dd|||fddi|¤Ž |S )Nr…   r   r	   r   r   r¨   r§   FT)r   r   r   r   r   r
   r   r   r©   r¡   c                    r  )Nr   r  r¿   ©r  ÚBS3r˜   r|   r}   rÄ   $  r  z,_sparse_matmul._dsd_matmul.<locals>.<lambda>rÅ   r   r  )rc   rd   rÊ   rË   rÌ   rÍ   r   r9   rÎ   r˜   r—   rÏ   rÐ   rÑ   rH   r  r  r  r   r<   r  r  r  r  r:   rm   rß   r|   r  r}   Ú_dsd_matmul  s\   


ëêéz_sparse_matmul._dsd_matmul©ÚsddÚdsdÚddsc                 C   sœ   t j| ||||||||	|
||||ƒ}|  ||¡ || _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|S rá   )r   ÚfnÚsave_for_backwardÚda_num_locksÚda_lutÚda_widthÚda_packsÚda_benchÚda_timeÚdb_lutÚdb_num_locksÚdb_widthÚdb_benchÚdb_packsÚdb_timeÚmoderÍ   r   rÊ   rË   )Úctxrc   rd   rÊ   rË   rÌ   r)  rÍ   r   Úc_lutÚc_num_locksÚc_widthÚc_packsÚc_benchÚc_timer  r  r  r   r!  r"  r#  r$  r%  r'  r&  r(  rm   r|   r|   r}   ÚforwardA  s,   ÿz_sparse_matmul.forwardc           	      C   s  | j \}}| j}| jd r7|d |d  |d  }tj| ||d| j | j| j| j| j	| j
| j| j| j| jƒ}| jd rf|d |d  |d  }tj| ||| j d| j| j| j| j| j| j| j| j| jƒ}||d d d d d d d d d d d d d d d d d d d d d d d d d fS )Nr   r   r	   F)Úsaved_tensorsr)  Úneeds_input_gradr   r  rË   rÊ   rÍ   r   r  r  r  r   r!  r"  r#  r$  r%  r'  r&  r(  )	r*  Údcrc   rd   r)  Úmode_daÚdaÚmode_dbÚdbr|   r|   r}   Úbackward\  s(   

 þ
 þ
üz_sparse_matmul.backwardN)Ú__name__Ú
__module__Ú__qualname__ÚdictÚ	sdd_cacheÚ	dsd_cacheÚ	dds_cacher:   Ústaticmethodr    r¦   r¸   rà   r  r  r  Ú__get__Úobjectr  r1  r9  r|   r|   r|   r}   r   Ä   s0    
2



VW
2
3
r   c                   @   s>   e Zd ZdZdd„ Zddd„Zedd„ ƒZd	d
„ Zdd„ Z	dS )ÚMatMulaA  Block-Sparse MatMul class; this class handles three types of matrix-multiplication:
       - sparse = dense X dense
       - dense = sparse X dense
       - dense = dense X sparse

    For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509
    c                 C   s°  ||f}|| j v r| j | S | j| j}}d}| jdkr)t ||||¡\}}}	}
n(| jdkr>t |||| j |¡\}}}	}
n| jdkrQt |||| j|¡\}}}	}
| jdkrdt |||d|¡\}}}}n&| jdkrvt ||||¡\}}}}n| jdkrŠt |||| j |¡\}}}}| jdkrt |||d|¡\}}}}n%| jdkr±t |||| j|¡\}}}}n| jdkrÂt ||||¡\}}}}|||	|
||||||||f| j |< | j | S )zDGenerates the sparsity layout/s used in block-sparse matmul
        r§   r  r  r  TF)	Ú	lut_cacher²   r   r)  r   r¸   r  rÊ   rË   )Úselfr   r¢   Úkeyr²   r   rï   r+  r,  r-  r.  r  r  r  r   r#  r$  r%  r'  r|   r|   r}   Úmake_lut}  sH   



ÿ
ÿ



ÿ

ÿ

þ
zMatMul.make_lutFc                 C   s  |dvrt dƒ‚tƒ | _|| _|| _|| _|| _|| _|j}|dv s&J dƒ‚|dksd|dkr3||dfn||df\}}	}
|sD|
d	 d
  n|
| _	|	sK|
n|
d	 d
  }
|j
|
 | | _| ¡  ¡ ||f| _|d	krm| d¡}| ¡ }|j
| _|| _d| _d| _d| _dS )a¯  Initialize the Block-Sparse MatMul class.

        Arguments:
             layout: required: sparsity layout tensor
             block: required: an integer determining the block size.
             mode: required: a string determining type of matmul; ('sdd') sparse = dense X dense, ('dsd') dense = sparse X dense, ('dds') dense = dense X sparse
             trans_a: optional: a boolean determining if multiplication needs to be applied on transpose of input a; default is false
             trans_b: optional: a boolean determining if multiplication needs to be applied on transpose of input b; default is false
             bench: optional: set if you want to do benchmarking
        r  z"Supported modes are: sdd, dsd, dds)r	   r   z9Layout should be a 2 or 3 dimensional tensor of 0s and 1sr  r  r€   r¹   r	   r   r   N)ÚNotImplementedErrorr=  rE  rÊ   rË   r)  r   r²   ÚndimÚdense_inner_dimr®   Údense_inner_sizerˆ   ÚitemÚsparse_shapeÚ	unsqueezer‡   rÍ   rÏ   Útime_cÚtime_daÚtime_db)rF  r²   r   r)  rÊ   rË   rÏ   Ú
layout_dimÚtrans_denseÚtrans_sparseÚsparse_innerr|   r|   r}   Ú__init__£  s4   
ÿ

zMatMul.__init__c                 C   s0   |rdnd}t ||  ¡  ƒD ]}|  d¡} q| S )Nr   r   r   )r   r‚   rO  )ÚxÚ	is_sparseÚmax_dimrF   r|   r|   r}   Ú
_pad_shapeÖ  s   zMatMul._pad_shapec                 C   s  |   |j|j¡\}}}}}}}	}
}}}}dg}dg}dg}t|j|jƒ}|  ||¡\}}t || jdk¡}t || jdk¡}t	 
||| j| jd| j| j| j||||| j||||	|
| j|||||| j|¡}|j| }t|ƒD ]}| d¡}qm|d | _|d | _|d | _|S )a°  Applies Block-Sparse MatMul.

        For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509

        Arguments:
             a: required: a dense/block-sparse tensor; first input of mat-mul
             b: required: a dense/block-sparse tensor; second input of mat-mul

        Return:
             c: a dense/block-sparse tensor result of a X b
        Nr  r  Fr   )rH  r   r¢   rƒ   rJ  Ú_validate_inputsrD  r[  r)  r   ÚapplyrÊ   rË   rÍ   r   rÏ   r   ÚsqueezerP  rQ  rR  )rF  rc   rd   r+  r,  r-  r.  r  r  r  r   r#  r$  r%  r'  rP  rQ  rR  Úoriginal_dimsrm   Údims_to_trimÚ_r|   r|   r}   Ú__call__Ý  s.   
þ ý



zMatMul.__call__c              
   C   s^  |j |j krtd|j › d|j › dƒ‚tƒ  |¡stdƒ‚t ¡ r+| ¡ | ¡ }}n|j|jkr>td|j› d|j› dƒ‚| j| j	| j
}}}|dkrŸ|d	krW|d
|dfn|d|d
f\}}}}	|j| j }
|
| jkr‚td|› d| j› d| j|j › d|
› d	ƒ‚|jt| jƒ d … | jkrŸtd| j› d|	› d|j› ƒ‚dd„ }||ƒ}||ƒ}||fS )Nz'Inputs must be on the same device; got z for tensor A and z for tensor Bz&Only GPU devices are supported for nowz#Inputs must be the same dtype; got z for A and z for Br  r  r&   r'   zExpected tensor z to have size z at dim z, got Ú.z2Expected tensor with trailing dimensions of shape z for argument c                 S   sJ   d| j  }|dkrdg| }| jg |¢| j¢R Ž } | S |dk r#tdƒ‚| S )Nr   r   r   z?Tensors with more than 4 dimensions are not currently supported)rJ  rê   r®   rÆ   )rX  Údims_neededÚ
singletonsr|   r|   r}   Úadd_extra_dims$  s   

ýz/MatMul._validate_inputs.<locals>.add_extra_dims)r¢   rÆ   r   Úon_acceleratorr‰   Úis_autocast_enabledÚhalfr   r)  rÊ   rË   r®   rK  rL  rJ  r   rN  )rF  rc   rd   r)  rÊ   rË   ÚdenseÚ
dense_nameÚsparseÚsparse_nameÚdense_innerrf  r|   r|   r}   r\  
  s<   
ÿ(

ÿ
ÿÿÿzMatMul._validate_inputsN)FFF)
r:  r;  r<  Ú__doc__rH  rW  rA  r[  rb  r\  r|   r|   r|   r}   rD  t  s    
&3
-rD  )r	  r‰   r…   Útriton.languageÚlanguager   Útriton._C.libtritonÚ_Cr«   Údeepspeed.acceleratorr   Újitr~   ÚautogradÚFunctionr   rD  r|   r|   r|   r}   Ú<module>   s   
 3   3