o
    iV                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZmZm	Z	 d dl
mZ e ddZdddddddddd		d
dZ								 dddZddddddddddejdejdejdeej deej deej dedeeee ee ee f  dee fddZdadadd ZdS )    N)Optional)	warn_once)get_metaminimizeupdate)
has_tritonBSR_AUTOTUNEF   )	betaalpha
left_alpharight_alphaoutstoreverboseforceopnamec       	   #   	      s  ddl |du r
d}|jd }| }| }| d }|j||d  \}}|j|d |d  \}}tdddt|| dd	}td| | | ||   d}|j	}|du r]|}n|j	}||u rg|}n||f}d||f}||||| dk dk|dkf}t
|||d
d}|du rd}t
||d|dfd
d}|du r|}n|
s|S d
}| ||||f fdd	}|||||fdd}t||||d|	d\}} }!|	rtd|! d|dd| dd |r|r|kr||ustj }"t||"||tfddtD  S )zTune bsr_dense_addmm kernel parameters against the given inputs.

    When store is True, the tuning results will be stored in the
    database of kernel parameters.
    r   Nbsr_dense_addmmr	            )GROUP_SIZE_ROW
num_stages	num_warpsSPLIT_NT)versionexactF      ?c              	      s.    f	dd}	j j|dddS )Nc                      s   t  d	S )N)r
   r   r   r   metar   )r    )	r   r
   bsrdenseinputr   r   r   r   r    Q/home/ubuntu/.local/lib/python3.10/site-packages/torchao/kernel/bsr_triton_ops.py	test_funcX   s   z6tune_bsr_dense_addmm.<locals>.bench.<locals>.test_funci  d   )warmuprep)testingdo_bench)r   r#   r!   r"   r   r   r%   )r
   r   r   triton)r   r!   r"   r#   r   r   r$   benchW   s   z#tune_bsr_dense_addmm.<locals>.benchc	                 S   s   | dv }	t ddddd|  }
t t|| dd| }t ddddd|  }|	r:|dkr1|||  n||t|  }n|||  }|
d urIt||
}|d urRt||}| dkr^|| dkr^|S |S )N>   r   r   r	   )r   r   r   r   )r   r   r   r   )dictmaxgetabsmin)namevalue	directionr   MNKBMBKis_log	min_value	max_value
value_step
next_valuer    r    r$   step_meta_parameterh   s"   

z1tune_bsr_dense_addmm.<locals>.step_meta_parameter)max_stepr   z-> z
, speedup=z.1fz %, timing=z.3fz msc                 3   s    | ]} | V  qd S Nr    ).0k)r   r    r$   	<genexpr>   s    z'tune_bsr_dense_addmm.<locals>.<genexpr>)r+   shapevaluescrow_indicesdimr-   r.   round_nnzdtyper   r   printtorchcudaget_device_namer   tuplesorted)#r#   r!   r"   r
   r   r   r   r   r   r   r   r   r6   rF   rG   
batch_ndimr5   r7   r8   r9   reference_metasparsityrK   	out_dtypeversion_dtyper   keyinitial_metamay_skip_updater,   r?   speeduptimingsensitivity_messagedevice_namer    )r
   r   r   r   r+   r$   tune_bsr_dense_addmm   sr   
"
  
r^   c                 K   s\  |d u rt j}|d u r|}|d u rd}||	|
|hd hkr	t j }| |||||dk|dk|dkf}||u r9|}n||f}td|||||fd}|d u r[|dkr[td||||dfd}|d u rn||urntd||||dfd}|d u rtdg |d d d|dd  R |||dfd}|d u r||urtdg |d d d|dd  R |||dfd}t|pi D ]&}|| }|d }|d	 }|| }|| dkr||krt|}|| |d	< q|d ur|jdi | |S td
| d|d|d|d|d|d|d|d|d |pt	|| d}|pd}|
pd}
|	p!d}	td|||
|	d|S )Nr   r   r	   r   )r   r   *r   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=zC. To find optimal triton kernel parameters, run with BSR_AUTOTUNE=1r   )r   r   r   r   r    )
rM   float16rN   rO   r   rQ   r-   r   r   r.   )r5   r7   r6   MsKsr
   r   r   r   r   r   rT   rK   rU   _versionextrar]   rW   rV   r   matching_metamkeymeta_nsplit_ncr    r    r$   bsr_dense_addmm_meta   s   
   


rk   )r
   r   r   r   r   skip_checksmax_gridr   r#   r!   r"   r   r   r   rl   rm   r   c          $         s>  t   d}| }| }| }| d }|j||d  \}}|j|d |d  }|jd }ddlm
 ddlm	}m
}m}mm} ||||}|du rY||||f }| dksodkso|dkso|dkso|dkrdkry|  |S ||  dkr| |S du rtd| |d  |d  ||   d}trt| |||||d	d
d	ddnt||||d |d ||j|jd
d
d
	|du rd	|djg |||R  }n|jg ||dR  jg |||R  }|du rd		|djg |||R  }n|jg |d|R  jg |||R  }| d dks1J | d dks<J |}||| ||||\}}}} }}}}|\ dt| d}|| |}||f}|| f}|| f} ||f}||f}tj
jtj 
jtj
j!tj!
j!tj"
j#tj#
j#i|j |$d}|$dd }|$d}|||f} |	durt%|	dd ddd ddt&|	dd    }!nd}!|d|d|d| d|d|d|d|di}"dksJ  	
fdd}#||#|"| |! |' |' kr|||j |S )a  Compute

      out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

    where left_alpha, right_alpha are (* + 1)-D tensors when
    specified, otherwise, these are treated as tensors filled with
    ones.
    r   r	   r   r   r   r   N)broadcast_batch_dimslaunch_kernelprepare_inputsptr_stride_extractortile_to_blocksizeTF)	r
   r   r   r   r   r   r   r   r   )rT   rK   rU   r    r   rA   )r   NN)r   Nr   )r   rt   )r   rt   Nc                    sP   t |  g | R dkdkdk	 
jkd
 d S )Nr	   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COL
allow_tf32	acc_dtype)_bsr_strided_addmm_kernelfloat32)gridsliced_tensorsr9   r8   BNr   r
   dot_out_dtypery   r   rq   rz   tlr    r$   kernel  s(   
zbsr_dense_addmm.<locals>.kernel)(_lazy_init_tritonrF   rG   col_indicesrH   rE   triton.languagelanguagetorch.sparse._triton_opsrn   ro   rp   rq   rr   	new_emptyrJ   zero_copy_mul_rI   AUTOTUNEr^   rk   rK   expandviewstrider/   r.   rM   r`   r   bfloat16float64int8int32sizerP   lendata_ptr)$r#   r!   r"   r
   r   r   r   r   rl   rm   r   f_namerF   rG   r   rR   r5   r7   	blocksizer6   rn   ro   rp   rr   original_batch_dims_broadcastedrT   
out_backupr   out_untiled	n_batchesn_block_rowsn_block_cols	full_gridgrid_blockstensor_dims_mapr   r    r   r$   r      s  
,

*



	


4 r   c               !      s   t rd S da t sd S dd l} dd lm  | jd jd jd jd jd jd jd	 jd
 jd jd jd jd jd jd jd jd jf  fdd}|ad S )NTr   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_striderv   rw   rx   ry   rz   r{   r}   r|   r   r~   r   r   c7           P         s  |dksJ |dksJ |dksJ |!dksJ  j dd}7 j dd}8 j dd}9 jdd}: jdd}; |8|9|:|;|5\}8}9|||7  ||8  }< |<}= |<| }>|>|= }? d|0}@ d|2}A|1dk sq|1d dkrtd}Bn|1}B d|B}C| ||7  ||=  ||@d d d f   ||Ad d d f   }D|||7  ||9  ||Ad d d f   ||Cd d d f   }E|#|$|7  |%|8  |&|9  |'|@d d d f   |(|Cd d d f   }F||	|7  |
|=  }G j|0|Bf|3d}Ht|?D ]2}I |D}J |G}K j|E||K  |Cd d d f |1k d}L|H j|J|L|4|3d7 }H|D|7 }D|G|
7 }Gq|-s+|H|*9 }H|.sW|||7  ||8  ||9  ||@d d d f   ||Cd d d f   }M|H |M9 }H|/s|||7  ||8  | |9  |!|@d d d f   |"|Cd d d f   }N|H |N9 }H|,r|||7  ||8  ||9  ||@d d d f   ||Cd d d f   }O|+r|H |O7 }Hn	|H|) |O 7 }H j|F|H	|#j
j|Cd d d f |1k d d S )	Nr   r   )axisr	      )rK   )mask)r~   rU   )
program_idnum_programs	swizzle2dloadarangezerosrangedotr   torK   
element_ty)P
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stride	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_stride	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_strideleft_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_strider   left_alpha_row_block_strider   right_alpha_ptrright_alpha_batch_strider   right_alpha_tiled_col_strider   right_alpha_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider
   r   rv   rw   rx   ry   rz   r{   r}   r|   r   r~   r   r   	batch_pidrow_block_pidcol_block_pidr   r   crow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangeinner_block_arangePADDED_BLOCKSIZE_COLcol_block_arangevalues_block_ptrsdense_block_ptrsoutput_ptrscol_index_nnz_ptroutput_acc_block_values_blockdense_row_idxdense_blockleft_alpha_ptrsright_alpha_ptrs
input_ptrsr   r    r$   _bsr_strided_addmm_kernel_impl  s  K







z9_lazy_init_triton.<locals>._bsr_strided_addmm_kernel_impl)_triton_initializedr   r+   r   r   jit	constexprr   )r+   r   r    r   r$   r     sV   &(-/<=>?@ABCDEFG Xr   )NNNNNNNr   )ostypingr   rM   torch._dynamo.utilsr   torch.sparse._triton_ops_metar   r   r   torch.utils._tritonr   getenvr   r^   rk   TensorboolrP   intr-   r   r   r   r   r    r    r    r$   <module>   st    
f	

 G