o
    ãÊi//  ã                   @   s‚  d dl Z d dlZd dlZd dlmZ d dlmZ edd„ e j	g d¢ddD ƒg ƒZ
ejjjdkr>d	d„ e j	g d
¢ddD ƒZ
dd„ e
D ƒZ
ejdejdejdejdejfdd„ƒZejejfdejdejdejdejdejdejfdd„ƒZdd„ Zdd„ Zej dd¡Ze d¡ e d¡ ej edd¡d d!„ ƒZej edd"¡d#d$„ ƒZej ed%d¡d&d'„ ƒZej ed%d"¡d(d)„ ƒZdS )*é    N)Úget_best_config_fnc                 C   sÊ   g | ]a\}}}|||d d f|||d df|||ddf|||d df|||ddf|||ddf|||ddf|||d df|||ddf|||ddf|||ddf|||ddf|||ddf|||ddf|||ddfg‘qS )é   é   é   é   é   é   é   é   © )Ú.0ÚiÚjÚkr   r   úO/home/ubuntu/.local/lib/python3.10/site-packages/torchao/kernel/intmm_triton.pyÚ
<listcomp>   s&    ðñþr   )é    é@   é€   é   r   )ÚrepeatÚ
EXHAUSTIVEc              	   C   s4   g | ]\}}}d D ]}dD ]	}|||||f‘qq	qS ))r   r   r   r   r   r	   r
   r   )r   r   r   r   )r   ÚBLOCK_MÚBLOCK_NÚBLOCK_KÚ
num_stagesÚ	num_warpsr   r   r   r   )   s    ûúû
ÿ)é   r   r   r   r   c                 C   s0   g | ]\}}}}}t j|||d dœ||d‘qS )r   )r   r   r   ÚGROUP_M)r   r   )ÚtritonÚConfig)r   r   r   r   ÚsÚwr   r   r   r   C   s    ûýÿr   r   r   r   c                  C   sn  t jdd}t  ||¡}t  ||¡}|| }|| }|| }t|| |ƒ}|||  }|| | }t j| ||f||f|| df||fdd}t j|||f||	fd|| f||fdd}t j||ft jd}td||ƒD ](}t j|dd}t j|dd}|t  	||¡7 }t  
|d|f¡}t  
||df¡}qk|}t j|||f|
|f|| || f||fdd}t j||dd dS )	zqKernel for computing the matmul C = A x B.
    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
    r   )Úaxis)r   r   )ÚbaseÚshapeÚstridesÚoffsetsÚblock_shapeÚorder©Údtype)r   r   )Úboundary_checkN)ÚtlÚ
program_idÚcdivÚminÚmake_block_ptrÚzerosÚint32ÚrangeÚloadÚdotÚadvanceÚstore) Úa_ptrÚb_ptrÚc_ptrÚMÚNÚKÚ	stride_amÚ	stride_akÚ	stride_bkÚ	stride_bnÚ	stride_cmÚ	stride_cnr   r   r   r   ÚpidÚ	num_pid_mÚ	num_pid_nÚnum_pid_in_groupÚgroup_idÚfirst_pid_mÚpid_mÚpid_nÚa_block_ptrÚb_block_ptrÚaccumulatorr   ÚaÚbÚcÚc_block_ptrr   r   r   Ú!matmul_kernel_with_block_pointersM   sT    
ú
úúrT   ÚEVEN_KÚACC_TYPEc           -      C   sr  t  d¡}|| d | }|| d | }|| }|| }t|||  |ƒ}|| ||  }|| | }|| t  d|¡ }|| t  d|¡ }t  t  || |¡|¡}t  t  || |¡|¡} t  d|¡}!| |d d …d f | |!d d d …f |   }"||!d d …d f |	 | d d d …f |
   }#t j||f|d}$t|d| ƒD ]C}%|r±t  |"¡}&t  |#¡}'n t j|"|!d d d …f |%k dd}&t j|#|!d d …d f |%k dd}'|$t  	|&|'¡7 }$|"|| 7 }"|#||	 7 }#q¢|| t  d|¡ }|| t  d|¡ }|d d …d f }(|d d d …f })|(|k |)|k @ }*|)||(  }+t j|t  
|(|*j¡ |*dd},t  |t  
|+|*j¡ |$|, |*¡ d S )Nr   r   r*   g        )ÚmaskÚotherÚ
evict_last)Úeviction_policy)r-   r.   r0   ÚarangeÚmax_contiguousÚmultiple_ofr2   r4   r5   r6   Úbroadcast_tor%   r8   )-r9   r:   r;   Ús1_ptrr<   r=   r>   r?   r@   rA   rB   rC   rD   Ú
stride_s1mÚ
stride_s1nr   r   r   r   rU   rV   rE   Úgrid_mÚgrid_nÚwidthrI   Ú
group_sizerK   rL   ÚrmÚrnÚramÚrbnÚrkÚAÚBÚaccr   rP   rQ   Úidx_mÚidx_nrW   ÚxindexÚtmp0r   r   r   Ú(scaled_matmul_kernel_with_block_pointers°   sJ   
,,
  ý$rr   c                    s„   | j \‰ }|j \}‰‡ ‡fdd„}t| | ||ˆ ˆ||  d¡|  d¡| d¡| d¡| d¡| d¡f|j|j|jdœ|j¤Ž |S )Nc                    ó"   t  ˆ | d ¡t  ˆ| d ¡ fS ©Nr   r   ©r   r/   ©ÚMETA©r<   r=   r   r   Ú<lambda>  ó   ÿz#int_matmul_kernel.<locals>.<lambda>r   r   )r   r   Únum_ctas)r%   rT   Ústrider   r   r{   Úkwargs)rP   rQ   rR   Úconfigr>   Úgridr   rx   r   Úint_matmul_kernelÿ   s0   

ôñðr€   c                    s    | j \‰ }|j \}‰‡ ‡fdd„}t| | |||ˆ ˆ||  d¡|  d¡| d¡| d¡| d¡| d¡| d¡| d¡f|j|j|j|d dkdœ|j¤Ž |S )Nc                    rs   rt   ru   rv   rx   r   r   ry   !  rz   z*int_scaled_matmul_kernel.<locals>.<lambda>r   r   r   )r   r   r{   rU   )r%   rr   r|   r   r   r{   r}   )rP   rQ   Úscales1rR   r~   r>   r   r   rx   r   Úint_scaled_matmul_kernel  s8   

ñ
íìr‚   ÚtorchaoÚFRAGMENTz(int_matmul(Tensor a, Tensor b) -> Tensorz?int_scaled_matmul(Tensor a, Tensor b, Tensor scales1) -> TensorÚ
int_matmulÚMetac                 C   s,   | j \}}|j \}}tj||f| jtjdS ©N©Údevicer+   )r%   ÚtorchÚemptyr‰   r3   )rP   rQ   r<   r>   r=   r   r   r   Úint_matmul_metaB  ó   

rŒ   ÚCUDAc                 C   sz   | j d |j d ksJ dƒ‚| j \}}|j \}}tj||f| jtjd}tt| ||gtƒ}|d u r6t g ¡S t| |||ƒS ©Nr   r   zIncompatible dimensionsrˆ   )	r%   rŠ   r‹   r‰   r3   r   r€   Úint8_mm_kernel_configsÚtensor)rP   rQ   r<   r>   r=   rR   Úbest_configr   r   r   Úint_matmul_cudaI  s   

ÿ
r“   Úint_scaled_matmulc                 C   s,   | j \}}|j \}}tj||f| j|jdS r‡   )r%   rŠ   r‹   r‰   r+   )rP   rQ   r   r<   r>   r=   r   r   r   Úint_scaled_matmul_meta]  r   r•   c                 C   sl   | j d |j d ksJ dƒ‚| j \}}|j \}}tj||f| j|jd}tt| |||gtƒ}t| ||||ƒS r   )r%   rŠ   r‹   r‰   r+   r   r‚   r   )rP   rQ   r   r<   r>   r=   rR   r’   r   r   r   Úint_scaled_matmul_cudad  s   

ÿr–   )Ú	itertoolsrŠ   r   Útriton.languageÚlanguager-   Útorchao.kernel.autotunerr   ÚsumÚproductr   Ú	_inductorr~   Úmax_autotune_gemm_search_spaceÚjitÚ	constexprrT   r3   rr   r€   r‚   ÚlibraryÚLibraryÚlibÚdefineÚimplrŒ   r“   r•   r–   r   r   r   r   Ú<module>   st   íêÿþú
íìëêbåêéèçæåN#




