o
    پi%                     @   s   d dl Z d dlmZ dd Zdd Ze jdd Ze je g dd	e jed
dej	dej	dej	dej	dej	f
ddZ
e jed
dej	dej	dej	dej	dej	dej	fddZe je g dd	e jed
dej	dej	dej	dej	fddZdS )    Nc                   C   s   dd dD S )Nc                 S   sN   g | ]#}d D ]}dD ]}dD ]}dD ]}t j|||dd||dqqq
qqS )   )@   )   )      )BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_M)
num_stages	num_warps)tritonConfig).0BMBNBKsw r   `/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/triton/kernels/sm_constraint_gemm.py
<listcomp>   s0    z&matmul_get_configs.<locals>.<listcomp>r   r   r   r   r   r   matmul_get_configs   s   r   c                 C   s   i }|d |d |d }}}|j  d| d| d| d|d< d	|v r,|d	  }n|d
 r2dnd}d| | | |d|d  < ||| ||  ||   |d< |S )NMNKz [M=z, N=z, K=]namec_ptr
FP8_OUTPUT      g       @flopsr   bytes)r   element_size)gridkernelargsretr   r   r   bytes_per_elemr   r   r   _matmul_launch_metadata   s   " r+   c           
      C   s>   | | }|| }t || |}|| |  }| | | }	||	fS )N)min)
tile_idnum_pid_in_group	num_pid_mr   NUM_SMSgroup_idfirst_pid_mgroup_size_mpid_mpid_nr   r   r   _compute_pid&   s   r6   )r   r   r   )configskey)launch_metadatar   r	   r
   r   r0   c           .   
   C   s  t jdd}t ||}t ||}t ||}|| }|| }t d|}|| }t j|||ddD ])}t|||||\}}|| }|| }|t d| } |t d| }!t | |k | d} t |!|k |!d}!t t | ||} t t |!||}!t j	||ft j
d}"t|D ]g}#|#| t d| }$| | d d d f | |$d d d f |   }%||$d d d f | |!d d d f |	   }&t j|%|d d d f ||#|  k dd}'t j|&|d d d f ||#|  k dd}(t |'|(|"}"q||7 }t|||||\}}|| t d| })|| t d| }*||
|)d d d f   ||*d d d f   }+|)d d d f |k |*d d d f |k @ },|"|jj}-t |-||t j|+|,d }-t j|+|-|,d q3d S )	Nr   axisTflattendtype        maskotherrB   )tl
program_idcdivarangeranger6   wheremax_contiguousmultiple_ofzerosfloat32loaddottor?   
element_tyfmastore).a_ptrb_ptrr   r   r   r   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnalphabetar   r	   r
   r   r0   	start_pidr/   	num_pid_nk_tiles	num_tiles	tile_id_coffs_k_for_maskr.   r-   r4   r5   start_mstart_noffs_amoffs_bnaccumulatorkioffs_ka_ptrsb_ptrsaboffs_cmoffs_cnc_ptrsc_maskcr   r   r   gemm_kernel_persistent0   s`   
&&
,(ru   EPILOGUE_SUBTILEc           +      C   sb  |j j}tjdd}t||}t||	}t||
}|| }tj| ||g|dg||
gd}tj|||g|dg|	|
gd}tj|||g|dg||sL|	n|	d gd}|| }|| }tj|||ddD ]}t|||||\}}|| }||	 }tj||	ftj	d}t|D ]}||
 } |
|| g}!|
|| g}"t|!|"j|}q||7 }t|||||\}}|| }#||	 }$|rt||d|	d f}%t|%d	}%t|%\}&}'t|&|||
|#|$g }&t|'|||
|#|$|	d  g }'|&|}(||#|$g|( |'|})||#|$|	d  g|) qdt||||
|#|$g }||}*||#|$g|* qdd S )
Nr   r:   r!   )shapestridesblock_shaper"   Tr<   r>   )r   r"   r!   )r?   rR   rE   rF   rG   make_tensor_descriptorrI   r6   rM   rN   rO   rP   TreshapepermutesplitrS   rQ   rT   )+rU   rV   r   r   r   r   r]   r^   r   r	   r
   r   rv   r0   r?   r_   r/   r`   ra   rb   a_descb_descc_descrc   r.   r-   r4   r5   rg   rh   ri   rj   rk   rn   ro   rp   rq   accacc0acc1c0c1rt   r   r   r   !gemm_kernel_descriptor_persistent   s   




r   c           +   	   C   s  t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  }|| | }|| }|| }|t d| }|t d| }t ||k |d}t ||k |d}t t |||}t t |||}t d|}| |d d d f | |d d d f |   } ||d d d f | |d d d f |	   }!t j||ft j	d}"t
dt ||D ]=}#t j| |d d d f ||#|  k dd}$t j|!|d d d f ||#|  k dd}%t |$|%|"}"| || 7 } |!|| 7 }!q|"|jj}&|| t d| }'|| t d| }(||
|'d d d f   ||(d d d f   })|'d d d f |k |(d d d f |k @ }*t |&||t j|)|*d }&t j|)|&|*d d S )Nr   r:   r>   r@   rA   rD   )rE   rF   rG   r,   rH   rJ   rK   rL   rM   rN   rI   rO   rP   rQ   r?   rR   rS   rT   )+rU   rV   r   r   r   r   rW   rX   rY   rZ   r[   r\   r]   r^   r   r	   r
   r   pidr/   r`   r.   r1   r2   r3   r4   r5   re   rf   rg   rh   rk   rl   rm   ri   krn   ro   rt   rp   rq   rr   rs   r   r   r   gemm_kernel   sD   ,,((,(r   )r   triton.languagelanguagerE   r   r+   jitr6   autotune	constexprru   r   r   r   r   r   r   <module>   sd    
	

O	
\
