o
    پio                     @   s   d dl Z d dlZd dlmZ d dlmZ ddlT ddlm	Z	 ej
e g dejdejd	ejd
ejdejdejdejdejfddZde jde jde jde jfddZde jde jde jde jfddZdS )    N)Tuple   )*   )&get_mk_alignment_for_contiguous_layout)configskeyNKBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_MIS_B_K_MAJORc           !      C   s  t jdd}t ||}t ||}|
| }|| }||
 }t|| |
}|||  }|| | }|| t d| }|| t d| }||k d d d f }t |||  t j}|dk r||d d d f t j|  |d d d f  }t j|t j	||f|j
jd|d d S | |d d d f t j|  t d|	d d d f  }t j	||ft jd}||| |  t d|	d d d f t j|rdn|  |d d d f t j|r|nd  }td||	D ]@}|t d|	 |k }t j||d d d f dd}t j||d d d f |@ dd} t || |}||	7 }||	|r#dn| 7 }q||d d d f t j|  |d d d f  }t j|||j
j|d d S )Nr   )axis)dtype)maskr   g        )r   other)tl
program_idcdivminarangeloadtoint64storezerosr   
element_tyfloat32rangedot)!a_ptrb_ptrd_ptrm_indices_ptrMr	   r
   r   r   r   r   r   pid	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_nm_rangen_rangen_maskbatch_idd_ptrsa_ptrsaccumulatorb_ptrskk_maskab r<   S/home/ubuntu/.local/lib/python3.10/site-packages/deep_gemm/legacy/m_grouped_gemm.py&m_grouped_bf16_gemm_contiguous_tl_impl
   sB   0"8*" 0r>   r:   r;   d	m_indicesc              
      sJ  |j \}}}|  r|js|j sJ | r| sJ | jtjkr*|jtjks,J |jtjkr8|jtjks:J |  dkrL| dkrL| dksNJ | d|krf| d|dkrf||dkshJ |	 | dkssJ | dt
  dksJ | j \ }|||}}	 fdd}
t|
 | ||| || d d S )Nr      r   r   c                    s"   t  | d t | d  fS )Nr   r   )tritonr   )METAr&   r	   r<   r=   <lambda>M   s   " z6m_grouped_bf16_gemm_nt_contiguous_tl.<locals>.<lambda>)r   )shapeis_contiguousmTr   torchbfloat16int32dimsizenumelr   r>   )r:   r;   r?   r@   r0r1r2r
   BK_gridr<   rD   r=   $m_grouped_bf16_gemm_nt_contiguous_tl=   s   (4

rU   c                 C   s   t | |j|| d S )N)rU   rH   )r:   r;   r?   r@   r<   r<   r=   $m_grouped_bf16_gemm_nn_contiguous_tlR   s   rV   )rI   rB   triton.languagelanguager   typingr   tune_options_Cr   autotuneget_m_grouped_gemm_configsjit	constexprr>   TensorrU   rV   r<   r<   r<   r=   <module>   s:    	1
