o
    پi                     @   sr   d dl Z d dlZd dlmZ ejdejfddZejdej	fddZ
ejdej	fd	d
Zdd Zdd ZdS )    Ndtypec              
   C   s*   t jdt jdt jdt jdt jdi}||  S )Nfp16bf16e2m1e4m3e5m2)tlfloat16bfloat16uint8
float8e4nvfloat8e5)r   mapping r   ]/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/matmul_ogs_details/_common.pyget_scaled_dot_format_string   s   r   XCD_SWIZZLEc                 C   s:   || }|| }| | }| | }|| t || | }|S )a  
    Swizzle the program id based on integer XCD_SWIZZLE.
    This is useful for reording how blocks are ordered. A scheduler may, for example,
    assign sequential blocks 0, 1, 2, 3, ..., 8, 9, 10.. to its 8 hardware units 0, 1, 2, 3, ..., 0, 1, 2.
    This pattern may not be ideal for memory access, and it may be better to swizzle so the assignment
    becomes 0, 0, 0, 0, ..., 1, 1, 1, ... In the swizzled arrangement, sequential blocks are assigned to
    the same hardware unit.
    )min)piddomain_sizer   pids_per_groupextra_pid_groupsgroup	local_pidnew_pidr   r   r   xcd_swizzle   s   r   GROUP_Mc           	      C   sT   || }| | }t |||  |}t|dk || | |  }| | | }||fS )Nr   )r   r   assume)	r   grid_mgrid_nr   widthgroup_id
group_sizepid_mpid_nr   r   r   	swizzle2d.   s   r%   c                    s    fdd}|S )Nc                    s   | j | j fdd} fddfdddfdd|g d	D }d
fdd|g dD }d fdddD } d| d| d| S )Nc                    s    fddD S )Nc                    s   g | ]} | qS r   r   .0iLr   r   
<listcomp>>   s    zKmake_matmul_repr.<locals>.matmul_repr.<locals>.<lambda>.<locals>.<listcomp>r   r)   )orderr)   r   <lambda>>   s    z7make_matmul_repr.<locals>.matmul_repr.<locals>.<lambda>c                    s   |  v rdS dS )NNTr   )stride	constantsr   r   r-   ?   s    c                    sP   d| v r |  dd  dd }|S d| v rdS | d dkr&| dd  S | S )	N
tensordesc<   [r   u8mxfp4*)split)r   ret)convert_dtyper   r   r<   A   s   z<make_matmul_repr.<locals>.matmul_repr.<locals>.convert_dtypexc                    s   g | ]	} |  qS r   r   r&   )r<   	signaturer   r   r+   L   s    z9make_matmul_repr.<locals>.matmul_repr.<locals>.<listcomp>)YXW c                    s   g | ]} | qS r   r   r&   )layoutr   r   r+   M       )
stride_y_n
stride_x_k
stride_w_nc                    s   g | ]} |  qS r   r   r&   r1   r   r   r+   N   rD   )BLOCK_MBLOCK_NBLOCK_KSPLIT_K_)r>   r2   join)specializationreorderdtypeslayoutsblocks	base_namer,   )r2   r<   rC   r>   r   matmul_repr;   s   " 	z%make_matmul_repr.<locals>.matmul_reprr   )rT   r,   rU   r   rS   r   make_matmul_repr9   s   rV   c              
      s4  ddl m} t }|d |d |d }}}|d |d |d }}	}
|d	}|d
   d ur|d ur<| dn| rHt   nd| rft  }|
 |
	      dk  }n|d ury||d  }|
 |
	  }nd }d}|d	}|d ur| dnn
d }|
 |
	  } fdd}|	j
jd }d}d|v r|d dkr|d|d d }|j d| |d| d|d| d|d| d|j 
|d< |d }|d ur|dkr|d  d| 7  <  d ur|d u r|S |d ur|n|}|d ur
|n|}d| | | |d| < |dd }|	 |		  }| |	  } d ur|d us;J |d }|d urv| rvt|d }tjt||jtjd!}|d k}|| ||| < |d |fd kjdd" }n|}||	jd   |		  }||jd   |	  }t|| | |d#< |S )$N   )launch_metadata_allow_syncMr.   KYPtrXPtrWPtrTOKENS_PER_EXPT_FOR_ANNOTATIONExptHistr9   unknownr   N_EXPTS_TOTc                    s0   |d ur|  d| S dt   d|  d S )Nz = E_(z) = )len)sr=   histn_rowsr   r   r-      s   0 z(matmul_launch_metadata.<locals>.<lambda>   rB   
batch_sizer5   Bz, z [z] stgnameEPILOGUE_SUBTILEz ep/g       @flops
GatherIndxN_EXPTS_ACT)devicer   )dimbytes)proton_optsrX   dictgetintfloatmeansumnumelelement_sizer   itemsizerl   
num_stagestorch	full_likearangerd   rr   int32viewanyshape)gridkernelargsrX   r;   rY   r.   rZ   r?   r@   rA   tokens_per_exptn_tokens	n_w_bytesreprnbits
batch_repr
ep_subtilefMfKgindx	n_x_bytes	n_y_bytesn_expts_actdstidxmaskn_read_rowsr   rf   r   matmul_launch_metadata\   sn   
&
>
 r   )r   tritontriton.languagelanguager   constexpr_functionr   r   jit	constexprr   r%   rV   r   r   r   r   r   <module>   s    
#