o
    پiq                     @   s  d dl mZ d dlZd dlZd dlZd dlZd dlmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZmZ d dlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZm Z m!Z! ddl"m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( eddG dd dZ)eddG dd dZ*eddG dd dZ+G dd deZ,e)Z-e. a/e)0 e)0 fde)de)fddZ1dej%fdd Z2d!d" Z3eddG d#d$ d$Z4eG d%d& d&Z5d'd( Z6eG d)d* d*Z7d+d, Z8d-e7fd.d/Z9d0d1 Z:			2dLd3ej%d4ej%d5ej%d6ej%d7edB d8edB d9ej%dB d:e;d;e;fd<d=Z<d>d? Z=										dMd@edB dAedB dBedB dCe5dB dDej%dB dEej%dB dFe>dB dGej%dB de*dB de+dB fdHdIZ?							dNd@edAedBedCe5fdJdKZ@dS )O    )	dataclassN)Enumauto)target_info)
InFlexDataOutFlexData)
GatherIndxRoutingDataScatterIndx)is_cuda   )_matmul_ogs)_p_matmul_ogs"get_per_device_per_stream_alloc_fn)_reduce_grouped)MXFP_BLOCK_SIZE)make_opt_flagsupdate_opt_flags_constraintsInapplicableConstraint)
specialize)StorageTensorFP4bitwidthwrap_torch_tensorT)frozenc                   @   sH   e Zd ZU eed< ded< ee ed< e Zee ed< edd ZdS )	FnSpecsnameztriton.runtime.jit.JITFunctionfnfn_arg_namesfn_arg_do_not_specializec                   C   s   t dd t S )Ndflt)r   tuple r#   r#   M/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/matmul_ogs.pydefault    s   zFnSpecs.defaultN)	__name__
__module____qualname__str__annotations__r"   r    staticmethodr%   r#   r#   r#   r$   r      s   
 r   c                   @   s<   e Zd ZU e Zeed< e Zee	 ed< dZ
eed< dS )FusedActivationspecsfn_argsr   reduction_nN)r&   r'   r(   r   r%   r-   r*   r"   r.   objectr/   intr#   r#   r#   r$   r,   %   s   
 r,   c                   @   sN   e Zd ZU e Zeed< e Zee	 ed< e Z
ee	 ed< dZeed< dS )Epiloguer-   fn_arg_values_matmulfn_arg_values_finalizeNeffective_itemsize)r&   r'   r(   r   r%   r-   r*   r"   r3   r0   r4   r5   floatr#   r#   r#   r$   r2   ,   s
   
 r2   c                   @   s   e Zd Ze ZdS )FnNameN)r&   r'   r(   r   QUANTIZE_MXFP8r#   r#   r#   r$   r7   3   s    
r7   epiloguefused_activationc                 C   s   |j | j f}|tv rt| S |j| jd}|j| jd}|j| j }dd l}|dd| }|tj	|j
< tt||||d|_tt||||d|_tt||||d|_|t|< |S )N)ACTIVATION_FNEPILOGUE_FN)activation_fn_argsepilogue_fn_argsr   matmul_ogs__)do_not_specialize)r   _kernelsr   r   r    types
ModuleTypejoinsysmodulesr&   r   r   r   r   )r9   r:   keyspec_constantsspec_tuplesrA   rC   moduler#   r#   r$   get_kernels<   s0   


rL   tensorc                 C   s<   d}d}t | jD ]}|| j| d | | 7 }q	||kS )Nir   r   )rangendimshapestride)rM   	max_int32offsetir#   r#   r$   can_overflow_int32]   s
   rU   c                  G   s   t dd | D S )Nc                 s   s     | ]}|d uot |V  qd S N)rU   ).0rM   r#   r#   r$   	<genexpr>f   s    z(should_upcast_indices.<locals>.<genexpr>)any)argsr#   r#   r$   should_upcast_indicese   s   r[   c                   @   s8   e Zd ZU e Zeed< e Zeed< e Zeed< dS )FlexCtxlhs_datarhs_dataout_dataN)	r&   r'   r(   r   r]   r*   r^   r   r_   r#   r#   r#   r$   r\   o   s   
 r\   c                   @   s   e Zd ZU dZeed< dZeed< e Z	eed< dZ
eed< dZeed	< dZeed
< dZedB ed< dZedB ed< dZedB ed< dZejed< dZeed< dS )PrecisionConfigNmax_num_imprecise_accT
allow_tf32flex_ctxg      ?	acc_scaleFflexpoint_saturate_infreport_quantization_err_fn	act_scaleweight_scale	out_scale	out_dtypeenforce_bitwise_invariance)r&   r'   r(   ra   r1   r*   rb   boolr\   rc   rd   re   rf   callablerg   r   rh   ri   rj   torchdtyperk   r#   r#   r#   r$   r`   u   s   
 r`   c                 C   s*   t ddr| jd uo|jdko|jS dS )N
   r   @   F)r   cuda_capability_geqrh   block_mis_persistent)precision_config	opt_flagsr#   r#   r$   get_swap_xw   s   rw   c                   @   s<   e Zd ZU eed< eee ejf ed< e	eef ed< dS )MatmulAllocationdeviceoutputscratchpadsN)
r&   r'   r(   r)   r*   r"   r1   rn   ro   dictr#   r#   r#   r$   rx      s   
 rx   c                 C   s  |j d }| j d }	|d ur|jj d }	|jdks|d u r |	}
n|jj d |j }|}
| jdkr5| j d nd}||
||j f}|jpD| j}||f}t }|jdksX|d urm|j	sm|jdkr`t
jn|}|jd|	|f|f|d< d|v r|jd ur|jd|	t|tft
jf|d< t| j||S )Nr   r      matmulmx_out_scale)rP   src_indxn_expts_actrO   r/   rj   ro   r|   split_kfused_scatterrn   float32ri   tritoncdivr   uint8rx   ry   )xwru   r:   routing_datagather_indxscatter_indxrv   NMy_rowsMc	batch_dim	out_shaperj   rz   
scratchpadscratch_out_dtyper#   r#   r$   init_allocation   s&   

 r   
allocationc                    s|   t  }|d u rtj jd  j jd d}n
|j jd ks!J |d d d d d f |d<  fdd j D |d< |S )Nr   r   ry   ro   rz   c                    s,   i | ]\}}|t j|d   j|d dqS )r   r   r   )rn   emptyry   )rW   kvr   r#   r$   
<dictcomp>   s    z$apply_allocation.<locals>.<dictcomp>r   )r|   rn   r   rz   ry   rP   r{   items)r   rz   retr#   r   r$   apply_allocation   s    

r   c                 C   s   || j jksJ dg|| j j  t| j j }| j |}|dg|| j j  t| j   }| j ||}|d ur@||}t|| j	S )Nr   r   )
datarO   listrP   viewrQ   
as_stridedreinterpretr   layout)storageout_ndim	flex_datanew_storage_shapenew_storage_viewnew_storage_stridenew_storage_datar#   r#   r$   _canonicalize_storage   s   	&
r   Fr   indxoutout_mx_scalex_flexout_flex
x_mx_scalerj   re   c                 C   s  |du r| j d dkr| ddfS |dur|j d }n| j d }|du r(t }|du r/t }|du r5dn|j d }|	du rA| jn|	}	| j d |j dksOJ d}|du rWdn|j}|du r`dn|j}|du ridn|j}|du rrdn|j	}|du r{dn|
d}|du rdn|
d}|du rdn|
d}t|j|j}|j|f || | 
d| 
d| 
d||||
d|
d||||| j d | j d |||||g|j|j|jR |du|du|
||dd	 ||fS )
a~  
    In-place grouped row reduction.

    Arguments
    - x: Tensor[AnyFloat] of shape [(num_groups * K), N]
    - indx: Tensor[Int] of shape [num_groups, K]

    Description
    For each group g in [0, num_groups), this routine sums the K rows of `x`
    specified by `indx[g, :]` and overwrites the row corresponding to the first
    valid (non-negative) index with the per-group sum. Accumulation is performed
    in float32 for numerical stability, and the result is written back in the
    dtype of `x`.

    Behavior and edge cases
    - Invalid (-1) entries are skipped during accumulation and do not generate
      memory traffic. If a group has no valid entries, nothing is written for
      that group.
    - Reduction is performed tile-by-tile along the N dimension within a single
      kernel launch (persistent along N) to minimize launch overhead.

    Performance notes
    - Memory traffic per group is approximately (valid_rows_read + 1) * N * sizeof(x),
      plus index reads. With no invalid entries, this becomes (K + 1) reads/writes
      of length N per group.

    Returns
    - The input tensor `x` (modified in place).
    Nr   r   r~   r}   i      r   )HAS_IN_MX_SCALEHAS_OUT_MX_SCALEFLEXPOINT_SATURATE_INFBLOCK_NK	num_warps)rP   squeezer   r   ro   r/   scaleexpected_scaleactual_scalechecksum_scalerQ   rL   r-   r   r   r.   r4   )r   r   r   r   r:   r9   r   r   r   rj   re   
num_groupsr   r   x_expected_scaleout_expected_scaleout_actual_scaleout_checksum_scale
stride_mxb
stride_mxsstride_omxskernelsr#   r#   r$   reduce_grouped   sT   "

 	
r   c                 C   s   t d| i dS )z;
    persistent kernels will leave `num_idle_sms` idle
    idle_smsN)r   )num_idle_smsr#   r#   r$   matmul_ogs_set_idle_sms%  s   r   r   r   r   ru   betasgammas	out_alphayc           Q      C   s  | j dk}|r0|du sJ d|du sJ d|du sJ d|j dkr.|jd | jd ks0J |du r7t }|du rDtt t d}|du rStt t t d}|du rdtddt	d|jd d}|j
}|du}t o{td	d o{t|jd
k}|r|ddksJ dt|ts|jtjkrtn|j}t||d}|durt|tst|}|dur|jtj|j_tj|_|j}|du}|r| ddksJ d|durt|tst|}t| tst| | jd} |du}|du}|jdu}|du r| jd n|jjd }|jdu r|j dkr|jd nd}|jdd \}}|| jd ks-J | j dkrF|j dkrF| jd |jd ksFJ |jpL| j}|  dkop| j op| dkop|j op|du pp|j }|otj ! d dkpt|jdk}|o|j"j#du o|j"j#du o|j$dk}t%|| j|j||||||||j&}|s|j'rt(d|dur|j)rt* st+d|dur|jj,j-dur|j)st* rt+d|}t } |j.dks|dur|j's| |}} t/| |||||||}!t0|!|
}"|| | dkr&|"d 1d}#|s$|#1d}#|#S |j)r2t23t4| j5 d|"d v }$|"d 6d|"d }%|%jtj7krLt8 n|j9j:}&|j;}'|'duro|'jtj}'|$rod|"d v ro|"d d }'|'duoy|%< dk}(|j9})|du rdn|d}*|du rdn|jjd }+|j=},|j>}-|,du rdn|,j?}.|,du rdn|,j@|- d }/|,du rdn|,jA}0|,du rdn|,jB|- }1t2C||j>}2|1dur|D||j>}2t2C||jE}3||2 |3 |j. }4|j)rtFtG |jH |4n|4}5|otI }6|j'otI }7t|j'r"|%tJK|%jdd |%jd n|%jtJK|%jdd g|%jdd R  }
tL| j|6rAdnd|)jM}8tL|jd|)jN}9tL|
j|7rVdnd|)j:}:|j)od|6pd| };|6rmd|jOgnd|j>|jOg}<|;sydn	|r|6sdnd}=|;r|8P|<|=n|8j}>|j)o|7p|j' }?|jE|jQ |jR }@|7rd|@gnd|j>|@g}A|?sdn	|r|7sdnd}B|?r|:P|A|Bn|:j}C|j)}D|Dr|9Pd|jO|jEgdn|9j}E|}F|j)o|du}G|Gr|jP|jE|jOgdn|}Fdgd|8jj   tS|8j  }H|r| nd}IddtT|I  |I }I|r&|Gs&| nd}JddtT|J  |J }J|(r9|' nd}KddtT|K  |K }KtU|j"|j"}L|9j d dk}M|j)r\|LjVn|LjW|5f g |C|:j|% |(rsd|'dfn|&|Kdd |>|8j|H|)jMjX|du rdn|jtj|I|E|9j|9j |M|)jNjX|F|J||*| jd |jdu r| jd nd|||||du rdn|j|du rdn|j|+|j'sdn|jY|j'sdn|jYjd |.|0|/|1||2|3|	|jZ|jR|j[|j\|j$|j]|j^|j_|)jNj`|j>|jE|jO|jaR i d |jbd!|jj,j-d"|du r?dn|jj,j-d#|jQd$|j.d%||jO dkd&|jcd'|jdd(|jed)|jfd*|jgd+th| ||%d,|=d-|Bd.ti||d/|j"j-tjjkj-kd0|j)r|5nd|jl |du s|j'rdn|jd|j$}Ntm|%|N|"d 1d|j;| |tn|&j|&jod1|j9j:|(r|'1dnd|"d j|j_d2\}O}P|s|O1d}O|Pdur|P|_;|OS )3zs
    Y[:, :] = 0.
    for e in num_experts:
        Y[idxs_y_m(e), :] += matmul(X[idxs_x_m(e), :], W[e, :, :])
    r   N$gather not supported in batched mode%scatter not supported in batched mode%routing not supported in batched moder   r   Frp      r~   zE`w` must be column-major when it has data-type FP8 on capability < 10ro   r}   z0'x' must be row-major when it has data-type mxfp	      zFused scatter is not supportedz1Must use non-persistent kernel for simulated MXFPz?Must use persistent kernel and be TMA-compliant for native MXFPrz   r   r   r   r   raggeddense)NNN)r   )NNNNXCD_SWIZZLESWIZZLE_MX_VALUESWIZZLE_MX_SCALEEPILOGUE_SUBTILESPLIT_KEVEN_KW_CACHE_MODIFIERTOKENS_PER_EXPT_FOR_ANNOTATIONr   
num_stagesarchUPCAST_INDICES
X_TMA_MODE
Y_TMA_MODESWAP_XWIS_EPILOGUE_QUANT_MXFP8NUM_SMS)ro   r   )r   r   r   rj   re   )prO   rP   r`   r,   r   r%   r"   r2   r	   maxrh   r   r   rr   r   ro   rQ   
isinstancer   rn   r   r   r   r   r   r   rg   	expt_histr   rj   numelis_tma_compliantcudaget_device_capabilityr-   r   r   r   r5   r   r   rt   has_native_mxfpNotImplementedErrorr   r   r   r   r   r   r   set_allocatorr   ry   getr   r   rc   r_   ri   element_size	expt_datars   histtoken_offs_padtoken_offs_rawblock_pid_mapr   n_blocksblock_nminnum_smsr   has_tma_gathermathprodr   r]   r^   block_kmake_tmaepilogue_subtiler/   r   lenrL   r   r   r   dst_indxr.   r3   n_expts_totra   rb   re   is_per_batchgroup_mxcd_swizzlew_cache_modifierexpected_tokens_per_exptr   r   r   r[   rw   r7   r8   target_kernel_kwargsr   r   r   )Qr   r   biasr   r   r   ru   r   r   r   r   r:   r9   is_input_batchedw_scalew_has_mxis_hopper_fp8ro   x_scalex_has_mx
has_gatherhas_scatter	is_raggedr   
batch_sizer   r   rj   can_use_tmacan_use_fused_scatterrv   matmul_fused_activationreduce_fused_activationr   memoryr   has_scratchpad
out_matmulout_matmul_flexout_matmul_scaleout_matmul_has_mxflexbias_stridenum_indxr   rs   r   expt_hist_sumexpt_token_offs_rawexpt_block_pid_mapgrid_mgrid_nmax_gridgridhas_gather_tmahas_scatter_tma	x_storage	w_storage	y_storage	x_has_tmax_tma_block_size
x_tma_modex_tensor_or_tma	y_has_tmar   y_tma_block_size
y_tma_modey_tensor_or_tma	w_has_tmaw_tensor_or_tmaw_scale_tensor_or_tmaw_scale_has_tma	x_stridesx_scale_stridesw_scale_stridesout_matmul_scale_stridesr   w_transpose
group_indx	out_finalout_final_mx_scaler#   r#   r$   
matmul_ogs+  sD  
""


 &
(,,





 Z"  
		

 !"
#$%&'()*+,-./
012&5


rI  c              
      s   j dk} jjdksJ |jjdksJ |r@|d u sJ d|d u s'J d|d u s/J d|j dkr>|jd  jd ks@J |	d u rHdd }	|
d u rPd	d }
|d urc|j dkrc|jdg|jR  }|j d
krr|jdg|jR  } j d
kr jdg jR   |d u rtd d |jd d}|j}|jdkr|s|j}t	j
|jd d t	jd}t	|d|dd < tt|}n fddt|jd D }|d u r҈ jd n|jjd }t	j
 jd ||jd f j jd}t|D ]\}\}}|d u rt	j|| jd}n	|j|| | }|r|nd}t	|	 ||d d f t	j||dd ||  }|d urT||d u rC||d d f n||d d f |||d f  7 }|d urc||||d f 9 }|
|||||d d f< q|s||jd |jd
 }|d u r|S |jd | }t	j
||jd ft	j jd}t|D ]2\}\}}|j|| | }|dk}||| d d f  |||d d f |d d f  7  < q|S )Nr   r   r   r   r   r   c                 S      | S rV   r#   )r   idxr#   r#   r$   <lambda>2      z"matmul_ogs_torch.<locals>.<lambda>c                 S   rJ  rV   r#   r   r#   r#   r$   rL  4  rM  r   r   c                    s   g | ]	}d  j d gqS )r   r   )rP   )rW   r@   rN  r#   r$   
<listcomp>E  s    z$matmul_ogs_torch.<locals>.<listcomp>r}   r   )ry   r   )ro   ry   )rO   ro   itemsizerP   r   r	   r   r
  r   rn   zerosint32cumsumr   	itertoolspairwiserN   r	  ry   	enumeratearanger   r   r6   r   )r   r   r  r   r   r   ru   r   r   round_xround_yr  r   sizesoffoffsn_rowsr   rT   lohirK  batchr   dst_idxmskr#   rN  r$   matmul_ogs_torch   sj   
	"

&
*

>

@rc  )NNNNF)
NNNNNNNNNN)NNNNNNNN)Adataclassesr   rT  rF   rn   r   enumr   r   r  triton_kernelsr   triton_kernels.numericsr   r   triton_kernels.routingr   r	   r
   triton_kernels.target_infor   matmul_ogs_details._matmul_ogsr    matmul_ogs_details._p_matmul_ogsr   r   "matmul_ogs_details._reduce_groupedr   numerics_details.mxfpr   matmul_ogs_details.opt_flagsr   r   r   r   rM   r   r   r   r   r   r   r,   r2   r7   EpilogueSpecsr|   rB   r%   rL   rU   r[   r\   r`   rw   rx   r   r   r   rl   r   r   r6   rI  rc  r#   r#   r#   r$   <module>   s    !
	
O	

 w