o
    پik                     @   s   d dl Z d dlmZ ddlmZmZ e jdejfddZ	e jdejfdd	Z
e jd
d Ze jdejdejfddZe jdejdejdejdejfddZe jdejfddZe jdejdejdejdejdejf
ddZdS )    N   )_expt_data_compute_expt_data_memsetBLOCK_Nc                 C   s   || d | }t |g| jj}t|D ]9}|| t d| }||k }t j| | |d}	t |	d|	 | }
|t |	d7 }t j	|| |
|d ||7 }qd S )Nr   r   mask)
tlzerosdtype
element_tyrangearangeloadcumsumsumstore)
ExpertHistFinalExpertOffs	hist_sizer   loop_iterationsxioffs_nmask_nhist2
tok_starts r   c/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/routing_details/_routing_compute.py_routing_compute_expt_offs   s   
r   BLOCK_Mc                 C   s   t d|}d}td||D ]7}|| ||  }	t j| |	 ||k d}
t |
d| }|t |
d7 }t j| |	 ||
 ||k d ||7 }qd S )Nr   r   )r   r   r   r   r   r   r   )PartialHistshape_pm	stride_pm	stride_pnr   expt_idoffs_mcurr_sum_offscurroutr   r   r   _routing_compute_indx_offs   s   
r+   c                 C   s2   d}| |@ }||@ }t ||k| | | |}|S )N      )r   where)r   ykey_maskkxkyzr   r   r   
_keyed_add$   s
   r3   N_EXPTS_ACTc                 C   sr  t |
tjr|
j rt|
}
|
| }t|| dk td|| }| | | | }tj|| ||k ddtj	}|d> |B tj	}t
|d}|d? }| | | |d@  }|dk}tj|| |d}|d@ d	B }t|dt}|d	 d@ }tj|| |  ||  |d}|tj|	| |d7 }||7 }tj|| ||d tj|| ||d tj|| ||d d S )
Ni   r   )r   other   i  r   r,   r   )
isinstancer   tensorr
   is_ptrr   static_assertr   touint32sortassociative_scanr3   r   )pid_m
GatherIndxScatterIndxGateScalExptScalExptIndxPartialOffsr"   r#   TokensStartn_tokensr   r4   n_gates
local_offsr(   expertkv_pairsr   	gate_scalr   expts_and_inclusive_run_lengthsexclusive_run_lengthsgatesr   r   r   _routing_compute_indx0   s,   
 rQ   SIZESBLOCKc                 C   sX   t d}||k rt|||||||| d S ||8 }t|| |||||||||	|
| d S )Nr   )r   
program_idr   rQ   )rA   rB   rC   rD   rE   rF   r"   r#   rG   rH   r   r4   HistMDTileStartstile_starts_stridem
MDTileInfotile_info_stridemfirst_tile_dim_log2rR   rS   blocks2apidr   r   r   _combined_routing_computeT   s   

r]   c                 C   s   t d}|d }|d }d|> d }	td||D ]A}
|
t d| }t j| ||  ||  ||k d}t ||k||	@ |}t ||kd|}t j| ||  ||  |||k d qd S )Nr       r   r   )r   rT   r   r   r   r-   r   )	Bitmatrix	stride_bm	stride_bnshape_bncutoffr   r@   cutoff_word
cutoff_bitcutoff_maskstart_nr   valuesr   r   r   _routing_clear_bitmatrixd   s   
"&ri   BLOCK_Ac              	   C   s   t d}||k rt|||||||| dS ||| kr%t|||| dS ||| k r8t||	|
||||  dS || | d | t d| }||k }t j| | ||d dS )a3  
    This kernel essentially combines 6 different pieces of functionality,
    statically branching on the value of tl.program_id(0) to decide which
    codepath to take.

        pid == 0:                                  create the token cumsum
        1 <= pid <= SIZES:                         create a tile cumsum
        SIZES < pid < blocks1a:                    initialise MDTileInfo to 0xffffffff
        blocks1a <= pid < blocks1a + n_expts_tot:  compute_indx_offs
        pid == blocks1a + n_expts_tot:             compute_expt_offs
        pid > blocks1a + n_expts_tot:              initialise Indx to sentinel

    As each of these is a relatively trivial workload, launching them from
    this single trampoline is beneficial as they can execute on different
    streaming multiprocesses in parallel.
    r   r   r   N)r   rT   r   r   r+   r   r   )IndxsizesentinelrS   r   r   r   n_expts_totr    r!   r"   r#   MDStartsrW   blocks1arX   rZ   rR   rj   r   r   r\   r(   r   r   r   r   _combined_routing_memsetr   s   
 rq   )tritontriton.languagelanguager   
_expt_datar   r   jit	constexprr   r+   r3   rQ   r]   ri   rq   r   r   r   r   <module>   sN    
#