o
    i=                     @   s>  d dl Z d dlZd dlmZmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddl	mZ dd	lmZ eG d
d dZeG dd dZeG dd dZeG dd dZG dd de jjZdd ZG dd de jjZdd Zdd ZdZdd Zdd  Zd!d" Zd#d$ Zd.d&d'Zd(d) Zd/d*d+Z d0d,d-Z!dS )1    N)	dataclassfield   )_combined_routing_compute)_combined_routing_memset)_routing_clear_bitmatrix)_expt_data_memset)_expt_data_compute)is_hipc                   @   &   e Zd ZU dZejed< ejed< dS )
GatherIndxzG
    Indices for an operation that performs:
    Y = X[src_idx, :]
    src_indxdst_indxN__name__
__module____qualname____doc__torchTensor__annotations__ r   r   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/third_party/triton_kernels/routing.pyr         
 
r   c                   @   r   )ScatterIndxzG
    Indices for an operation that performs:
    Y[dst_idx, :] = X
    r   r   Nr   r   r   r   r   r      r   r   c                   @   sN   e Zd ZU ejed< ejed< eeejf ed< eeejf ed< dd ZdS )ExptDatahisttoken_offs_rawtoken_offs_padblock_pid_mapc                 C   s   | j d ur| j jtjksJ | jd ur| jjtjksJ | jd ur1| j D ]
}|jtjks0J q&| jd urF| j D ]}|jtjksEJ q;d S d S N)r   dtyper   int32r   r   valuesr   )selfvr   r   r   __post_init__7   s   



zExptData.__post_init__N)	r   r   r   r   r   r   dictintr&   r   r   r   r   r   "   s   
 

	r   c                   @   sp   e Zd ZU e Zejed< e Zejed< e Z	e
ed< e Ze
ed< dZeed< eddZe
ed< d	d
 ZdS )RoutingData	gate_scal	expt_histn_expts_totn_expts_actN	expt_data)defaultexpected_tokens_per_exptc                 C   s4   || j kr|S tt|| j  d d|| j  d S )Nr   r   )r,   tritoncdivmax)r$   n_rowsblock_mr   r   r   n_blocksQ   s   
&zRoutingData.n_blocks)r   r   r   r   r*   r   r   r   r+   r,   r(   r-   r.   r   r0   r6   r   r   r   r   r)   D   s   
 r)   c                   @   s$   e Zd Zedd Zedd ZdS )
SortTokensc           $      C   s  d}d}d}t j}|j}	|j}
|j\}}|j\}}|| }|j|d\}}|d | }|jtjks3J tj|tj|	d}tj|d tj|	d}|d | }||d  }tj||
|	d}t	|||\
}}}}}}}}}} ||d || d }!|||}"t
||! f ||d d||||jd	 |||jd	 |d	|d||d	|||| |d|d
 |}#t||" f ||||||#|#d	|#d|||||||d	||d	|| || || _|| _|| _| | |||||||fS )N       i   )partials_block_sizer!   device   r   r   )SIZESBLOCK_ABLOCK_NBLOCK_M)r1   r2   r<   r!   shapesumr   r"   empty_compute_expt_data_internalr   strider   n_tokens_rawn_tokens_padr-   save_for_backward)$ctx	expt_scal	expt_indxr,   	bitmatrixHIST_BLOCK_MINDX_OFFS_BLOCK_MMEMSET_BLOCKr2   r<   r!   rH   _rI   r-   n_gates_padr   partial_hist	expt_offscombined_indx	topk_indx	gate_indxr*   token_offs_combinedr   r   r   blocks1ablocks2aMEMSET_BLOCK_AHIST2_BLOCK_Mblock_m_log2_startblock_m_numblocks1bblocks2b	indx_offsr   r   r   forward_   sZ   




	
zSortTokens.forwardc           	      C   s,   | j \}|| }|| j| j}|d d d fS r    )saved_tensorsreshaperI   r-   )	rK   _0_1_2
dgate_scal_3_4_5rX   r   r   r   backward   s   zSortTokens.backwardN)r   r   r   staticmethodrc   rm   r   r   r   r   r7   ]   s
    
8r7   c                 C   s   t | |||S r    )r7   apply)rL   rM   r,   rN   r   r   r   sort_tokens   s   rp   c                   @   s   e Zd Zedd ZdS )PruneRoutingc                 C   s   ddl m } |jd }|| dksJ t|f |jj|jjd|jjd|jjjd || dd ||||\}}|| }||jd< |||fS )Nr   )
compactionr   r9   )rA   r>   )rr   rC   r   storagedatarG   )rK   rL   rM   rN   r,   simulated_eprr   rI   r   r   r   rc      s   
	

zPruneRouting.forwardN)r   r   r   rn   rc   r   r   r   r   rq      s    rq   c                 C   s   t | ||||S r    )rq   ro   )rL   rM   rN   r,   ru   r   r   r   prune_routing   s   rv   c                 C   s,   | dkr| | d @ dksJ d|   d S )Nr   r   zx must be a power of two)
bit_lengthxr   r   r   log2_power_of_two   s    rz      c              
      s*  d d}| j }|}tjt rdnd}|t }||kr|}n|d || d dt   } fdd}tj}	tj|d ||d f|	|d}
|
d	 d |d  }|
dd  }tj|||f|	|d}t|  }|d d d |d f }|d d d |f }|| d }|| }|
||||| |t|f
S )
Nr9   	      r   r=   c                    s   |    S r    r   rx   rQ   r2   r   r   <lambda>   s    z-_compute_expt_data_internal.<locals>.<lambda>r;   r   )	r<   r1   r2   r
   r^   r   r"   rE   numel)r+   r,   n_gatesr]   r<   block_m_log2_endr_   max_n_tilespadr!   rY   r   r   r   memset_gridblocks1blocks2r   r~   r   rF      s,    rF   c                    s.   t  jd  } fddttt |D   S )Nr   c                    s&   i | ]\}}d |  |ddf qS )r=   Nr   ).0ijrx   r   r   
<dictcomp>   s   & z%_unpack_into_dict.<locals>.<dictcomp>)r^   rC   	enumeraterange)ry   r   r   rx   r   _unpack_into_dict   s   r   c                 C   s   | d u rt d d d d S t| ||\
}}}}}}}	}
}}t|f | |||d||||	dd	 t|f | ||d||d|||
dd	 t|}t|}t | |||S )Nr   r{   )r?   BLOCK	num_warps)r   rF   r   rG   r	   r   )r+   r,   r   rY   r   r   r   r   r   rQ   r]   r^   r_   r   r   r   compute_expt_data   s(   
r   c                 C   sh   t |||| \}}}}}	}
}t|
}
t|}t||	|
|}t||d}t||d}t|||||||fS )Nr   r   )rp   r   r   r   r   r)   )rN   rL   rM   r,   r-   r   rW   rX   r*   r   r   r   r.   gather_indxscatter_indxr   r   r   routing_from_bitmatrix  s   r   Fc           
      C   sz   ddl m } |rtj| dd} || || ||d\}}}| jd | }	|dkr5t|||| jd |\}}}t||||	|S )Nr   )topkr>   dim)apply_softmaxy_indxr4   )r   r   softmaxrC   rv   r   )
logitsr-   sm_firstrM   ru   r4   r   rL   rN   r,   r   r   r   routing$  s   r   c                 C   s  | j }tj| dd}ttjd|d|f}| }g d}t r&|d ||kr-|}n|d || d t|  }t	 }t	 }|D ]}	| |	 d |	 }
tj|
dd||	< ttjd|d||	 f||	< ||	  ||	< tj
|tj|d ||	< tj||d}tj||dd d d f |d> d d d f  }||	 d d	d f |d d d f  }|d d d f |
d d d f k }||	 || f| |  qCt| |||S )
Nr   r   r   )r<   )   r8   @         r;   r   r>   )r<   r   cumsumcatzerosr(   r
   appendminr'   onesr"   arange
index_put_r   )r   r,   r   r<   r   block_msr   r   r   r5   n_tilescolmap_valsmap_idxsmaskr   r   r   compute_expt_data_torch7  s0   
 .$  r   c                 C   sT   |r|}nt j|  dddd d d |f }| }t j| |dd}| }||fS )Nr   T)r   stabler   )r   argsortlongtake_along_dimr(   )valskrM   has_user_provided_indxtk_indxtk_valr   r   r   
topk_torch_  s   "r   c                 C   s:  |d u}| j d | }|d ur| d |d d f } | j \}}|r'tj| dd} t| |||d\}	}|s:tj|	dd}	|sLtj|dd\}}
t|	d|
}	|	d}	|dtj}tj	|dd}tj	|dd}|	| }tj
|||d d }t| | d	}t| | d	}t|||}t|||||||fS )
Nr   r>   r   )r   r   T)r   )binsr3   r   )rC   r   r   r   sortgatherre   tor"   r   histcr(   r   r   r   r)   )r   r-   r   rM   r4   r   rS   rR   r,   rL   sort_indicesrW   rX   r*   r   r   r   r.   r   r   r   routing_torchk  s.   

r   )FNr   N)F)FNN)"r   r1   dataclassesr   r    routing_details._routing_computer   r   r   routing_details._expt_datar   r	   target_infor
   r   r   r   r)   autogradFunctionr7   rp   rq   rv   rz   r^   rF   r   r   r   r   r   r   r   r   r   r   r   <module>   s>    

!C		"

(