o
    'iN                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZ ddl	Z	ddl
mZ ddlmZ d	d
lmZ d	dlmZmZ ejdd Zde	jdee	j dedededededee	je	je	je	je	je	jf fddZde	jde	jde	jde	jde	jde	jdedededededdfddZd e	jd!e	jd"e	jd#e	jde	jd$e	jd%e	jdededdfd&d'Zd(eddfd)d*Zdedefd+d,Zdedefd-d.Zd/e	jd0ee	j d1ee	j d2e	jdedededed3ededee	je	je	je	je	je	je	je	jf fd4d5ZeG d6d7 d7ZG d8d9 d9ZdS ):a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)	dataclass)SimpleNamespace)OptionalTuple   )gen_comm_alltoall_module)register_custom_op   )Mapping)MnnvlMemoryMnnvlConfigc                     s
  t    tdg ddtjdttj dtdtdtdtd	td
ttjtjtjtjtjtjf f fdd} tdddgddtjdtjdtjdtjdtjdtjdtdtdtdtd	td
d f fdd}tddgddtjdtjdtjdtjdtjdtjdtjdtd	td
d f fdd}tdg dd td
d f fd!d"}td#g dd	td
tf fd$d%}td&g dd	td
tf fd'd(}td)g dd*tjd+ttj d,ttj d-tjdtdtd	tdtd.tdtd
ttjtjtjtjtjtjtjtjf f fd/d0}t| ||||||d1S )2Nz$flashinfer::moe_comm_prepare_indices)mutates_argsgathered_target_rank_idsreal_rank_token_count_cum_summax_token_count_per_rankexpert_counttop_kep_rankep_sizereturnc                    s   | j }t||}tj|| |tjd}	tj|f|tjd}
tj|| |tjd}tj||tjd}tj|| |tjd}tj|| |tjd} | ||	|
||||||||| |	|
||||fS )N)devicedtype)r   maxtorchemptyintmoe_comm_prepare_indices)r   r   r   r   r   r   r   r   max_send_ranks_per_tokenlocal_gather_indicessend_rank_count_cum_sumsend_rank_local_indicesrecv_rank_count_cum_sumrecv_rank_local_indicesbackward_recv_rank_local_indicemodule \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/flashinfer/comm/trtllm_alltoall.pyr   "   sX   

z:get_comm_alltoall_module.<locals>.moe_comm_prepare_indiceszflashinfer::moe_local_gatherlocal_expert_idslocal_scalesrecv_rank_cum_sumr   gathered_expert_idsgathered_scalesc                    s"     | |||||||||	|
 d S N)moe_local_gatherr*   r   r+   r,   r(   r)   r   r   r   r   r   r$   r&   r'   r.   d   s   z2get_comm_alltoall_module.<locals>.moe_local_gatherzflashinfer::moe_commoutputinputsend_rank_cum_sumsend_indicesrecv_indicesall_workspacesc	           	         s     | ||||||||	 d S r-   )moe_comm	r1   r2   r3   r0   r*   r4   r5   r   r   r$   r&   r'   r6      s   z*get_comm_alltoall_module.<locals>.moe_commz'flashinfer::set_moe_max_usable_sm_countmax_sm_countc                    s     |  d S r-   )set_moe_max_usable_sm_countr8   r$   r&   r'   r9      s   z=get_comm_alltoall_module.<locals>.set_moe_max_usable_sm_countz/flashinfer::get_moe_commworkspace_size_per_rankc                    
     | S r-   )#get_moe_commworkspace_size_per_rankr   r$   r&   r'   r<         
zEget_comm_alltoall_module.<locals>.get_moe_commworkspace_size_per_rankz3flashinfer::get_moe_prepare_workspace_size_per_rankc                    r;   r-   )'get_moe_prepare_workspace_size_per_rankr=   r$   r&   r'   r?      r>   zIget_comm_alltoall_module.<locals>.get_moe_prepare_workspace_size_per_rankzflashinfer::moe_prepareexperts_idsscalesexperts_statics	workspace
slot_countc
                    s  t j| jd}
t j|| |	ffi |
}t j|ffi |
}t j|ffi |
}t j|| ffi |
}t j|| ffi |
}t||	}t j|| ffi |
}t j|| ffi |
}t j|| ffi |
}t j|| ffi |
}|d urt j|| |	ft j|
d d}nd }|d urt j||ffi |
}nd } | ||||||||||||||||||||	 ||||||||fS )Nr   r   r   )r   int32r   r   r   float32moe_prepare)r@   rA   rB   rC   r   r   r   r   rD   r   attrsprepared_local_expert_idsr   r!   gather_recv_rank_indicesrecv_rank_indicesr   !gather_backward_recv_rank_indicesbackward_recv_rank_indicesgather_send_rank_indicessend_rank_indicesprepared_local_scalesgathered_expert_staticsr$   r&   r'   rH      s   


z-get_comm_alltoall_module.<locals>.moe_preparer   r.   r6   r9   r<   r?   rH   )	r   build_and_loadr   r   Tensorr   r   r   r   rS   r&   r$   r'   get_comm_alltoall_module   s:  
>	
	
	
_rV   r   r   r   r   r   r   r   r   c              	   C   s   t  | ||||||S r-   )rV   r   )r   r   r   r   r   r   r   r&   r&   r'   r   '  s   r   r*   r   r+   r,   r(   r)   c                 C   s$   t  | |||||||||	|
 d S r-   )rV   r.   r/   r&   r&   r'   r.   =  s   r.   r1   r2   r3   r0   r4   r5   c	           	      C   s    t  | ||||||||	 d S r-   )rV   r6   r7   r&   r&   r'   r6   Y  s   r6   r8   c                 C   s   t  |  d S r-   )rV   r9   r:   r&   r&   r'   r9   q  s   r9   c                 C      t  | S r-   )rV   r<   r=   r&   r&   r'   r<   w     r<   c                 C   rW   r-   )rV   r?   r=   r&   r&   r'   r?   }  rX   r?   r@   rA   rB   rC   rD   c
           
      C   s   t  | |||||||||	
S r-   )rV   rH   )
r@   rA   rB   rC   r   r   r   r   rD   r   r&   r&   r'   rH     s   rH   c                   @   sR   e Zd ZU ejed< ejed< ejed< ejed< ejed< ejed< eed< dS )	MoEAlltoallInfor   send_rank_count_cumsumr    recv_rank_count_cumsumr"    backward_recv_rank_local_indiceslocal_token_allocation_countN)__name__
__module____qualname__r   rU   __annotations__r   r&   r&   r&   r'   rY     s   
 





rY   c                   @   st  e Zd ZU dZeed< dZeed< dZej	ed< dZ
ej	ed< dZeed< ed)dedee fd	d
Ze	d)dedee fddZedej	dedefddZedej	dej	deej	 dej	dedededededefddZedej	dej	dej	dej	dededededefd d!Zed"ej	d#edej	dedef
d$d%Zed"ej	d#edej	dededed&efd'd(ZdS )*MnnvlMoeNmoe_workspacemoe_prepare_workspacemoe_workspace_tensormoe_prepare_workspace_tensormoe_mappingmappingconfigc                 C   sd   t jd ur| t jksJ dt jS | t _t| j}|r!t| | t| |t _t jt	j
t _t jS Nz"only one moe mapping supported now)rb   rc   rg   re   r<   tp_sizer   set_comm_from_configas_torch_strided_tensorr   uint64rh   ri   workspace_size_per_rankr&   r&   r'   get_moe_workspaces  s   

zMnnvlMoe.get_moe_workspacesc                 C   s^   t jd ur| t jksJ dt jS t| j}|rt| | t| |t _t jt	j
t _t jS rj   )rb   rf   rg   r?   rk   r   rl   rd   rm   r   rn   ro   r&   r&   r'   get_moe_prepare_workspace  s   
z"MnnvlMoe.get_moe_prepare_workspacetoken_selected_expertsr   r   c                 C   s(   || dks
J d|| }| | }|S )Nr   z+expert_count should be divisible by ep_sizer&   )rs   r   r   expert_per_ranktoken_target_rank_idsr&   r&   r'   compute_target_rank_id  s   zMnnvlMoe.compute_target_rank_id
expert_idsrA   expert_staticsrC   r   r   rD   r   c
                 C   sV   t | |||||||||	
\}
}}}}}}}|| }d }t|||||||}||
||fS r-   )rH   rY   )rw   rA   rx   rC   r   r   r   r   rD   r   prepared_local_expertsrQ   local_send_rank_count_cumsumlocal_send_rank_indiceslocal_recv_rank_count_cumsumlocal_recv_rank_indices backward_local_recv_rank_indicesrR   r]   r   alltoall_infor&   r&   r'   -mnnvl_moe_alltoallv_prepare_without_allgather  sJ   z6MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgatherr   real_rank_token_count_cumsumr+   r,   c	                 C   s   t | ||||||\}	}
}}}}|| }tj||tjtdd}tj||tjtdd}t||	||||||||| t|	|
|||||}|||fS )NcudarE   )r   r   r   rF   r   rG   r.   rY   )r   r   r+   r,   r   r   r   r   r   r   rZ   r    r[   r"   r\   r]   r(   r)   r   r&   r&   r'   mnnvl_moe_alltoallv_prepare  sh   
	z$MnnvlMoe.mnnvl_moe_alltoallv_preparexr   c              
   C   sZ   |   dks
J dtj|j| jd | jtdd}t| |j|j	||j
|j|||	 |S )Nr   z)only 2D tensor supported, please reshape.r	   r   rE   )dimr   r   r]   shaper   r   r6   rZ   r    r[   r"   )r   r   rC   r   r   output_tensorr&   r&   r'   mnnvl_moe_alltoallvb  s&   zMnnvlMoe.mnnvl_moe_alltoallvtoken_countc              
   C   sx   |   dks
J dtj|| | jd | jtdd}t| |j|j||j	|j
|||	 tj|||| jd dddS )Nr   z$2D tensor supported, please reshape.r	   r   rE   F)r   keepdim)r   r   zerosr   r   r   r6   r[   r"   rZ   r\   sumreshape)r   r   rC   r   r   r   r   r   r&   r&   r'   mnnvl_moe_alltoallv_combine~  s$   
z$MnnvlMoe.mnnvl_moe_alltoallv_combiner-   )r^   r_   r`   rc   r   ra   rd   re   r   rU   rf   rg   r
   staticmethodr   r   rq   rr   r   rv   r   r   rY   r   r   r&   r&   r&   r'   rb     s   
 
	
7	Drb   ) __doc__	functoolsdataclassesr   typesr   typingr   r   r   jitr   utilsr   rh   r
   mnnvlr   r   cacherV   rU   r   r   r.   r6   r9   r<   r?   rH   rY   rb   r&   r&   r&   r'   <module>   s   
  

	

	




	

#
