o
    پi"                     @   s   U d dl Z d dlmZ d dlmZ d dlZd dlZd dlZd dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d dlmZ e eZd	Zd
ZdejfddZG dd dZG dd dZG dd deZG dd deZG dd deZ e  a!ee e"d< dd Z#defddZ$dd Z%dS )     N)ABC)Optional)ModelConfig)get_attention_dp_rankget_dp_local_infois_dp_attention_enabled)ReqToTokenPool)ForwardBatch)get_global_server_argsi   @i   tc                 C   s   t | j| jj S N)npprodshapedtypeitemsize)r    r   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/routed_experts_capturer.pyget_tensor_size_bytes   s   r   c                   @   sR   e Zd Zdedededededdfdd	Zd
d ZdedejfddZ	dd Z
dS )_RoutedExpertsDeviceCachemax_running_requestsnum_hidden_layersnum_experts_per_toknum_fused_shared_expertsdevicereturnNc                 C   s<   t jtt jt j |||| ft j|d| _|   d S )N)r   r   )	torchzerosmaxr
   chunked_prefill_sizedp_sizeint32buffer_finalize_allocation_log)selfr   r   r   r   r   r   r   r   __init__   s   	z"_RoutedExpertsDeviceCache.__init__c                 C      t | dsJ t| jS Nr"   hasattrr   r"   r$   r   r   r   get_buffer_size_bytes5      
z/_RoutedExpertsDeviceCache.get_buffer_size_byteslayer_idtopk_idsc                 C   s6   |d usJ d|j \}}|| jd ||d d f< d S )Nz/capturing routing experts but get layer_id None)r   r"   )r$   r-   r.   batch_r   r   r   capture_fwd_routed_experts9   s   
z4_RoutedExpertsDeviceCache.capture_fwd_routed_expertsc                 C   s2   |   t }tdt| jj d|dd dS )ICommon logging and memory usage computation for captured experts buffers.z1Routing experts device buffer allocated. #shape: , size: .2fz MBN)r+   _MBloggerinfotupler"   r   )r$   buffer_size_MBr   r   r   r#   >   s   z2_RoutedExpertsDeviceCache._finalize_allocation_log)__name__
__module____qualname__intstrr%   r+   r   Tensorr1   r#   r   r   r   r   r      s"    
r   c                   @   sP   e Zd ZdedededdfddZdd	 Zd
edejdejfddZdd Z	dS )_RoutedExpertsHostCache
num_tokensr   r   r   Nc                 C   s.   || _ tj|||ftjddd| _|   d S )NcpuT)r   r   
pin_memory)rA   r   r   r!   r"   r#   )r$   rA   r   r   r   r   r   r%   G   s   
z _RoutedExpertsHostCache.__init__c                 C   r&   r'   r(   r*   r   r   r   r+   Z   r,   z-_RoutedExpertsHostCache.get_buffer_size_bytesr-   loctop_kc                 C   s"   |j ddd| j||d d f< d S )NrB   T)r   non_blocking)tor"   )r$   r-   rD   rE   r   r   r   set_experts_buffer^   s   "z*_RoutedExpertsHostCache.set_experts_bufferc                 C   s,   |   t }td| j d|dd dS )r2   z0Routing experts host buffer allocated. #tokens: r3   r4   z GBN)r+   _GBr6   r7   rA   )r$   buffer_size_GBr   r   r   r#   a   s   z0_RoutedExpertsHostCache._finalize_allocation_log)
r:   r;   r<   r=   r%   r+   r   r?   rH   r#   r   r   r   r   r@   F   s    
r@   c                   @   s   e Zd ZededededededefddZd	e	d
edefddZ
dedejfddZdededefddZdd Zdd Zdd ZdS )RoutedExpertsCapturerenablemodel_configr   rA   r   r   c                 C   s   | rt |||||dS t S )N)rA   r   r   r   )_RoutedExpertsCapturerReal_RoutedExpertsCapturerNoop)rL   rM   r   rA   r   r   r   r   r   createj   s   	zRoutedExpertsCapturer.createforward_batchcan_run_graphcuda_graph_batchc                 C      t r   NotImplementedErrorr$   rQ   rR   rS   r   r   r   _sync_fwd_experts_buffer_DtoH~      z3RoutedExpertsCapturer._sync_fwd_experts_buffer_DtoHr-   r.   c                 C   rT   r   rU   r$   r-   r.   r   r   r   capture      zRoutedExpertsCapturer.capturereq_pool_idxseqlenreq_to_token_poolc                 C   rT   r   rU   r$   r]   r^   r_   r   r   r   get_routed_experts   rY   z(RoutedExpertsCapturer.get_routed_expertsc                 C   rT   r   rU   rW   r   r   r   on_forward_end   r\   z$RoutedExpertsCapturer.on_forward_endc                 C   rT   r   rU   r*   r   r   r   get_host_cache   r\   z$RoutedExpertsCapturer.get_host_cachec                 C   rT   r   rU   r*   r   r   r   get_device_cache   r\   z&RoutedExpertsCapturer.get_device_cacheN)r:   r;   r<   staticmethodboolr   r=   r>   rP   r	   rX   r   r?   r[   r   ra   rb   rc   rd   r   r   r   r   rK   i   sB    

rK   c                
   @   s   e Zd ZdZdededededef
ddZd	ed
e	defddZ
dedejfddZdededefddZdd Zdd Zdd ZdS )rN   z,Capturer for routed experts with host bufferrM   rA   r   r   r   c                 C   sL   || _ |jj| _|jj| _t|| j| jd| _t|| j| j| j |d| _d S )N)rA   r   r   )r   r   r   r   r   )r   hf_text_configr   r   r@   
host_cacher   device_cache)r$   rM   rA   r   r   r   r   r   r   r%      s   

z#_RoutedExpertsCapturerReal.__init__rQ   rR   rS   c                 C   s~   t  rt|\}}|rt | }|| }n|| }nd}|jjd }|j }| jj||d d d | jf  | j	j|< d S )Nr   )
r   r   r   out_cache_locr   rB   ri   r"   r   rh   )r$   rQ   rR   rS   local_start_poslocal_num_tokenslocal_end_posout_cache_loc_cpur   r   r   rX      s   



z8_RoutedExpertsCapturerReal._sync_fwd_experts_buffer_DtoHr-   r.   c                 C   s   | j || d S r   )ri   r1   rZ   r   r   r   r[      s   z"_RoutedExpertsCapturerReal.capturer]   r^   r_   c                 C   s,   |j | d |d    }|  j| S )N   )req_to_tokenrB   clonerc   r"   )r$   r]   r^   r_   cache_pool_idxr   r   r   ra      s   z-_RoutedExpertsCapturerReal.get_routed_expertsc                 C   s   | j |||d d S )N)rQ   rR   rS   )rX   rW   r   r   r   rb      s
   
z)_RoutedExpertsCapturerReal.on_forward_endc                 C      | j S r   )rh   r*   r   r   r   rc         z)_RoutedExpertsCapturerReal.get_host_cachec                 C   rs   r   )ri   r*   r   r   r   rd      rt   z+_RoutedExpertsCapturerReal.get_device_cacheN)r:   r;   r<   __doc__r   r=   r>   r%   r	   rf   rX   r   r?   r[   r   ra   rb   rc   rd   r   r   r   r   rN      s>    


rN   c                   @   sl   e Zd Zdd ZdededefddZded	ej	fd
dZ
dededefddZdd Zdd Zdd ZdS )rO   c                 C      d S r   r   r*   r   r   r   r%      r\   z#_RoutedExpertsCapturerNoop.__init__rQ   rR   rS   c                 C   rv   r   r   rW   r   r   r   rX      rY   z8_RoutedExpertsCapturerNoop._sync_fwd_experts_buffer_DtoHr-   r.   c                 C   rv   r   r   rZ   r   r   r   r[      r\   z"_RoutedExpertsCapturerNoop.capturer]   r^   r_   c                 C   rv   r   r   r`   r   r   r   ra      rY   z-_RoutedExpertsCapturerNoop.get_routed_expertsc                 C   rv   r   r   rW   r   r   r   rb     r\   z)_RoutedExpertsCapturerNoop.on_forward_endc                 C   rv   r   r   r*   r   r   r   rc     r\   z)_RoutedExpertsCapturerNoop.get_host_cachec                 C   rv   r   r   r*   r   r   r   rd   	  r\   z+_RoutedExpertsCapturerNoop.get_device_cacheN)r:   r;   r<   r%   r	   rf   r=   rX   r   r?   r[   r   ra   rb   rc   rd   r   r   r   r   rO      s(    

rO   _global_expert_capturerc                   C   s   t S r   rw   r   r   r   r   get_global_experts_capturer  r\   ry   capturerc                 C   s   | a d S r   rx   )rz   r   r   r   set_global_experts_capturer  s   r{   c                 C   s0   | d  dd }tjt|dtjd}|S )N	meta_inforouted_expertszutf-8)r   )getr   
frombufferpybase64	b64decodeencoder!   )datarouted_experts_base64r}   r   r   r   %extract_routed_experts_from_meta_info  s
   r   )&loggingabcr   typingr   numpyr   r   r   sglang.srt.configs.model_configr   sglang.srt.layers.dp_attentionr   r   r    sglang.srt.mem_cache.memory_poolr   ,sglang.srt.model_executor.forward_batch_infor	   sglang.srt.server_argsr
   	getLoggerr:   r6   rI   r5   r?   r   r   r@   rK   rN   rO   rw   __annotations__ry   r{   r   r   r   r   r   <module>   s0   
 
)#2Q!