o
    پixV                     @  s  d dl mZ d dlmZmZmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ermd dl'm(Z(m)Z) e Z*e! Z+e  Z,e" Z-edo~e+Z.e.rd dl/m0Z0 d dl1m2Z2 d dl3m4Z4 e-rd dl5m6Z6 zd dl7m8Z9 W n e:y   dZ9Y nw G dd deZ;G dd deZ<G dd deeZ=dS )    )annotations)TYPE_CHECKINGListOptionalN)	Parameter)CPUQuantMethod!_amx_process_weight_after_loading)	MoeRunnerMoeRunnerBackendMoeRunnerConfigget_moe_runner_backend)TritonMoeQuantInfo)FusedMoEMethodBaseLinearMethodBaseQuantizeMethodBase)MultiPlatformOp)	cpu_has_amx_supportget_bool_env_varis_cpuis_hipis_npunext_power_of_2set_weight_attrsuse_intel_amx_backenduse_intel_xpu_backend)CombineInputStandardDispatchOutputSGLANG_USE_AITER)ActivationType)	fused_moe)shuffle_weight)npu_format_cast)cutlass_fused_moec                   @  s2   e Zd ZdZdddZ	ddddZdddZdS )UnquantizedEmbeddingMethodz"Unquantized method for embeddings.layertorch.nn.Moduleinput_size_per_partitionintoutput_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec           	      K  sF   t tjt|||ddd}t|ddd |d| t|| dS )	z#Create weights for embedding layer.dtypeFrequires_grad   r   	input_dim
output_dimweightNr   torchemptysumr   register_parameter	selfr$   r&   r(   r*   r+   r,   extra_weight_attrsr6    r?   Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/unquant.pycreate_weightsD   s   z)UnquantizedEmbeddingMethod.create_weightsNxtorch.TensorbiasOptional[torch.Tensor]returnc                 C  s   t ||j|S N)Flinearr6   )r=   r$   rB   rD   r?   r?   r@   apply[   s   z UnquantizedEmbeddingMethod.applyinput_c                 C  s   t ||jS rG   )rH   	embeddingr6   )r=   r$   rK   r?   r?   r@   rL   c   s   z$UnquantizedEmbeddingMethod.embeddingr$   r%   r&   r'   r(   r)   r*   r'   r+   r'   r,   r-   rG   r$   r%   rB   rC   rD   rE   rF   rC   )r$   r%   rK   rC   rF   rC   )__name__
__module____qualname____doc__rA   rJ   rL   r?   r?   r?   r@   r#   A   s    
r#   c                   @  s2   e Zd ZdZdddZdddZ	ddddZdS )UnquantizedLinearMethodz#Linear method without quantization.r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   c           	      K  sF   t tjt|||ddd}t|ddd |d| t|| d S )Nr.   Fr0   r2   r   r3   r6   r7   r<   r?   r?   r@   rA   j   s   
z&UnquantizedLinearMethod.create_weightsrF   Nonec                 C  s    t rtrt|dg d S d S d S )Nr6   )_is_cpu_is_cpu_amx_availabler   )r=   r$   r?   r?   r@   process_weights_after_loading   s   z5UnquantizedLinearMethod.process_weights_after_loadingNrB   rC   rD   rE   c                 C  sx   t |r4|j}t|dkr|d|jd }tjj||j|d}t|dkr2||d |d d}|S t	
||j|S )N   Tr   r2   )r   shapelenviewr8   ops
sgl_kernelweight_packed_linearr6   rH   rI   )r=   r$   rB   rD   x_shapesoutputr?   r?   r@   rJ      s   zUnquantizedLinearMethod.applyrM   r$   r%   rF   rT   rG   rN   )rO   rP   rQ   rR   rA   rW   rJ   r?   r?   r?   r@   rS   g   s    

rS   c                      s   e Zd ZdZ	d,d- fddZ	d.d/ddZd0ddZd1ddZed2ddZ	d3d d!Z
d3d"d#Zd3d$d%Zd3d&d'Zd3d(d)Zd4d*d+ZeZ  ZS )5UnquantizedFusedMoEMethodz MoE method without quantization.Fuse_triton_kernelsbooluse_flashinfer_trtllm_moec                   s6   t    t  | _|| _d| _|| _ti | _	d S )NF)
super__init__r   is_flashinfer_cutlassuse_flashinfer_cutlassrd   	with_biasrf   dict_cache_permute_indices)r=   rd   rf   	__class__r?   r@   rh      s   
z"UnquantizedFusedMoEMethod.__init__r$   r%   num_expertsr'   hidden_sizeintermediate_size_per_partitionr,   r-   rk   c                 K  s2  || _ |jjrd| n|}||}	}
| jr|
|	}	}
tjjtj||	|
|ddd}|d| t	|| | j rQtjjtj||tj
ddd}|d| t	|| ||}}| jr^||}}tjjtj||||ddd}|d| t	|| | j rtjjtj||tj
ddd}|d| t	|| d S d S )	N   r.   Fr0   
w13_weightw13_weight_bias	w2_weightw2_weight_bias)rk   moe_runner_configis_gatedrd   r8   nnr   r9   r;   r   float32)r=   r$   rp   rq   rr   r,   rk   r>   
w13_up_dimw13_weight_nw13_weight_krt   ru   w2_weight_nw2_weight_krv   rw   r?   r?   r@   rA      sP   







z(UnquantizedFusedMoEMethod.create_weightsrF   rT   c                 C  sP  t ot  }|r/tjjt|jjddd|_tj	
  tjjt|jjddd|_tj	
  tr:tr:t|ddg | jr
ddlm}m}m} d}d}|jjd j}|jjd j}	d }
d }t|jD ]}|| j|jj| tj|}|jj|  tj||jjj  }|| j|jj| tj|}|jj|  tj||jjj  }||tj|}||tj|}|tjj}
|tjj}|tj ||jj|< |tj |	|jj|< qc|jjj|jg|
R  |j_|jjj|jg|R  |j_t r&d	D ]}t!||}|j"d
d|_t#|j|_qd S )N)   r   Fr0   rt   rv   r   )'_maybe_get_cached_w3_w1_permute_indicesconvert_to_block_layout!get_w2_permute_indices_with_cache   )rt   rv   r2   rs   )$
_use_aiterr   is_autor8   rz   r   r    rt   datacudaempty_cacherv   rU   rV   r   rf   flashinfer.fused_moe.corer   r   r   rZ   rangenum_local_expertsrm   r\   uint8clonetodevice
contiguousbfloat16reshape_is_npugetattr	transposer!   )r=   r$   _should_use_aiter_moer   r   r   epilogue_tile_mblock_kold_shape_w13old_shape_w2new_shape_w13new_shape_w2ipermute_indicestmp_weights1tmp_weights2weight_namer6   r?   r?   r@   rW      s   







z7UnquantizedFusedMoEMethod.process_weights_after_loadingrx   r   c                 C  s(   || _ | jr	tjntj}t||| _d S rG   )rx   rd   r
   TRITON_KERNELSTRITONr	   runner)r=   r$   rx   backendr?   r?   r@   create_moe_runnerA  s   z+UnquantizedFusedMoEMethod.create_moe_runnerc                 C  s   | j S rG   )rj   )r=   r?   r?   r@   load_up_proj_weight_firstL  s   z3UnquantizedFusedMoEMethod.load_up_proj_weight_firstdispatch_outputr   r   c                 C  s   | j ||dS )N)r$   r   )forward)r=   r$   r   r?   r?   r@   rJ   Q  s   zUnquantizedFusedMoEMethod.applyc                 C  s  ddl m} |j}|j}| j}| jj}| r6ddlm	} ||j
|jt|dd t|dd d}	| j||	S | jr^t||j|j|j
|j|jd |j|j|j|jt|jd dd }
||
dS todt  }|r|jrnJ d	|\}}}|jr| d
ksJ d|j\}}|dksJ d|||j }t j!|t j"d}t#||j
|j|||j$dkrt%j&nt%j'|j(d}
||
dS t)|j
|jt|dd t|dd d}	| j||	S )Nr   StandardCombineInput)TritonKernelsQuantInforu   rw   )rt   rv   w13_biasw2_bias)inputtoken_selected_expertstoken_final_scalesfc1_expert_weightsfc2_expert_weightsoutput_dtypequant_scalesep_sizeep_ranktp_sizetp_ranktune_max_num_tokenshidden_statesunsupportedrs   z4`topk_weights` should be in shape (num_tokens, topk)r2   z?Only support topk=1 when `apply_router_weight_on_input` is Truer.   silu)
activationexpert_maskrt   rv   b13b2)*&sglang.srt.layers.moe.token_dispatcherr   r   topk_outputrx   r   runner_backendis_triton_kernels/sglang.srt.layers.moe.moe_runner.triton_kernelsr   rt   rv   r   runrj   flashinfer_cutlass_fused_moetopk_idstopk_weightsr/   moe_ep_sizemoe_ep_rankmoe_tp_sizemoe_tp_rankr   rZ   r   r   r   
no_combineapply_router_weight_on_inputdimr   r8   	ones_liker{   r   r   r   SiluGeluexpert_mask_gpur   )r=   r$   r   r   rB   r   rx   r   r   
quant_infora   r   r   r   _topkr?   r?   r@   forward_cuda[  s   









z&UnquantizedFusedMoEMethod.forward_cudac                 C  s   ddl m} |j}|j}| j}|jdksJ d|j dt|rPddlm} |\}}	}
||j	||\}}t
jj||j|j||	dtjd d d d d d}||d	S dd
lm} |||||}||d	S )Nr   r   r   activation =  is not supported.)apply_topk_weights_cpuFTr   )moe_forward_native)r   r   r   r   rx   r   r   sglang.srt.layers.moe.topkr   r   r8   r]   r^   fused_experts_cpurt   rv   r   UNQUANT&sglang.srt.layers.moe.fused_moe_nativer   )r=   r$   r   r   rB   r   rx   r   r   r   r   ra   r   r?   r?   r@   forward_cpu  sJ   


z%UnquantizedFusedMoEMethod.forward_cpuc                 C  s   ddl m} |j}|j}| j}|jdv sJ d|j d| jj}t rKddl	m
} |\}	}
}|||j|j|	|
t|dd t|dd |jd	}||d
S | sQJ |jdks_J d|j dt|j|jt|dd t|dd d}| j||S )Nr   r   )r   gelur   r   )fused_expertsru   rw   )b1r   r   r   r   zS is not supported             for Triton PATH, please set ENV SGLANG_USE_SGL_XPU=1.r   )r   r   r   r   rx   r   r   r   r   r^   r   rt   rv   r   	is_tritonr   r   )r=   r$   r   r   rB   r   rx   r   r   r   r   r   ra   r   r?   r?   r@   forward_xpu  sB   






z%UnquantizedFusedMoEMethod.forward_xpuc              
   C  s  dd l }ddlm} |j}|j\}}}|j}	|jd }
||j}|tj	}|j
}|j}|
| }tjd|tj	|jd|ddd }|j||||
d\}}}|||}|tj}| jre|jgnd }| jrn|jgnd }|j|g|jg|ddd||	dd }| jjd	krdd
lm} |||}n| jjdkr||}nddlm} | |}|j|g|j g|ddd||	dd }|j!|d d d |||d}||dS )Nr   r   )r/   r   rY   r2   )row_idx
expert_idx
active_numrs   )rB   r6   rD   
split_itemgroup_list_type
group_type
group_listr   npu_swiglu_oai)
swiglu_oair   )
GeluAndMul)skip1skip2rD   scalesexpanded_src_to_dst_rowexport_for_source_rowr   )"	torch_npur   r   r   r   r/   rZ   r   r8   int32rp   top_karanger   r\   permuter   npu_moe_init_routingnpu_moe_compute_expert_tokensint64rk   ru   rw   npu_grouped_matmulrt   rx   r   $sgl_kernel_npu.activation.swiglu_oair   
npu_swiglusglang.srt.layers.activationr   rv   npu_moe_finalize_routing)r=   r$   r   r   r   rB   r   r   r   original_dtype
num_tokensrp   r   row_idx_lenr   r   expanded_row_idxexpanded_expert_idxexpert_tokensr   r   r   r   final_hidden_statesr?   r?   r@   forward_npu  s   
	
	

z%UnquantizedFusedMoEMethod.forward_npuc                 O  s   t d)Nz/The TPU backend currently does not support MoE.)NotImplementedError)r=   argskwargsr?   r?   r@   forward_tpuh  s   z%UnquantizedFusedMoEMethod.forward_tpu)FF)rd   re   rf   re   )F)r$   r%   rp   r'   rq   r'   rr   r'   r,   r-   rk   re   rb   )r$   r%   rx   r   )rF   re   )r$   r%   r   r   rF   r   )rF   r   )rO   rP   rQ   rR   rh   rA   rW   r   propertyr   rJ   r   r   r   r  r  forward_native__classcell__r?   r?   rn   r@   rc      s"    
:
_



S
2
0
Xrc   )>
__future__r   typingr   r   r   r8   torch.nn.functionalrz   
functionalrH   torch.nn.parameterr   sglang.srt.layers.amx_utilsr   r   sglang.srt.layers.moer	   r
   r   r   'sglang.srt.layers.moe.moe_runner.tritonr   *sglang.srt.layers.quantization.base_configr   r   r   sglang.srt.layers.utilsr   sglang.srt.utilsr   r   r   r   r   r   r   r   r   r   r   r   rV   _is_hiprU   r   r   aiterr   aiter.fused_moer   aiter.ops.shuffler    %sglang.srt.hardware_backend.npu.utilsr!   flashinfer.fused_moer"   r   ImportErrorr#   rS   rc   r?   r?   r?   r@   <module>   s@    ,&4