o
    پi6                     @  s  d dl mZ d dlZd dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ erdd dlmZ d dlmZ d dlmZm Z m!Z!m"Z" ddgZ#e$e%Z&G dd deZ'dddZ(G dd deZ)dS )    )annotationsN)TYPE_CHECKINGAnyDictListOptional)Module)	Parameter)FusedMoEMethodBaseQuantizationConfigQuantizeMethodBase)Fp8LinearMethod)UnquantizedLinearMethod)is_layer_skipped)set_weight_attrs)MoeRunnerConfig)	DeepEPMoE)CombineInputDeepEPLLDispatchOutputDeepEPNormalDispatchOutputStandardDispatchOutputstaticdynamicc                      s   e Zd ZdZ							d-d. fddZed/ddZed0ddZed1ddZed2dd Z	ed3d#d$Z
d4d)d*Zd2d+d,Z  ZS )5W4AFp8Configz(Config class for MIXED_PRECISION W4AFp8.Tr   r   N   is_checkpoint_fp8_serializedboolis_checkpoint_w4afp8_serializedlinear_activation_schemestrmoe_activation_schemeignored_layersOptional[List[str]]weight_block_sizeOptional[List[int]]
group_sizeintreturnNonec                   sd   t    || _|| _|rtd |tvrtd| || _|| _	|p&g | _
ddg| _|| _d S )Nz,Detected w4afp8 checkpoint. Please note thatzUnsupported activation scheme r   )super__init__r   r   loggerwarningACTIVATION_SCHEMES
ValueErrorr   r    r!   r#   r%   )selfr   r   r   r    r!   r#   r%   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/w4afp8.pyr*   &   s   





zW4AFp8Config.__init__c                 C     dS )Nw4afp8r2   clsr2   r2   r3   get_name=      zW4AFp8Config.get_nameList[torch.dtype]c                 C  s   t jt jgS N)torchbfloat16float8_e4m3fnr6   r2   r2   r3   get_supported_act_dtypesA   s   z%W4AFp8Config.get_supported_act_dtypesc                 C  r4   )NZ   r2   r6   r2   r2   r3   get_min_capabilityE   r9   zW4AFp8Config.get_min_capability	List[str]c                 C     g S r;   r2   r6   r2   r2   r3   get_config_filenamesI   r9   z!W4AFp8Config.get_config_filenamesconfigDict[str, Any]c                 C  s@   |  |dg}d|v }d|v }d}d}ddg}| |||||dS )Nquant_methodfp8r5   r   r   r   )r   r   r   r    r#   )get_from_keys)r7   rE   rG   r   r   r   r    r#   r2   r2   r3   from_configM   s   zW4AFp8Config.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  sR   ddl m} ddlm} t||rt|| jrt S t| S t||r't	| S d S )Nr   )
LinearBase)FusedMoE)
sglang.srt.layers.linearrO   &sglang.srt.layers.moe.fused_moe_tritonrP   
isinstancer   r!   r   r   W4AFp8MoEMethod)r/   rK   rM   rO   rP   r2   r2   r3   get_quant_method]   s   

zW4AFp8Config.get_quant_methodc                 C  rC   r;   r2   )r/   r2   r2   r3   get_scaled_act_namesk   s   z!W4AFp8Config.get_scaled_act_names)TTr   r   NNr   )r   r   r   r   r   r   r    r   r!   r"   r#   r$   r%   r&   r'   r(   )r'   r   )r'   r:   )r'   r&   )r'   rB   )rE   rF   r'   r   )rK   rL   rM   r   r'   rN   )__name__
__module____qualname____doc__r*   classmethodr8   r?   rA   rD   rJ   rU   rV   __classcell__r2   r2   r0   r3   r   #   s,    
r   scalestorch.Tensorr'   c                 C  sx   | j }|d d dkrdnd}| |d |d |d | |}|dddd}||d |d | |d | }| S )zCInterleave scales in groups of 4 similar to TRT-LLM implementation.      r         )shapereshapepermute
contiguous)r]   s_shape	alignmentscales_interleavedr2   r2   r3   interleave_scaleso   s   rj   c                   @  sR   e Zd Zd&ddZd'ddZd(ddZd)ddZd*ddZd+d d!Zd,d#d$Z	d%S )-rT   quant_configr   c                 C  s
   || _ d S r;   )rk   )r/   rk   r2   r2   r3   r*      s   
zW4AFp8MoEMethod.__init__rK   r   num_expertsr&   hidden_sizeintermediate_size_per_partitionparams_dtypetorch.dtypec                 K  sj  ddl m} d|v sJ tjjtj||d |d tjddd}|d| t|| tjjtj|||d tjddd}	|d	|	 t|	| |	d
|j
ji tjjtj|d| || jj tjddd}
|d|
 t|
| tjjtj|||| jj tjddd}|d| t|| tjjtj|dftjddd}|d| t|| tjjtj|tjddd}|d| t|| |jj}tj|df||tjd| _tj|dfd| |tjd| _tj|df||tjd| _tj|df||tjd| _| j| _| j| _| j| _| j| _tj|d tj|d| _tj|dftj|d| _ tj|dftj|d| _!d S )Nr   )FusedMoeWeightScaleSupportedweight_loaderr_   )dtypeFrequires_grad
w13_weight	w2_weightrG   w13_weight_scale_invw2_weight_scale_invw13_input_scalew2_input_scalerb   )devicers   ra   rs   r|   )"rR   rq   r<   nnr	   emptyint8register_parameterr   updateGROUPvaluezerosrk   r%   float32onesr=   rv   r|   fullint64
a_strides1
c_strides1
a_strides2
c_strides2
b_strides1s_strides13
b_strides2
s_strides2int32expert_offsetsproblem_sizes1problem_sizes2)r/   rK   rl   rm   rn   ro   extra_weight_attrsrq   rv   rw   w13_weight_scalew2_weight_scalerz   r{   r|   r2   r2   r3   create_weights   s   		
	


	

	


zW4AFp8MoEMethod.create_weightsr'   r(   c           
      C  s   t j}|jj}|j|}t|}t|dd|_|j|}t|}t|dd|_|j	
 t j }t j|gt j|d}t|dd|_	|j
 t j }t j|gt j|d}	t|	dd|_d S )NFrt   r}   )r<   r=   rw   r|   rx   torj   r	   ry   rz   maxr   itemtensorr{   )
r/   rK   rs   r|   r   r   w13_input_scale_maxnew_w13_input_scalew2_input_scale_maxnew_w2_input_scaler2   r2   r3   process_weights_after_loading  s(   
z-W4AFp8MoEMethod.process_weights_after_loadingrL   moe_runner_configr   c                 C  s
   || _ d S r;   )r   )r/   rK   r   r2   r2   r3   create_moe_runner  s   
z!W4AFp8MoEMethod.create_moe_runnerdispatch_outputr   r   c                 C  s   ddl m} ddlm} |j}|j}|\}}}	|||j|j|j|j	||| j
| j| j| j| j| j| j| j| j| j| j|j|j| jjpBdd}
||
dS )Nr   )cutlass_w4a8_moe)StandardCombineInputg      ?)routed_scaling_factor)hidden_states)&sglang.srt.layers.moe.cutlass_w4a8_moer   &sglang.srt.layers.moe.token_dispatcherr   r   topk_outputrv   rw   rx   ry   r   r   r   r   r   r   r   r   r   r   r   rz   r{   r   r   )r/   rK   r   r   r   xr   topk_weightstopk_ids_outputr2   r2   r3   apply#  s:   


zW4AFp8MoEMethod.applyr   r   r^   c           	      C  s   ddl m} |\}}}}}}|||j|j|j|j|||jj|jj|jj	|jj
|jj|jj|jj|jj|jj|jj|jj|j|j}|S )Nr   )cutlass_w4a8_moe_deepep_ll)r   r   rv   rw   rx   ry   rG   r   r   r   r   r   r   r   r   r   r   r   rz   r{   )	r/   rK   r   r   r   r   r   masked_mr   r2   r2   r3   apply_deepep_llI  s2   zW4AFp8MoEMethod.apply_deepep_llr   c                 C  s   ddl m} |j|j|j}}}t|tr|d }|jd }|dkrK|||j|j	|j
|j||| j| j| j| j| j| j| j| j| j| j| j|j|jS |S )Nr   )cutlass_w4a8_moe_deepep_normal)r   r   r   r   r   rS   tuplerc   rv   rw   rx   ry   r   r   r   r   r   r   r   r   r   r   r   rz   r{   )r/   rK   r   r   r   topk_idxr   
num_tokensr2   r2   r3   apply_deepep_normall  s@   


z#W4AFp8MoEMethod.apply_deepep_normalN)rk   r   )
rK   r   rl   r&   rm   r&   rn   r&   ro   rp   )rK   r   r'   r(   )rK   rL   r   r   )rK   r   r   r   r'   r   )rK   r   r   r   r'   r^   )rK   r   r   r   r'   r^   )
rW   rX   rY   r*   r   r   r   r   r   r   r2   r2   r2   r3   rT      s    


}


&#rT   )r]   r^   r'   r^   )*
__future__r   loggingtypingr   r   r   r   r   r<   torch.nnr   torch.nn.parameterr	   *sglang.srt.layers.quantization.base_configr
   r   r   "sglang.srt.layers.quantization.fp8r   &sglang.srt.layers.quantization.unquantr   $sglang.srt.layers.quantization.utilsr   sglang.srt.utilsr   sglang.srt.layers.moer   "sglang.srt.layers.moe.ep_moe.layerr   r   r   r   r   r   r-   	getLoggerrW   r+   r   rj   rT   r2   r2   r2   r3   <module>   s(    

L