o
    پi4K                     @  s  d dl mZ d dlZd dlmZmZmZmZmZ d dl	Z
d dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZmZ d dlm Z m!Z! d dl"m#Z#m$Z$ e%e&Z'ersd dl(m)Z)m*Z* dddZ+G dd deZ,dddZ-G dd deZ.dS )    )annotationsN)TYPE_CHECKINGAnyDictListOptional)get_tensor_model_parallel_rank)get_tp_group)	MoeRunnerMoeRunnerBackendMoeRunnerConfig)TritonMoeQuantInfo)	AWQConfig)FusedMoEMethodBaseQuantizationConfigQuantizeMethodBase)
GPTQConfigGPTQMarlinConfig)UnquantizedFusedMoEMethodUnquantizedLinearMethod)get_device_capabilityset_weight_attrs)CombineInputStandardDispatchOutputnum_bitsintc           	        s"  g }t dD ]N}g }|d }dD ]0}d|d  d|d  d d|d d  d|d d  d fD ]}|d| | d|   q0qt dD ] | fdd	|D  qEqt|}| dkrftg d
}n| dkrrtg d}ntd| |dt|fd d |f 	 }t
|}|S )N       )r         r         c                   s   g | ]}|d    qS )    ).0pjr#   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/moe_wna16.py
<listcomp>2   s    z#get_weight_perm.<locals>.<listcomp>)r   r   r      r            )r   r   r   r+   znum_bits must be 4 or 8, got {})rangeappendextendnparray	Exceptionformatreshapelenraveltorch
from_numpy)	r   	perm_listiperm1colblockrowperm
interleaver#   r&   r(   get_weight_perm$   s0   

"
rC   c                      s   e Zd ZdZd- fddZed.ddZed/ddZed0ddZed1ddZ	d1ddZ
ed2dd Zed3d"d#Zed4d%d&Zd5d+d,Z  ZS )6MoeWNA16Configz6Config class for MOE WNA16 (W8A16/W4A16) quantization.linear_quant_methodstrweight_bitsr   
group_sizehas_zpboollm_head_quantizedmodules_to_not_convertOptional[List[str]]full_configDict[str, Any]returnNonec                   s   t    || _|| _|| _d| j | _|| _|| _|| _d| _	| jdkr,t
|| _	n0| jdkrXt }|d u r:dn	|d d |d  }	t }
|	|
k rWtd	|
 d
|	 dntd|d u reg | _d S || _d S )Nr!   Fgptqawqr.   r   
   r   zbThe quantization method moe_wna16 + awq is not supported for the current GPU. Minimum capability: z. Current capability: .$moe_wna16 only support gptq and awq.)super__init__rG   rH   rI   bit8_pack_factorrK   rE   rN   
use_marlinr   is_gptq_marlin_compatibler   r   get_min_capability
ValueErrorrL   )selfrE   rG   rH   rI   rK   rL   rN   capability_tupledevice_capabilityawq_min_capability	__class__r#   r(   rX   E   s>   





zMoeWNA16Config.__init__c                 C     dS N	moe_wna16r#   clsr#   r#   r(   get_names      zMoeWNA16Config.get_nameList[torch.dtype]c                 C  s   t jt jgS N)r9   bfloat16halfrg   r#   r#   r(   get_supported_act_dtypesw   s   z'MoeWNA16Config.get_supported_act_dtypesc                 C  rd   )NF   r#   rg   r#   r#   r(   r\   {   rj   z!MoeWNA16Config.get_min_capability	List[str]c                 C  s   dgS )Nzquantize_config.jsonr#   rg   r#   r#   r(   get_config_filenames   s   z#MoeWNA16Config.get_config_filenamesc                 C  s   t rl   )NotImplementedError)r^   r#   r#   r(   get_scaled_act_names   s   z#MoeWNA16Config.get_scaled_act_namesconfigc                 C  s   |  |dg}|  |dg}|  |dg}| j|dgdd}|dkr-|  |dg }g }n|d	krA|  |d
g}| |dgd }ntd| |||||||S )Nquant_methodbitsrH   lm_headF)defaultrR   symrS   
zero_pointrL   rV   )get_from_keysget_from_keys_orr]   )rh   ru   rv   rG   rH   rK   rI   rL   r#   r#   r(   from_config   s,   zMoeWNA16Config.from_configOptional[str]c                 C  s   |dkr|  |r|  S d S re   )is_moe_wna16_compatibleri   )rh   hf_quant_cfg
user_quantr#   r#   r(   override_quantization_method   s   z+MoeWNA16Config.override_quantization_methodquant_configc           
      C  s   | dd }| d}| d}t }tdd |D r dn	|d d	 |d
  }t }|dko8| o8|dv }|dkoD|dkoD||k}	|pH|	S )Nrv    rw   desc_actc                 s  s    | ]}|d u V  qd S rl   r#   )r$   
capabilityr#   r#   r(   	<genexpr>       z9MoeWNA16Config.is_moe_wna16_compatible.<locals>.<genexpr>r.   r   rT   r   rR   r   r!   rS   r   )getlowerr   allr   r\   )
rh   r   rv   r   r   r_   r`   ra   gptq_compatibleawq_compatibler#   r#   r(   r      s    

z&MoeWNA16Config.is_moe_wna16_compatiblelayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  s   ddl m} ddlm} t|| jrt||rt S t S t||rQ| j	dkr>| j
r4t| j||S t| j||S | j	dkrMt| j||S tdt||rZt| S d S )Nr   )
LinearBase)FusedMoErR   rS   rV   )sglang.srt.layers.linearr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   is_layer_skipped_quantrL   
isinstancer   r   rE   rZ   r   r~   rN   get_quant_methodr   r   r]   MoeWNA16Method)r^   r   r   r   r   r#   r#   r(   r      s2   




zMoeWNA16Config.get_quant_method)rE   rF   rG   r   rH   r   rI   rJ   rK   rJ   rL   rM   rN   rO   rP   rQ   )rP   rF   )rP   rk   )rP   r   )rP   rq   )ru   rO   rP   rD   )rP   r   )r   rO   )r   r   r   rF   rP   r   )__name__
__module____qualname____doc__rX   classmethodri   ro   r\   rr   rt   r~   r   r   r   __classcell__r#   r#   rb   r(   rD   B   s&    .
rD   r   rF   rL   rq   c                   s   t  fdd|D S )Nc                 3  s    | ]}| v V  qd S rl   r#   )r$   module_namer   r#   r(   r      r   z)is_layer_skipped_quant.<locals>.<genexpr>)any)r   rL   r#   r   r(   r      s   r   c                   @  sD   e Zd ZdZdddZdddZdddZd ddZedd Z	dS )!r   zLinear method for MOE WNA16 (W8A16/W4A16) quantization.

    Args:
        quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
    r   rD   c                 C  s
   || _ d S rl   )r   )r^   r   r#   r#   r(   rX      s   
zMoeWNA16Method.__init__r   r   num_expertsr   hidden_sizeintermediate_size_per_partitionparams_dtypetorch.dtypec                 K  s  ddl m} | j|_| jj}| jj}	d}
||	 s||	 r2|	d }	|
d9 }
|	dks*J ||	 s||	 s|	|_|
|_|jj}||dd d|v sJJ |d }t	
||}||d< tjjtj|d| || tjd	dd
}|d| t|| tjjtj|||| tjd	dd
}|d| t|| tjjtj|d| ||	 |d	dd
}|d| t|| tjjtj||||	 |d	dd
}|d| t|| | jjrtjjtj|d| | ||	 tjd	dd
}|d| t|| tjjtj||| ||	 tjd	dd
}|d| t|| | jjdkrHddg}| jjs)|ddg7 }|D ]}tjjtjdtjd	dd
}||| t|| q+d S d S )Nr   )FusedMoeWeightScaleSupportedr   r   r   F)rv   is_transposedweight_loader)dtype)requires_gradw13_qweight
w2_qweight
w13_scales	w2_scales
w13_qzeros	w2_qzerosrR   	w13_g_idxw2_g_idx)r   )&sglang.srt.layers.moe.fused_moe_tritonr   r   rY   rH   group_size_div_factorGROUPvalueupdater   get_weight_loaderr9   nn	Parameteremptyuint8register_parameterr   zerosrI   rE   int32)r^   r   r   r   r   r   extra_weight_attrsr   rY   rH   r   strategyr   wrapped_weight_loaderr   r   r   r   r   r   invalid_param_keyskeyparamr#   r#   r(   create_weights   s   		
	
	
	


	
	

zMoeWNA16Method.create_weightsmoe_runner_configr   c                 C  s   || _ ttj|| _d S rl   )r   r
   r   TRITONrunner)r^   r   r   r#   r#   r(   create_moe_runneri  s   z MoeWNA16Method.create_moe_runnerdispatch_outputr   rP   r   c                 C  sv   | j jdks
J d| jj}| jj}t|j|j|dk|dk|j|j	|r&|j
nd |r,|jnd d|jgd	}| j||S )Nsiluz"Only SiLU activation is supported.r   r!   r   )	
w13_weight	w2_weightuse_int4_w4a16use_int8_w8a16	w13_scalew2_scalew13_zpw2_zpblock_shape)r   
activationr   rG   rI   r   r   r   r   r   r   r   rH   r   run)r^   r   r   rG   rI   
quant_infor#   r#   r(   applyo  s"   zMoeWNA16Method.applyc                   s(   dd  dd d fdd}|S )Nc                 S  s   |  d}| tj} tjddgtj| jd}| d d d d d f |? d@ } g d}| ddd d |f } | |d} | j } |dkr_| d d d	d d
f d | d d d d d
f  } | S |dkr{| d	d d
d d f d | d d d
d d f  } | S )Nr   r   r   device   )r   r   r   r,   r   r*   r+   r-   r.   r!   qweightr   r   r    qzeros)sizeviewr9   r   tensorr   T
contiguous)r   tensor_typesize0shifterreverse_awq_pack_orderr#   r#   r(   convert_awq_tensor  s   


00z<MoeWNA16Method.get_weight_loader.<locals>.convert_awq_tensorc                 S  s~   |  tj} tjddgtj| jd}| d d d d d f |? d@ } | d } | d d d d df | d d d d df d  } | S )Nr   r   r   r   r   r    )r   r9   r   r   r   )r   r   r#   r#   r(   convert_gptq_int4_qzeros  s   0zBMoeWNA16Method.get_weight_loader.<locals>.convert_gptq_int4_qzerosr   torch.nn.Parameterloaded_weighttorch.Tensorweight_namerF   shard_id	expert_idr   c           	        s  d|v rd S j jsd|v rd S t j}t }||}j}j jdkrEj jdks-J d|v r7 |d}nId|v rA |d}n?|j	}n;j jdkrj jd	v sSJ d|v ra|j	
 tj}nd|v r}|tj}j jdkrw|j	}n	|j	d
 }n|j	}jd
krd|v sd|v r|jd
}d|v r|jd|d
| }|dkr|| j|d |d f< d S || j||d d f< d S d|v r||djdd d |f | j|< d S | |||| d S )Ng_idxr   rS   r   weightr   r   rR   r   r   scalesr   r.   w1r   r   r   )r   rI   r	   r   r   tor   rE   rG   r   r   r   r9   r   r   repeat_interleavemoe_tp_sizer   data)	r   r   r   r   r   r   tp_rank
shard_sizer   r   r   r   r   r#   r(   moe_wna16_weight_loader  s^   


zAMoeWNA16Method.get_weight_loader.<locals>.moe_wna16_weight_loader)
r   r   r   r   r   rF   r   rF   r   r   r#   )r   r   r   r#   r   r(   r     s   &Az MoeWNA16Method.get_weight_loaderN)r   rD   )
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r   r   r   r   rP   r   )
r   r   r   r   rX   r   r   r   staticmethodr   r#   r#   r#   r(   r      s    


z
r   )r   r   )r   rF   rL   rq   )/
__future__r   loggingtypingr   r   r   r   r   numpyr2   r9   sglang.srt.distributedr   %sglang.srt.distributed.parallel_stater	   sglang.srt.layers.moer
   r   r   'sglang.srt.layers.moe.moe_runner.tritonr   "sglang.srt.layers.quantization.awqr   *sglang.srt.layers.quantization.base_configr   r   r   #sglang.srt.layers.quantization.gptqr   r   &sglang.srt.layers.quantization.unquantr   r   sglang.srt.utilsr   r   	getLoggerr   logger&sglang.srt.layers.moe.token_dispatcherr   r   rC   rD   r   r   r#   r#   r#   r(   <module>   s,   

 
 