o
    پi{                     @  s  d dl mZ d dlZd dlmZ d dlmZmZmZ d dl	Z	d dl
Z
d dlmZmZmZmZ d dlmZmZ d dlmZmZmZ d dlmZmZ d d	lmZ er\d d
lmZ d dlmZ d dl m!Z! zd dl"m#Z$ W n e%yu   dZ$Y nw e Z&e&rd dl'm(Z( e)e*Z+e \Z,Z-dZ.dZ/dZ0dZ1g dZ2dZ3eG dd dZ4			ddddZ5	ddd#d$Z6	%	ddd'd(Z7	%ddd*d+Z8dd/d0Z9dd1d2Z:dd5d6Z;dd8d9Z<	:ddd?d@Z=ddCdDZ>ddEdFZ?ddGdHZ@ddIdJZAddMdNZBdOdP ZCddTdUZDddVdWZEddXdYZFdd\d]ZGdd_d`ZHddadbZIdcdd ZJdedf ZKddldmZLde3fddydzZMde3fdd{d|ZNG d}d~ d~eZOG dd deZPdddZQeeQddddZRdddZSeeSddddZTdS )    )annotationsN)	dataclass)TYPE_CHECKINGAnyOptional)BasevLLMParameterChannelQuantScaleParameterGroupQuantScaleParameterPackedvLLMParameter)LinearMethodBaseQuantizationConfig)get_scalar_types	pack_colsunpack_cols)get_device_capabilityis_cuda)register_custom_op
LinearBase)FusedMoE)get_forward_context)_custom_ops)gptq_marlin_gemm   @          r   r   Tc                   @  sF   e Zd ZU ded< ded< ded< ded< ded	< d
ed< d
ed< dS )MarlinLinearLayerConfigztuple[int, int]full_weight_shapepartition_weight_shape
ScalarTypeweight_typetorch.dtypeact_typeint
group_sizeboolzero_points	has_g_idxN)__name__
__module____qualname____annotations__ r/   r/   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/marlin_utils.pyr   A   s   
 r   has_zpOptional[bool]include_fp_typer(   device_capabilityOptional[int]c           	      C  s   |d u rt  \}}|d | }|d u rdn|}|dk rg S | d u r1td||}td||}|| S | r7tjgS tjtjg}|rG|tjtjg7 }|S )N
   r   P   FT)r   "query_marlin_supported_quant_typesscalar_typesuint4uint4b8	uint8b128float8_e4m3fnfloat4_e2m1f)	r1   r3   r4   majorminor
capabilitytypes0types1resr/   r/   r0   r8   O   s(   
r8   
quant_typer"   r'   returntuple[bool, Optional[str]]c                 C  s   |d u rt  \}}|d | }|d u rdn|}t|d|}| |vr5dd|  d| d| d| d	| d
fS |d u s=|tvrHdd| dt dfS dS )Nr6   r   TFz&Marlin does not support weight_bits = z. Only types = z! are supported (for group_size = z, device_capability = z, zp = z).z%Marlin does not support group_size = z. Only group_sizes = z are supported.TN)r   r8   MARLIN_SUPPORTED_GROUP_SIZES)rE   r'   r1   r4   r?   r@   rA   supported_typesr/   r/   r0   _check_marlin_supporteds   s4   
rK   Fr&   c                 C  s   t | |||\}}|S N)rK   )rE   r'   r1   r4   cond_r/   r/   r0   check_marlin_supported   s   rO   Nonec                 C  s,   t | ||\}}|s|d usJ t|d S rL   )rK   
ValueError)rE   r'   r1   rM   err_msgr/   r/   r0   verify_marlin_supported   s
   rS   output_size_per_partitioninput_size_per_partition
input_sizec                 C  sv   | t  dkrtd|  dt  d|t dkr"td| dt d||k r7|| dkr9td| d| dd S d S )Nr   #Weight output_size_per_partition = z% is not divisible by  min_thread_n = zM. Consider reducing tensor_parallel_size or running with --quantization gptq."Weight input_size_per_partition = z$ is not divisible by min_thread_k = " is not divisible by group_size = )GPTQ_MARLIN_MIN_THREAD_NrQ   GPTQ_MARLIN_MIN_THREAD_KrT   rU   rV   r'   r/   r/   r0   verify_marlin_supports_shape   s.   	r]   c              
   C  sF   z
t | ||| W dS  ty" } zd| fW  Y d }~S d }~ww )NFrH   )r]   rQ   __str__)rT   rU   rV   r'   er/   r/   r0   check_marlin_supports_shape   s   r`   layerr   c                 C  s:   t | dd p| j}t | dd p| j}t||| j|dd S )NrT   rU   r\   r   )getattroutput_sizerV   r`   )ra   r'   rT   rU   r/   r/   r0   check_marlin_supports_layer   s   rd   r   c                 C  sX   | j }| j}| jj }| jjdk}|d dko|td| dk}|dv }|o+|o+|o+|S )Nsilur   r   r   r   )hidden_sizeintermediate_size_per_partitionmoe_runner_configapply_router_weight_on_input
activationmax)ra   r'   rf   rg   supports_router_weightsupports_activationsupports_shapesupports_group_sizer/   r/   r0   check_moe_marlin_supports_layer   s   
rp      devicetorch.devicemax_blocks_per_smtorch.Tensorc                 C  s&   t j| j}t j|| t j| ddS )NF)dtyperr   requires_grad)torchcudaget_device_propertiesmulti_processor_countzerosr&   )rr   rt   smsr/   r/   r0   marlin_make_workspace  s   r~   	act_orderis_row_parallelc                 C  s   |  p| o| S rL   r/   )r   r   r/   r/   r0   marlin_is_k_full  s   r   c                 C  s   |dk}| p	|o	|S Nr   r/   )r   r'   r   is_channelwiser/   r/   r0   !marlin_repeat_scales_on_all_ranks  s   r   c                 C     t jjt jdt j| dddS Nr   )rv   rr   Frw   rx   nn	Parameteremptyr&   rr   r/   r/   r0   marlin_make_empty_g_idx     r   c                 C  r   r   r   r   r/   r/   r0   marlin_make_empty_zp"  r   r   g_idx!tuple[torch.Tensor, torch.Tensor]c                 C  s   t | t j}| | |fS rL   )rx   argsorttor&   )r   g_idx_sort_indicesr/   r/   r0   marlin_sort_g_idx(  s   r   c                    s`   g } t dD ] |  fddt dD  qg }t dD ] | fdddD  q| |fS )N   c                   s   g | ]} d |  qS )r   r/   .0jir/   r0   
<listcomp>0      z#get_scale_perms.<locals>.<listcomp>   c                   s   g | ]}d   | qS )   r/   r   r   r/   r0   r   3  r   )r   rq   r   	   r            )rangeextend)
scale_permscale_perm_singler/   r   r0   get_scale_perms-  s   r   ssize_ksize_nc                 C  sn   t  \}}||k r|dkr| dt|fd d |f } n| dt|fd d |f } | d|f } | S r   )r   reshapelen
contiguous)r   r   r   r'   r   r   r/   r/   r0   marlin_permute_scales7  s   
 r   c                 C  s<   | j }t \}}| dt|fd d |f } | j|  S r   )shaper   r   r   r   )r   origin_shaperN   r   r/   r/   r0   marlin_permute_biasE  s   
r   c                 C  X   | j d }tj|| j d | j d f| j| jd}t|D ]}t| | |||||< q|S Nr   rq   r   rr   rv   )r   rx   r   rr   rv   r   r   )r   r   r   r'   num_expertsoutputr_   r/   r/   r0   marlin_moe_permute_scalesL  s   
r   zpnum_bitsc                 C  s   t  \}}| dt|fd d |f } |dkr tg d}n|dkr,tg d}ntd|| dt|fd d |f  } | d|f } t	| |||} | S )Nr   r   r   r   r      rq            r   r   r   rq   r   num_bits must be 4 or 8, got {})
r   r   r   numpyarray	Exceptionformatravelr   r   )r   r   r   r   r   rN   
interleaver/   r/   r0   marlin_zero_points^  s   
"r   q_zp_packedc                 C  s   t | |||}|dkrttg d}n|dkr%ttg d}ntd||dt|fd d |f  }|d|f	 }t
||||}|S )Nr   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   q_zpundo_interleave	marlin_zpr/   r/   r0   awq_to_marlin_zero_pointsu  s   "r   c                 C  r   r   )r   rx   r   rr   rv   r   r   )r   r   r   r   r   r   r_   r/   r/   r0   moe_awq_to_marlin_zero_points  s   
r   c                 C  sF   t j rd S t j| }|d dk r|t jkr!td d S d S d S )Nr   r   zYou are running Marlin kernel with bf16 on GPUs before SM90. You can consider change to fp16 to achieve better performance if possible.)rx   compileris_dynamo_compilingry   r   bfloat16logger	info_once)rr   rv   r4   r/   r/   r0   maybe_warn_marlin_atomic_add  s   
r   c                   C  s   t j rd S 	 d S rL   )rx   r   r   r   r   r/   r/   r/   r0    maybe_warn_marlin_atomic_add_env  s   
r   mnkrv   r$   c                 C  sT   |dks|dk s|j dkrdS 	 tj|}|d dk r(|tjkr(t|| dS dS )Ni   ry   Fr   r   T)typer   rx   ry   r   r   r   )r   r   r   rr   rv   r4   r/   r/   r0   should_use_atomic_add_reduce  s   
r   inputweightweight_scale	weight_zpr   	workspacewtype	is_k_fullbiasOptional[torch.Tensor]use_fp32_reducec                 C  s   |  d| jd }| jd d |f }t|d||d| j| jd}t }|d u rDt|d ||d ||||||jd ||	|
||dd}nt||||||||j	||	|
||dd}|d ur`|
| | |S )Nr   r   rq   r   r   r   rr   rv   Fsize_mr   r   r   use_atomic_addr   is_zp_floatr   r   r   r   r   r   r   wtype_idrT   rU   r   r   r   r   )r   r   r   sizerr   rv   r   r   )unified_apply_gptq_marlin_gemm_with_wtypeidadd_)r   r   r   r   r   r   r   r   rT   rU   r   r   r   
reshaped_x	out_shaper   forward_contextr   r/   r/   r0   apply_gptq_marlin_linear  sb   

r   c                 C  s   |  d| jd }| jd d |f }t|d||d| j| jd}t }|d u rCt|d ||d ||||||jd ||	||dd}nt|||||||||	||dd}|
d ur\|	|
 | |S )Nr   r   rq   r   Fr   r   r   r   r   r   r   r   r   r   r   r   r   rT   rU   r   r   r   )
r   r   r   r   rr   rv   r   r   unified_apply_gptq_marlin_gemmr   )r   r   r   r   r   r   r   rE   rT   rU   r   r   r   r   r   r   r   r/   r/   r0   apply_awq_marlin_linear  s\   

r   c                      s   e Zd ZdZd% fdd	Zd&ddZed&ddZed'ddZed(ddZ	ed)ddZ
ed*ddZed+ddZd,d#d$Z  ZS )-MarlinConfigz^Config class for Marlin.

    Reference: https://github.com/IST-DASLab/marlin/tree/master
    r'   r&   lm_head_quantizedr(   rF   rP   c                   sb   t    || _|| _| jdkr| jdkrtd| j d| _d| _d| _d| _d| _	d| _
d S )Nr   r   zcCurrently, only group size 128 and -1 (channelwise) is supported for Marlin, but got group_size of r   r   r   i   )super__init__r'   r   rQ   pack_factor	tile_sizemin_n_threadsmin_k_threadsmax_parallelperm_len)selfr'   r   	__class__r/   r0   r   `  s   

zMarlinConfig.__init__strc                 C  s   d| j  d| j dS )NzMarlinConfig(group_size=z, lm_head_quantized=))r'   r   )r  r/   r/   r0   __repr__  s   
zMarlinConfig.__repr__c                 C     dS )Nmarlinr/   clsr/   r/   r0   get_name  s   zMarlinConfig.get_namelist[torch.dtype]c                 C  s   t jgS rL   )rx   halfr
  r/   r/   r0   get_supported_act_dtypes  s   z%MarlinConfig.get_supported_act_dtypesc                 C  r  )Nr7   r/   r
  r/   r/   r0   get_min_capability  s   zMarlinConfig.get_min_capability	list[str]c                 C  s   dgS )Nzquantize_config.jsonr/   r
  r/   r/   r0   get_config_filenames  s   z!MarlinConfig.get_config_filenamesconfigdict[str, Any]'MarlinConfig'c                 C  s*   |  |dg}| j|dgdd}| ||S )Nr'   lm_headF)default)get_from_keysget_from_keys_or)r  r  r'   r   r/   r/   r0   from_config  s   
zMarlinConfig.from_configOptional[str]c                 C  sd   | ddkp| dd}|d u p|dkp|dk}|r0|r0d|  |  }t| |  S d S )Ncheckpoint_formatr	  is_marlin_formatFgptqz6The model is serialized in {} format. Using {} kernel.)getr   r  r   info)r  hf_quant_cfg
user_quantr  is_valid_user_quantmsgr/   r/   r0   override_quantization_method  s    

z)MarlinConfig.override_quantization_methodra   torch.nn.ModuleprefixOptional[MarlinLinearMethod]c                 C  s>   ddl m} ddlm} t||st||r| jrt| S d S )Nr   r   )ParallelLMHead)sglang.srt.layers.linearr   *sglang.srt.layers.vocab_parallel_embeddingr)  
isinstancer   MarlinLinearMethod)r  ra   r'  r   r)  r/   r/   r0   get_quant_method  s   
zMarlinConfig.get_quant_method)r'   r&   r   r(   rF   rP   )rF   r  )rF   r  )rF   r&   )rF   r  )r  r  rF   r  )rF   r  )ra   r&  r'  r  rF   r(  )r+   r,   r-   __doc__r   r  classmethodr  r  r  r  r  r%  r.  __classcell__r/   r/   r  r0   r   Z  s"    
$r   c                   @  s<   e Zd ZdZdddZdddZdddZ	d d!ddZdS )"r-  z_Linear method for Marlin.

    Args:
        quant_config: The Marlin quantization config.
    quant_configr   c                 C  s
   || _ d S rL   )r2  )r  r2  r/   r/   r0   r     s   
zMarlinLinearMethod.__init__ra   r&  rU   r&   output_partition_sizes	list[int]rV   rc   params_dtyper$   c              	   K  s  ~|d }|t jkrtd| t|}	|	| jj dkr*td|	 d| jj d|	| jj dkr?td|	 d| jj d|| jj dkrTtd| d	| jj d| jjd
kro|| jj dkrotd| d| jj d| jj	| jj
d  }
|	|
 dkrtdtt j|| jj
 |	| jj
 | jj dt jdddd| jj| jj
|d}| jjd
krdn|| jj }t j||	d|d|d}|dkrtdddi|}n
tdddd|}|	| jj | jj }tt j|dt jd|d}|d| |d| |d| d S )Nweight_loaderz*The params dtype must be float16, but got r   rW   z% is not divisible by min_n_threads = .z# is not divisible by pack_factor = rX   z% is not divisible by min_k_threads = r   rY   r   z2Each permutation group must reside on the same gpury   r   rq   )data	input_dim
output_dim
packed_dimpacked_factormarlin_tile_sizer6  )r8  r6  r:  )r:  r9  Br   r   r/   )rx   float16rQ   sumr2  r   r   r   r'   r  r   r
   r   int32r   r	   r   r   r|   r&   register_parameter)r  ra   rU   r3  rV   rc   r5  extra_weight_attrsr6  rT   num_tiles_per_permqweightinput_groupsweight_scale_argsscalesmax_workspace_sizer   r/   r/   r0   create_weights  s   




	
z!MarlinLinearMethod.create_weightsrF   rP   c                 C  sF   t jj|jjdd|_t jj|jjdd|_t jj|jjdd|_d S )NFr   )rx   r   r   r>  r8  r   r   )r  ra   r/   r/   r0   process_weights_after_loading=  s   z0MarlinLinearMethod.process_weights_after_loadingNxru   r   r   c              	   C  s   |j }|j}|j}|d|jd }|jd }|jd }	|jd }
t||||||
|	}||jd d |jd f }|d urE|| |S )Nr   r   rq   )r>  r   r   viewr   opsmarlin_gemmr   )r  ra   rL  r   rE  rH  r   x_2dr   r   r   	output_2dr   r/   r/   r0   applyC  s   


 
zMarlinLinearMethod.apply)r2  r   )ra   r&  rU   r&   r3  r4  rV   r&   rc   r&   r5  r$   )ra   r&  rF   rP   rL   )ra   r&  rL  ru   r   r   rF   ru   )r+   r,   r-   r/  r   rJ  rK  rR  r/   r/   r/   r0   r-    s    


p
r-  r   r   c                 C  s   | j | jd |f| jdS Nr   )rv   	new_emptyr   rv   r   r/   r/   r0   #fake_unified_apply_gptq_marlin_gemm_  s   rV  )	fake_implc                 C  s<   t  j}|j}t| d ||d |||||| jd |||	|
|dS )Nr   r   )r   r2  rE   r   r   )r   r   r   r   r   r   r   rT   rU   r   r   r   r2  rE   r/   r/   r0   r   r  s(   r   r   c                 C  s   | j | jd |f| jdS rS  rT  r   r/   r/   r0   .fake_unified_apply_gptq_marlin_gemm_with_wtype  s   rX  c                 C  sr   d }t tD ]}|ds tt|}t|dr |j|kr |} nqt| d ||d |||||| jd ||	|
|||dS )NrN   r   r   r   )dirr9   
startswithrb   hasattrr   r   r   )r   r   r   r   r   r   r   r   rT   rU   r   r   r   r   r   	attr_namestr/   r/   r0   r     s6   

r   )NTN)r1   r2   r3   r(   r4   r5   rL   )
rE   r"   r'   r5   r1   r(   r4   r5   rF   rG   )FN)
rE   r"   r'   r&   r1   r(   r4   r5   rF   r(   )F)rE   r"   r'   r&   r1   r(   rF   rP   )
rT   r&   rU   r&   rV   r&   r'   r&   rF   rP   )
rT   r&   rU   r&   rV   r&   r'   r&   rF   rG   )ra   r   r'   r&   rF   r(   )ra   r   r'   r&   rF   r(   )rq   )rr   rs   rt   r&   rF   ru   )r   r(   r   r(   rF   r(   )r   r(   r'   r&   r   r(   rF   r(   )rr   rs   rF   ru   )r   ru   rF   r   )
r   ru   r   r&   r   r&   r'   r&   rF   ru   )r   ru   rF   ru   )r   ru   r   r&   r   r&   r'   r&   )
r   ru   r   r&   r   r&   r   r&   rF   ru   )
r   ru   r   r&   r   r&   r   r&   rF   ru   )r   ru   r   r&   r   r&   r   r&   )r   r&   r   r&   r   r&   rr   rs   rv   r$   rF   r(   )r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   r   r"   rT   r&   rU   r&   r   r(   r   r   r   r(   rF   ru   )r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   rE   r"   rT   r&   rU   r&   r   r   r   r(   rF   ru   )r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   rT   r&   rU   r&   r   r(   r   r(   r   r(   rF   ru   )r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   r   ru   r   r&   rT   r&   rU   r&   r   r(   r   r(   r   r(   r   r(   rF   ru   )U
__future__r   loggingdataclassesr   typingr   r   r   r   rx   sglang.srt.layers.parameterr   r   r	   r
   *sglang.srt.layers.quantization.base_configr   r   $sglang.srt.layers.quantization.utilsr   r   r   sglang.srt.utilsr   r   sglang.srt.utils.custom_opr   r*  r   ,sglang.srt.layers.moe.fused_moe_triton.layerr   0sglang.srt.compilation.piecewise_context_managerr   vllmr   rN  ImportError_is_cudasglang.jit_kernel.gptq_marlinr   	getLoggerr+   r   r"   r9   GPTQ_MARLIN_TILErZ   r[   GPTQ_MARLIN_MAX_PARALLELrI   USE_FP32_REDUCE_DEFAULTr   r8   rK   rO   rS   r]   r`   rd   rp   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r-  rV  r   rX  r   r/   r/   r/   r0   <module>   s   

(&
	
$




	









%RCi 

$