o
    پiJ                     @   s*  d dl Z d dlZd dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZmZ d d	lmZ d d
l m!Z! d dl"m#Z# erjd dl$m%Z% ddgZ&e'e(Z)G dd deZ*G dd deZ+G dd deZ,G dd deZ-dS )    N)TYPE_CHECKINGAnyListOptionalcast)
LinearBase)MoeRunnerConfig)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)BaseKVCacheMethod)QuarkLinearSchemeQuarkMoESchemeQuarkW4A4MXFP4QuarkW4A4MXFp4MoEQuarkW8A8Fp8QuarkW8A8FP8MoE)deep_compareshould_ignore_layer)UnquantizedLinearMethod)RadixAttention)get_device_capability)StandardDispatchOutputQuarkLinearMethodQuarkFusedMoEMethodc                
       s  e Zd Z			d6deeef deee  deeeef  def fddZd7ddZ	e
d	eej fddZe
d	efddZd	efddZdejjded	ed fddZe
deeef d	d fddZe
d	ee fddZd8deded	efd d!Zd"eeeef  d#eeeef  d	efd$d%Zd"eeeef  d#eeeef  d	efd&d'Zd(ed)ejjd	eeef fd*d+Zdeeef d	d,fd-d.Zdejjd(ed	d,fd/d0Zd)ejjd(ed	d1fd2d3Zd	ee fd4d5Z  Z S )9QuarkConfigNreorderquant_configkv_cache_groupkv_cache_configpack_methodc                    s>   t    |d u rg }|| _|| _|| _|| _| jd | _d S )Npacked_modules_mapping)super__init__r   r   r    r!   r"   )selfr   r   r    r!   	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/quark/quark.pyr$   )   s   
zQuarkConfig.__init__returnr   c                 C   s   t | S N)r   r%   r(   r(   r)   get_linear_method:   s   zQuarkConfig.get_linear_methodc                 C   s   t jt jgS r+   )torchfloat16bfloat16clsr(   r(   r)   get_supported_act_dtypes=   s   z$QuarkConfig.get_supported_act_dtypesc                 C      dS )NF   r(   r1   r(   r(   r)   get_min_capabilityA      zQuarkConfig.get_min_capabilityc                 C   r4   )Nquarkr(   r,   r(   r(   r)   get_nameE      zQuarkConfig.get_namelayerprefixr   c                 C   s   t tt | jd}t||| jdr&t|trt	 S t|t
r$t| S d S t|tr9| j||d}||_t| S t|t
rBt| S ddlm} t||rX| |||_t| S d S )Nexclude)ignorefused_mapping)r;   
layer_namer   )FusedMoE)r   liststrr   getr   r"   
isinstancer   r   r   QuarkKVCacheMethodget_linear_schemeschemer   ,sglang.srt.layers.moe.fused_moe_triton.layerrA   get_moe_schemer   )r%   r;   r<   exclude_layersrH   rA   r(   r(   r)   get_quant_methodH   s(   




zQuarkConfig.get_quant_methodconfigc                    s.  | d}|d u rtdttt | d}tt| d}t|dkr(d }ngt|}ttttf | d t 	 }t|}|
|sKtd fdd	|D tfd
dD sctdd  d}|d u rrtdD ]}	d |	d< qtttttf   d}
|
d urd |
d< | ||||dS )NexportzPThe export key should be included in the configurations of Quark quantized modelr   r!   r   layer_quant_configzThe Quark quantized model has the kv_cache_group parameter setting, but no kv_cache quantization settings were found in the quantization configuration.c                    s$   g | ]}t tttf  |qS r(   )r   dictrC   r   rD   ).0name)rO   r(   r)   
<listcomp>   s    z+QuarkConfig.from_config.<locals>.<listcomp>c                 3       | ]
}t | d  V  qdS r   Nr   rQ   q_config)	q_configsr(   r)   	<genexpr>   s    z*QuarkConfig.from_config.<locals>.<genexpr>zThe quantization method used for kv_cache should be the same, but the quantization method for the kv_cache layer in the config is different.output_tensorsz1The kv_cache quantization configuration is empty.z*q_proj)r   r   r    r!   )rD   
ValueErrorr   rB   rC   lensetrP   r   keysissubsetall)r2   rM   export_configr   r!   r    kv_cache_setlayer_quant_nameslayer_quant_setrX   q_proj_q_configr(   )rO   rY   r)   from_configf   sL   



zQuarkConfig.from_configc                 C      g S r+   r(   r1   r(   r(   r)   get_config_filenames   r7   z QuarkConfig.get_config_filenamesTmin_capabilityerrorc                 C   sv   t  }|d ur9d|d   krdk sJ  J |d d |d  }||k}|r7|s7tdd| dd| d|S d	S )
Nr      
   z)Quantization scheme is not supported for z!the current GPU. Min capability: z. zCurrent capability: .F)r   RuntimeError)r%   rj   rk   capability_tuple
capability	supportedr(   r(   r)   _check_scheme_supported   s    

z#QuarkConfig._check_scheme_supportedweight_quantinput_quantc                 C   sz   |d u s|d u r
dS | ddko| ddk}| d }| ddv }|r+|r+|s-dS | dr4dS | ddk}|S )	NFdtypefp8_e4m3
is_dynamicqscheme)
per_tensorper_channelTrz   )rD   )r%   rt   ru   is_fp8_dtypeis_static_weightis_per_tensor_or_channel_weightis_per_tensor_activationr(   r(   r)   _is_fp8_w8a8   s   
zQuarkConfig._is_fp8_w8a8c                 C   s  |d u s|d u rt d dS |ddks|ddkr$t d dS |ddks2|ddkr9t d dS |d	d
ksG|d	d
krNt d dS |ddu r\t d dS |ddu rjt d dS |ddksx|ddkrt d dS dS )NzHQuark model is not in MX-FP4 format: weight_quant or input_quant not setFrv   fp4z2Quark model is not in MX-FP4 format: dtype not fp4ry   	per_groupz2Quark model is not in MX-FP4 format: not per_group
group_size    z6Quark model is not in MX-FP4 format: not group_size=32rx   Tz6Quark model is not in MX-FP4 format: not weight staticz;Quark model is not in MX-FP4 format: not activation dynamicscale_formate8m0z:Quark model is not in MX-FP4 format: not scale_format e8m0)loggerdebugrD   )r%   rt   ru   r(   r(   r)   
_is_mx_fp4   s4   





zQuarkConfig._is_mx_fp4r@   modulec           
         s    dd jv r?j } fdd|D }fdd|D tfddD s;td| d	  d
d S ttttf j	d}|D ]}t

 |r\||   S qNtj}ttttf j	d}||v rw|| S ttttf j	d}	|	S )Nrn   c                    s   g | ]}  |qS r(   )replace)rQ   shard_proj_name)r@   	proj_namer(   r)   rS         
z4QuarkConfig._find_matched_config.<locals>.<listcomp>c                    s   g | ]} | qS r(   )_find_matched_config)rQ   
shard_name)r   r%   r(   r)   rS     r   c                 3   rT   rU   rV   rW   )shard_configsr(   r)   rZ   "  s    
z3QuarkConfig._find_matched_config.<locals>.<genexpr>z1Found a different quantization configuration for z in z+. vLLM requires all to use the same scheme.r   rO   layer_type_quant_configglobal_quant_config)splitr"   ra   r\   r   rP   rC   r   r   rD   fnmatchtype__name__)
r%   r@   r   shard_proj_namesshard_namesrO   name_pattern
layer_typer   r   r(   )r@   r   r   r%   r   r)   r     sJ   


z QuarkConfig._find_matched_configr   c                 C   s   | ds
| drtdttttf | d}ttttf | d}| ||r1t||S | ||rG| j	t
 dd}|rGt
||S td| d	| )
Nr[   biasPCurrently, Quark models with output_tensors and bias quantized are not supportedweightinput_tensorsF)rk   z5No quark compatible scheme was found. Weight config: z, Input config: )rD   NotImplementedErrorr   rP   rC   r   r   r   r   rs   r   r6   )r%   rM   weight_configinput_configis_fp8_w8a8_supportedr(   r(   r)   _get_scheme_from_config?  s*   

z#QuarkConfig._get_scheme_from_configc                 C   s(   |  ||}| |}| |  |S r+   )r   r   rs   r6   )r%   r;   r@   rO   rH   r(   r(   r)   rG   W  s   
zQuarkConfig.get_linear_schemer   c                 C   sp   |  ||}|ds|drtd|d}|d}| ||r)t||S | ||r4t||S td)Nr[   r   r   r   r   zUnsupported FusedMoe scheme)r   rD   r   r   r   r   r   ro   )r%   r   r@   rO   r   r   r(   r(   r)   rJ   f  s   



zQuarkConfig.get_moe_schemec                 C   rh   r+   r(   r,   r(   r(   r)   get_scaled_act_names}  r:   z QuarkConfig.get_scaled_act_names)NNr   )r*   r   )T)!r   
__module____qualname__rP   rC   r   r   rB   r$   r-   classmethodr.   rv   r3   intr6   r9   nnModulerL   rg   ri   boolrs   r   r   r   r   rG   rJ   r   r   __classcell__r(   r(   r&   r)   r   '   s    



B

3

.

r   c                   @   s   e Zd ZdefddZdejjddfddZdejjd	e	d
e
e	 de	de	dejfddZ	ddejjdejdeej fddZdS )r   quantization_configc                 C   
   || _ d S r+   r   r%   r   r(   r(   r)   r$        
zQuarkLinearMethod.__init__r;   r*   Nc                 C      |j | d S r+   rH   process_weights_after_loadingr%   r;   r(   r(   r)   r        z/QuarkLinearMethod.process_weights_after_loadinginput_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec           	   	   K   s(   | d}|jj|||||||d dS )z
        Use the QuarkLinearScheme associated with the layer to create
        the necessary parameters for the layer. See LinearMethodBase for param
        details
        weight_loader)r;   r   r   r   r   r   r   N)rD   rH   create_weights)	r%   r;   r   r   r   r   r   extra_weight_attrsr   r(   r(   r)   r     s   

z QuarkLinearMethod.create_weightsxr   c                 C   s&   |j }|du rtd|j|||dS )z
        Use the output of create_weights and the QuarkLinearScheme
        associated with the layer to apply the forward pass with the
        layer input.  See LinearMethodBase for param details

        N'A scheme must be defined for each layer)r   rH   r\   apply_weights)r%   r;   r   r   rH   r(   r(   r)   apply  s   zQuarkLinearMethod.applyr+   )r   r   r   r   r$   r.   r   r   r   r   rB   rv   r   Tensorr   r   r(   r(   r(   r)   r     s0    
c                
   @   s   e Zd ZdefddZdejjddfddZdejjd	e	d
e	de	dej
f
ddZdejjdefddZdejjddfddZdS )r   r   c                 C   r   r+   r   r   r(   r(   r)   r$     r   zQuarkFusedMoEMethod.__init__r;   r*   Nc                 C   r   r+   r   r   r(   r(   r)   r     r   z1QuarkFusedMoEMethod.process_weights_after_loadingnum_expertshidden_sizeintermediate_size_per_partitionr   c                 K   s"   |j jd|||||d| dS )z
        Use the QuarkMoEScheme associated with the layer to create
        the necessary parameters for the layer. See FusedMoEMethodBase for param
        details
        )r;   r   r   r   r   Nr(   )rH   r   )r%   r;   r   r   r   r   r   r(   r(   r)   r     s   
z"QuarkFusedMoEMethod.create_weightsmoe_runner_configc                 C   s   |j || d S r+   )rH   create_moe_runner)r%   r;   r   r(   r(   r)   r     s   z%QuarkFusedMoEMethod.create_moe_runnerdispatch_outputr   c                 C   s"   |j }|du rtd|||S )z
        Use the output of create_weights and the QuarkMoEScheme
        associated with the layer to apply the forward pass with the
        fused MoE layer. See FusedMoEMethodBase for param details

        Nr   r   )r%   r;   r   rH   r(   r(   r)   r     s   zQuarkFusedMoEMethod.apply)r   r   r   r   r$   r.   r   r   r   r   rv   r   r   r   r   r(   r(   r(   r)   r     s0    

c                       sD   e Zd ZdZdef fddZedeee	e
f  fddZ  ZS )rF   zK
    Supports loading kv-cache scaling factors from quark checkpoints.
    r   c                    s   |  |j t | d S r+   )validate_kv_cache_configr    r#   r$   )r%   r   r&   r(   r)   r$     s   zQuarkKVCacheMethod.__init__r    c                 C   sP   | du rdS |  d}|dkrtd| |  d}|dkr&td| dS )z
        Validator for the kv cache configuration. Useful for controlling the
        kv cache quantization schemes, that are being supported in vLLM
        :param kv_cache_config: the quark kv cache scheme
        Nrv   rw   zNCurrently supported kv cache quantization is dtype=fp8_e4m3, however received ry   rz   zhOnly support per-tensor scaling factor for quark KV cache. Expected qscheme: per_tensor, found qscheme: )rD   r   )r    rv   ry   r(   r(   r)   r     s"   

z+QuarkKVCacheMethod.validate_kv_cache_config)r   r   r   __doc__r   r$   staticmethodr   rP   rC   r   r   r   r(   r(   r&   r)   rF     s
    $rF   ).r   loggingtypingr   r   r   r   r   r.   sglang.srt.layers.linearr   sglang.srt.layers.moer   *sglang.srt.layers.quantization.base_configr	   r
   r   r   'sglang.srt.layers.quantization.kv_cacher   ,sglang.srt.layers.quantization.quark.schemesr   r   r   r   r   r   *sglang.srt.layers.quantization.quark.utilsr   r   &sglang.srt.layers.quantization.unquantr   !sglang.srt.layers.radix_attentionr   sglang.srt.utilsr   &sglang.srt.layers.moe.token_dispatcherr   __all__	getLoggerr   r   r   r   r   rF   r(   r(   r(   r)   <module>   s.    
  \45