o
    پi>                     @   s6  d dl Z d dlmZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZmZ erZd dlmZ e Zer|d dl m!Z!m"Z" d dl#m$Z$ d dl%m&Z& dej'(dj)v Z*e +e,Z-ddefddZ.G dd deZ/G dd deZ0dS )    N)TYPE_CHECKINGAnyDictListOptional)tqdm)EMA)get_tensor_model_parallel_rank)pack_int4_to_int32quantize_fp8_scale_tensorwisequantize_int4_scale_columnwise)MoeRunnerConfig)FusedMoEMethodBaseQuantizationConfigQuantizeMethodBase)Fp8LinearMethod)
BAR_FORMATis_hipset_weight_attrs)DispatchOutput)ActivationType	QuantType)	fused_moe)shuffle_weightgfx950cudatqdm_barc                 C   s\   d| _ |d ur
|| _| jrd S d| _|   | _| _t| j| _	t| j| _
t| j| _d S )Nr   )ntotaldisablelast_print_n_timelast_print_tstart_tr   	smoothing_ema_dn_ema_dt_ema_miniters)r   r    r(   d/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/quark_int4fp8_moe.pytqdm_reset_no_print'   s   r*   c                   @   s   e Zd ZdZ		ddedefddZedee	j
 fd	d
ZedefddZedefddZedee fddZedeeef dd fddZde	jjdeded fddZdee fddZdS )QuarkInt4Fp8ConfigzConfig class for Quark Quantization.

    - Weight: static, per-channel, symmetric
    - Activation: dynamic, per-token, symmetric
    Fdynamicis_checkpoint_fp8_serializedactivation_schemec                 C   sX   || _ || _|dkrtdd | _d| _t }dt  }tdd| |tdd| _	d S )Nr,   z=QuarkInt4Fp8Config only supports activation_scheme='dynamic'.r      z.Online quark_int4fp8_moe quantization on rank=       @)r   descposition
bar_formatmininterval)
r-   r.   NotImplementedErrorweight_block_sizenum_quant_layersr	   r   _get_free_posr   online_quant_progress_bar)selfr-   r.   tp_rankr2   r(   r(   r)   __init__;   s"   zQuarkInt4Fp8Config.__init__returnc                 C   s   t jt jgS N)torchfloat16bfloat16clsr(   r(   r)   get_supported_act_dtypesX   s   z+QuarkInt4Fp8Config.get_supported_act_dtypesc                 C      dS )NF   r(   rB   r(   r(   r)   get_min_capability\      z%QuarkInt4Fp8Config.get_min_capabilityc                 C   rE   )Nquark_int4fp8_moer(   r:   r(   r(   r)   get_name`   rH   zQuarkInt4Fp8Config.get_namec                 C      g S r>   r(   rB   r(   r(   r)   get_config_filenamesd   rH   z'QuarkInt4Fp8Config.get_config_filenamesconfigc                 C   s   |  S r>   r(   )rC   rN   r(   r(   r)   from_configh   s   zQuarkInt4Fp8Config.from_configlayerprefixr   c                 C   s@   ddl m} ddlm} t||rt| S t||rt| S d S )Nr   )
LinearBase)FusedMoE)sglang.srt.layers.linearrR   &sglang.srt.layers.moe.fused_moe_tritonrS   
isinstancer   QuarkInt4Fp8MoEMethod)r:   rP   rQ   rR   rS   r(   r(   r)   get_quant_methodl   s   

z#QuarkInt4Fp8Config.get_quant_methodc                 C   rL   r>   r(   rJ   r(   r(   r)   get_scaled_act_names}   s   z'QuarkInt4Fp8Config.get_scaled_act_namesN)Fr,   )__name__
__module____qualname____doc__boolstrr<   classmethodr   r?   dtyperD   intrG   rK   rM   r   r   rO   nnModuler   rX   rY   r(   r(   r(   r)   r+   4   s6    

r+   c                
   @   s   e Zd ZdZdd Zdd Zdejjde	de	d	e	d
ej
f
ddZdejjddfddZdejjdefddZdejjdddejfddZdS )rW   zMoE method for INT4FP8.

    Supports loading BF16/FP16 checkpoints, quantizing down to INT4, and dequantizing to FP8 during inference.

    Args:
        quant_config: The quantization config.
    c                 C   s(   || _ | j j| _t | _tstdd S )NzOThe quark_int4fp8_moe online quantization scheme is only supported on AMD GPUs.)quant_configr9   r	   r;   _is_hipr5   )r:   re   r(   r(   r)   r<      s   
zQuarkInt4Fp8MoEMethod.__init__c              
      s0   dt jjdt jdtdtdtf
 fdd}|S )Nparamloaded_weightweight_nameshard_id	expert_idc                    s<  |dv rj }nj} j} js1d _|dv r%d}|||j |}nd}|||j |}|| j}t|\}}	t|\}
}t	|
}
||	 }|dv r|dkr[t
d|}d}n	t
|d| }d}| | | j|
jkspJ  j| | j|jks}J  j| | j|jksJ  j| | |  j| | j|	jksJ  j| | j|	jksJ  j| | |	 nQ| | j|
jksJ | | j|
jksJ  j| j|jksJ  j| j|jksJ  j| |  j| j|	jksJ  j| j|	jksJ  j| |	 | |
|||d | _jd d S )N)w1w3Tr   r/   rl      )rj   ri   rk   )w13_shard_sizew2_shard_sizeuse_presharded_weightsnarrowr;   todevicer   r   r
   slicera   w13_int4_scaleshapecopy_w13_fp8_scalew2_int4_scalew2_fp8_scaler9   update)rg   rh   ri   rj   rk   
shard_sizeoriginal_use_presharded_weights	shard_dim_	fp8_scaleint4_w
int4_scaleshard_sliceidxrP   original_weight_loaderr:   r(   r)   online_int4_fp8_weight_loader   sl   
	zNQuarkInt4Fp8MoEMethod.get_weight_loader.<locals>.online_int4_fp8_weight_loader)r?   rc   	ParameterTensorr_   rb   )r:   rP   r   r   r(   r   r)   get_weight_loader   s   ^z'QuarkInt4Fp8MoEMethod.get_weight_loaderrP   num_expertshidden_sizeintermediate_size_per_partitionparams_dtypec                 K   s  ddl m} || _|| _d|v sJ |d}| ||}	|	|d< tj}tjj	tj
|d| |d |ddd}
tjj	tj
|||d |ddd}|d	|
 t|
| |d
| t|| tjj	tj|dtjddd}tjj	tj|tjddd}|d| |d| trtjj	tj|d| tjddd}tjj	tj||tjddd}|d| |d| |d|jji t|| t|| |d|jji t|| t|| d }|d| d }|d| | jj|d  }t| j|d d S )Nr   )FusedMoeWeightScaleSupportedweight_loaderrn      )ra   Frequires_grad
w13_weight	w2_weightry   r{   rv   rz   quant_methodw13_input_scalew2_input_scale   )r   )rU   r   ro   rp   getr   r?   uint32rc   r   emptyregister_parameterr   onesfloat32rf   r|   TENSORvalueCHANNELr9   r   r*   )r:   rP   r   r   r   r   extra_weight_attrsr   r   online_int4fp8_weight_loaderr   r   ry   r{   rv   rz   r   r   r   r(   r(   r)   create_weights   s   
		







z$QuarkInt4Fp8MoEMethod.create_weightsr=   Nc           	      C   sx  t r ts | jd9  _| jd9  _| jd9  _| jd9  _tjjt	|j
jddd|_
tj  tjjt	|jjddd|_tj  |jd usMJ |j}|jjddj}t|jD ]4}d}|| }td	D ]'}|j| | |kr|j| | | }|j| |||   |9  < ||7 }qiq]tjj|dd|_t|jD ]}|j|  || 9  < |j|  |j| 9  < qd S )
Ng      ?r0   )   r   Fr   r/   )dimr   rn   )rf   	ON_GFX950rv   rz   ry   r{   r?   rc   r   r   r   datar   empty_cacher   r   maxvaluesranger   )	r:   rP   r}   max_w13_scalesrk   startmax_w13_scale_fp8rj   int4_rescaler(   r(   r)   process_weights_after_loadinga  sJ   




z3QuarkInt4Fp8MoEMethod.process_weights_after_loadingmoe_runner_configc                 C   s
   || _ d S r>   )r   )r:   rP   r   r(   r(   r)   create_moe_runner  s   
z'QuarkInt4Fp8MoEMethod.create_moe_runnerdispatch_outputr   c                 C   sv   ddl m} |j}| j}|jrJ d|j dt|j|j|j|j	|j
tj|j|j|jdkr1tjntjd	}||dS )Nr   )StandardCombineInputzno_combine=z is not supported.silu)
quant_typew1_scalew2_scale
activation)hidden_states)&sglang.srt.layers.moe.token_dispatcherr   topk_outputr   
no_combiner   r   r   r   topk_weightstopk_idsr   	per_Tokenrv   rz   r   r   SiluGelu)r:   rP   r   r   r   r   outputr(   r(   r)   apply  s*   

zQuarkInt4Fp8MoEMethod.apply)rZ   r[   r\   r]   r<   r   r?   rc   rd   rb   ra   r   r   r   r   r   r   r(   r(   r(   r)   rW      s8    a
j4
rW   r>   )1loggingtypingr   r   r   r   r   r?   r   tqdm.stdr   sglang.srt.distributedr	   sglang.srt.layers.int4fp8_utilsr
   r   r   sglang.srt.layers.moer   *sglang.srt.layers.quantization.base_configr   r   r   "sglang.srt.layers.quantization.fp8r   sglang.srt.utilsr   r   r   r   r   rf   aiterr   r   aiter.fused_moer   aiter.ops.shuffler   r   get_device_propertiesgcnArchNamer   	getLoggerrZ   loggerr*   r+   rW   r(   r(   r(   r)   <module>   s.    
M