o
    پiB1                     @   s6  d dl Z d dlmZ d dlZd dlmZmZmZmZm	Z	 d dl
mZ d dlmZ e Zer9d dlmZ d dlmZ e \ZZe eZdd	 Zefd
ejdejdejdejdededeej dedejfddZ	d!dejjdeddfddZ	d!dejjdeddfddZ 	d!dejdedejfddZ!dd  Z"dS )"    N)Optional)USE_FP32_REDUCE_DEFAULTmarlin_make_workspacemarlin_permute_biasmarlin_permute_scalesshould_use_atomic_add_reduce)get_scalar_types)is_cuda)gptq_marlin_gemm)gptq_marlin_repackc                 C   s\   d}| j tjkrd}n| j tjkrd}d|d  d|d   }t| d }|| }| | S )N               )dtypetorchhalfbfloat16	ones_like)scalesfp8_exponenttarget_exponentexponent_biass r   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/marlin_utils_fp8.py#fp8_fused_exponent_bias_into_scales   s   r   inputweightweight_scale	workspacesize_nsize_kbiasuse_fp32_reducereturnc                 C   s   |  d| jd }| jd d |f }	t|d||| j| jd}
t|d ||d d d d |tj|d|||
|d}|d urA|	| | |	S )Nr   )mnkdevicer   )ac
b_q_weightb_scalesglobal_scaleb_zerosg_idxpermr!   b_q_typesize_mr"   r#   use_atomic_addr%   )
reshapeshaper   sizer+   r   r
   scalar_typesfloat8_e4m3fnadd_)r   r   r    r!   r"   r#   r$   r%   
reshaped_x	out_shaper6   outputr   r   r   apply_fp8_marlin_linear*   s2   

r@   Tlayersize_k_firstc                 C   s>  t d | j}| j}t| dd }|r| jj||fksJ n
| jj||fks(J | jj}t|| _	t
jdt
j|d}t| j|}|sG|j }t||||dd}t
jj|dd| _d	t| v rg| j| j}	nd
t| v rv| j| j}	| `|d u r|dn|d }
|d u r|	 dkr|	dd|d}	nK|	 dkr|	 |kr||	  dksJ |	 }|	d|}	|	|| d}	n"|	d|}	n|s|	j }	|d }|	|d}	|	d d d |f }	t|	|||
d}t|}t
jj|dd| _t| dr| jd ur| jj|fksJ t| j}t
jj|dd| _d S d S d S )NYour GPU does not have native support for FP8 computation but FP8 quantization is being used. Weight-only FP8 compression will be used leveraging the Marlin kernel. This may degrade performance for compute-heavy workloads.weight_block_sizer   r   r+   r   r.   r3   r#   r"   num_bitsFrequires_gradr    weight_scale_invr'   r   r   r#   r"   
group_sizer$   ) loggerwarning_onceoutput_size_per_partitioninput_size_per_partitiongetattrr   r8   r+   r   r!   r   emptyintpack_fp8_to_int32T
contiguousr   nn	Parameterdirr    to
orig_dtyperJ   nelementviewrepeat_interleaver   r   hasattrr$   r   )rA   rB   part_size_npart_size_krD   r+   r3   qweightmarlin_qweightr   rL   s_sizeblock_nmarlin_scalesr$   r   r   r   prepare_fp8_layer_for_marlinV   sj   



rg   c              	   C   s  t d | j}| j}| j}t| dd }| jj}t|d| _	t
jdt
j|d}dD ]m}t| |}	g }
d|v r>|d |}}n||}}|rP|	j|||fksOJ n
|	j|||fksZJ t|D ]}t|	| |}|sn|j }t||||d	d
}|
| q^t
dd |
D d}	t
jj|	dd}	t| ||	 q)|d u rdn|d }dD ]}|d t| v r|d }t| || j}t| | n|d t| v r|d }t| || j}t| | g }
d|v r|d |}}n||}}|d u r@| |kr||dd|d}nY| |kr8| || kr8|| |  dks"J | | }||d|}||| d}n&||d|}n|sJ|ddd}|d }||d}|dd |f  }t|D ]}t || |||d}|
| qbt
dd |
D d}t!|}t
jj|dd}t| |d | qdD ]B}t"| |sqt| || j}g }
t|D ]}|| }|
t#| qt
dd |
D d}t
jj|dd}t| || qd S )NrC   rD   r   r   rE   )
w13_weight	w2_weightw13r   r   rF   c                 S      g | ]}| d qS r   	unsqueeze.0xr   r   r   
<listcomp>       z4prepare_moe_fp8_layer_for_marlin.<locals>.<listcomp>FrH   r'   r   )rj   w2_weight_scale_weight_scale_inv.rK   c                 S   rk   rl   rm   ro   r   r   r   rr     rs   )w13_biasw2_biasc                 S   rk   rl   rm   ro   r   r   r   rr   /  rs   )$rM   rN   num_expertshidden_sizeintermediate_size_per_partitionrQ   rh   r+   r   r!   r   rR   rS   r8   rangerT   rU   rV   r   appendcatrW   rX   setattrrY   rZ   r[   delattrr\   r]   r^   permuter   r   r_   r   )rA   rB   er*   r)   rD   r+   r3   namer   tensor_listr"   r#   irb   rc   rL   new_namer   rd   re   rf   r$   expert_biasr   r   r    prepare_moe_fp8_layer_for_marlin   s   






 r   
fp8_tensorc                 C   sR   | j tjksJ | jdksJ |r| jn| } |  } | tj}|r'|j S |S )zC
    Repack FP8 weights to gptq format (packed int32 elements)
    r   )r   r   r;   ndimrU   rV   r]   int32)r   rB   int32_tensorr   r   r   rT   4  s   rT   c                 C   s  | j \}}| j}|dkr3| |d| dd d }||d}| | tj}|| j	| }n&| |d| dd d }||d}| | tj}|| j	| }t
|dj }	t|	tjdtj|d||dd}
t|j|||d	}t|}|j|
|fS )
Nr'   r   i  r   FrE   r   rF   rK   )r8   r+   r]   absmaxr^   rZ   r   r;   r   rT   rU   rV   r   rR   rS   r   r   )r   rL   r"   r#   r+   r   repeated_scales
fp8_weight
weight_refpacked_weightrc   rf   r   r   r   marlin_quant_fp8_torchE  s0   
  
r   )T)#loggingtypingr   r   +sglang.srt.layers.quantization.marlin_utilsr   r   r   r   r   $sglang.srt.layers.quantization.utilsr   sglang.srt.utilsr	   _is_cudasglang.jit_kernel.gptq_marlinr
   $sglang.jit_kernel.gptq_marlin_repackr   
ScalarTyper:   	getLogger__name__rM   r   TensorrS   boolr@   rW   Modulerg   r   rT   r   r   r   r   r   <module>   sv   

	
-
]
 
