o
    پi_                     @  s  d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZ er>d dlmZ dd	 Ze \ZZei fdtddZduddZdvddZdwd!d"Zdxd$d%Zdyd)d*Zdzd/d0Zd{d3d4Zd|d7d8Z 		d}d~d?d@Z!ddDdEZ"dFdG Z#	dddNdOZ$ddSdTZ%ddUdVZ&ddXdYZ'	Z	ZdddadbZ(ej)ej*gZ+g dcZ,	dddedfZ-ddhdiZ.ddkdlZ/	mdddpdqZ0drds Z1dS )    )annotationsN)deepcopy)MappingProxyType)TYPE_CHECKINGDictListMappingOptionalTupleUnion)scaled_fp8_quant)QuantizationConfigc                  C  sT   zddl m} m} | |fW S  ty)   G dd d}G dd d}|| f Y S w )z<
    Returns:
        tuple: (ScalarType, scalar_types)
    r   )
ScalarTypescalar_typesc                   @  s   e Zd ZdS )z(get_scalar_types.<locals>.MockScalarTypeN)__name__
__module____qualname__ r   r   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/utils.pyMockScalarType   s    r   c                   @  s   e Zd ZdZdZdd ZdS )z)get_scalar_types.<locals>.MockScalarTypesuint4b8	uint8b128c                 S  s
   d| S )Nmock_r   )selfnamer   r   r   __getattr__%   s   
z5get_scalar_types.<locals>.MockScalarTypes.__getattr__N)r   r   r   r   r   r   r   r   r   r   MockScalarTypes!   s    r   )sgl_kernel.scalar_typer   r   ImportError)r   r   r   r   r   r   r   get_scalar_types   s   
r   prefixstrignored_layers	List[str]fused_mappingMapping[str, List[str]]returnboolc                   s     dd |v r= fdd| D }d }|D ] tfdd|D }|d u r/|}q||kr;td  dqn5t fd	d|D }d
 v rc d
d} d
d}||v rb||v rbd}nd v rrt fdd|D }|d usxJ |S )N.c                   s   g | ]}  |qS r   )replace).0shard_proj_name)r    	proj_namer   r   
<listcomp><   s    
z$is_layer_skipped.<locals>.<listcomp>c                 3      | ]}| v V  qd S Nr   r+   ignored)shard_prefixr   r   	<genexpr>C   s    
z#is_layer_skipped.<locals>.<genexpr>z$Detected some but not all shards of zF are quantized. All shards of fused layers to have the same precision.c                 3  r/   r0   r   r1   r    r   r   r4   P   s    gate_up_proj	gate_projup_projTexpertsc                   s   g | ]
}d |v r |v qS )r9   r   )r+   
layer_namer5   r   r   r.   X   s
    )splitany
ValueErrorr*   )r    r"   r$   shard_prefixes
is_skippedis_shard_skippedprefix_gate	prefix_upr   )r    r-   r3   r   is_layer_skipped.   sB   

rC   tensortorch.Tensor	inv_scaleUnion[float, torch.Tensor]c                 C  s   |  tj}|| }|S r0   )totorchfloat16)rD   rF   fake_qweight	dq_weightr   r   r   per_tensor_dequantizec   s   rM   xc                   s2   t  jdks	J t fddt jd D S )N   c                 3  s$    | ]}t  d   | V  qdS )r   N)rI   allclose)r+   irN   r   r   r4   m   s   " zall_close_1d.<locals>.<genexpr>r   )lenshapeallrangerR   r   rR   r   all_close_1dk   s    rW   weight_scalelogical_widths	List[int]!Tuple[torch.Tensor, torch.Tensor]c                 C  sx   t jt|dft j| jd}|  dkr||   |S d}t|D ]\}}|| }| | |||d d f< |}q#|S )NrO   dtypedevicer   )	rI   emptysumfloat32r^   dimfill_item	enumerate)rX   rY   weight_scale_channelstartidxlogical_widthendr   r   r   convert_to_channelwisep   s   rk   weightc                 C  s   |  }|d ttjjk}|r@d}t|D ](\}}|| }t| ||d d f || }	t|	|\| ||d d f< }
|}q|| fS )Nr)   r   )maxrI   finfofloat8_e4m3fnminre   rM   r   )rl   rX   rY   max_w_scaleunfused_module_in_checkpointrg   rh   ri   rj   	weight_dq_r   r   r   requantize_with_max_scale   s   	ru   oldnewNonec                 C  s   |  | d S r0   )copy_)rv   rw   r   r   r   update_tensor_inplace   s   rz   modtorch.nn.Moduler   'Union[torch.Tensor, torch.nn.Parameter]c                 C  s   t | |}t|t|u r&|j|jkr&|  |  kr&t|| d S t|tjj	s5tjj	|dd}| 
|tjj	|dd d S )NF)requires_grad)getattrtyper]   untyped_storagenbytesrz   
isinstancerI   nn	Parameterregister_parameter)r{   r   rw   rv   r   r   r   replace_parameter   s   
r   abc           	   	   C  s   | j |j ksJ | j|j  krtjksJ  J | tj}|tj}|tj|tj  }| 	 }|dk|dk @ 
  }|dk
  }|dk
  }|dkra|| dk ra|dksqJ d|d|d|d|d S )	Nr   rO      g{Gzt?zcount_diff_sign=z count_tiny_diff=z count_large_diff=z numel=)rT   r]   rI   ro   viewuint8rH   int16absnumelr`   rd   )	r   r   a_u8b_u8diff_u8r   count_diff_signcount_tiny_diffcount_large_diffr   r   r   assert_fp8_all_close   s   "
r   configr   c                 C  s
  t | |d| j}t|tr|| _t | |d| j}t|tr || _t | |d| j}t|tr0|| _d| j | _|  dkrmt | |d| j	}t|trL|| _	| j| j	f| j
vratd| j d| j	 | j
| j| j	f | _d S |  d	kr| jd
vrtd| j dd S d S )Nbits
group_sizedesc_act    gptq_marlinsymz&Unsupported quantization config: bits=z, sym=gptq)r            zOCurrently, only 2/3/4/8-bit weight quantization is supported for GPTQ, but got z bits.)get_dynamic_overrideweight_bitsr   intr   r   r'   pack_factorget_nameis_symTYPE_MAPr=   
quant_type)r   r    r   r   r   r   r   r   r   override_config   s@   




r   r:   keyOptional[str]default_valueUnion[int, bool, None]Union[Dict, int, bool, None]c                 C  sn   | j  D ]/\}}|drt|d|r dS qt|d|r4|d u r,|  S |||  S q|S )Nz-:Fz+:)dynamicitems
startswithrematchremoveprefixget)r   r:   r   r   patternpattern_dictr   r   r   r      s   
r   layerlinear_method_clsr   c           
      C  s   ddl m} ddlm}m} ddlm} t| }t||o|j	}	t||s'|	rCt
||ddu r7|	r4| S | S |r?t||d ||S d S )Nr   )
LinearBase)UnquantizedEmbeddingMethodUnquantizedLinearMethod)ParallelLMHead)r:   Fr5   )sglang.srt.layers.linearr   &sglang.srt.layers.quantization.unquantr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   r   lm_head_quantizedr   r   )
r   r   r    r   r   r   r   r   cloned_configparallel_lm_head_quantizedr   r   r   get_linear_quant_method  s   r   c                 C  s"   d|  dksJ d|  d|  S )Nr   r   zUnsupported num_bits = r   )num_bitsr   r   r   get_pack_factor.  s   r   q_ww_refr   r   	test_permOptional[torch.Tensor]c           
      C  s   | j |j ksJ | j}| j \}}tj|ftjd}t|D ]}|| ||< q|d ur,|nt|}	||	  }| |	d d f  } ||	d d f  }|j|d| j|d|j|d|	j|dfS Nr]   r^   )	rT   r^   rI   zerosint32rV   randperm
contiguousrH   )
r   r   r   r   orig_devicek_sizert   g_idxrQ   	rand_permr   r   r   permute_rows3  s   




r   r   size_ksize_nc                 C  s   | j ||fks	J t|}|| dksJ | j}|   tj} tj||| ftjd}t|D ]}|| d d |d |f || > O }q2t	
|tj|}| }|S Nr   r   )rT   r   r^   cpunumpyastypeuint32r   rV   rI   
from_numpyr   rH   r   r   r   r   r   r   r   q_resrQ   r   r   r   	pack_colsQ  s   $r   c                 C  s   | j ||fks	J t|}|| dksJ | j}|   tj} tj|| |ftjd}t|D ]}|| |d |d d f || > O }q2t	
|tj|}|S r   )rT   r   r^   r   r   r   r   r   rV   rI   r   r   rH   r   r   r   r   	pack_rowsk  s   $r   
packed_q_wc                 C  s   t |}|| dksJ | j||| fks J d| j|||| j}|   tj}tj||ftjd}d|> d }t	|D ]}	||@ }
||L }|
|d d |	d |f< qAt
|tj|}| }|S )Nr   z?packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}r   rO   )r   rT   formatr^   r   r   r   r   r   rV   rI   r   r   rH   r   )r   r   r   r   r   r   packed_q_w_cpur   maskrQ   valsr   r   r   unpack_cols  s*   
r   Fwr   r   Optional[int]zero_pointsref_zero_points_after_scalesc                   s  |  sJ d|r d usJ d| j}| j}| j\|  s%J d dkr+  d urI k rI| d f} | ddd} |  df} tj| ddd	j	}tj
| ddd	j	}| }	|
 }
td
g| j}d } d ur|r| s| dksJ || jdd|  }tt|| |
|	 }ntt||	dkr|	ntj t||
dkr|
ntj }t| |  |r|nd }t||
|	}|r|d ur||| |||  }n||r|nd || }| r||j7 } d ur k r fdd}||}||}|df }|d ur2|df }|j|d}|j|d|j|d d urE||fS d |fS )Nz<Floating point quantization may work but has not been testedzUto have group zero points, group_size must be provided (-1 group_size is channelwise)w must be floatr)   rO   r   r   T)keepdimg      ?gh㈵>)rp   c                   s4   |   df} | ddd} |  f } | S )Nr)   rO   r   r   )reshapepermuter   )r   r   r   r   r   r   	reshape_w  s   z#quantize_weights.<locals>.reshape_wr   )
is_integerr^   r]   rT   is_floating_pointr   r   rI   rm   valuesrp   TensorrH   	is_signedclamproundr   r   infhas_biasbiasr   )r   r   r   r   r   r   	orig_typemax_valmin_val	max_q_val	min_q_valw_s
maybe_w_zpw_qr   r   r   r   r   quantize_weights  sp   




r  )r)   r   @      	act_orderc                 C  s   | j \}}|  sJ d|tv sJ d| |t|g v s&J d| t| ||\}}}	}tjdtj| jd}
tjdtj| jd}|r]||k sRJ d	||t
||||\}}}
}|||	|
|fS )Nr   zUnsupported gptq type = zUnsupported groupsize = r   r\   z;For act_order, groupsize = {} must be less than size_k = {})rT   r   SUPPORTED_GPTQ_QUANT_TYPESSUPPORTED_GROUP_SIZESr  rI   r_   r   r^   r   r   )r   r   r   r	  r   r   rt   r   r  r  r   r   r   r   r   gptq_quantize_weights  s*   



r  r   c                 C  s\   | j }t|jtjd}||  }| |d d f  } | j|d|j|d|j|dfS r   )r^   rI   argsortrH   r   r   )r   r   r   sort_indicesr   r   r   sort_weights)  s   


r  scalec                 C  s  | j tjksJ | j}| jdkr| d} | jdksJ | j\}}}dd }||d}||d}tj|||f| j d}| |d	|d	|d	|f< |j\}	}
}|
d dksVJ |d dks^J ||	|
d dd
|d d}|d}|	 
 }|dkr|||S ||||S )z^
    Swizzle the scale tensor into a blockwise interleaved format for NVFP4 quantization.
    r   r   r   c                 S  s   | | d | | S )NrO   r   )rN   mr   r   r   <lambda>C  s    z$swizzle_blockscale.<locals>.<lambda>r  r   r   Nr   )r   rO   r   r   r      )r]   rI   ro   ndim	unsqueezerT   r   r   r   r   cuda)r  
scale_ndimBMKround_up_multipleM_paddedK_paddedpadded_scalebatchesrowscolsswizzled_scaler   r   r   swizzle_blockscale8  s,   





r#  rb   !tuple[torch.Tensor, torch.Tensor]c           	      C  s   |  |}|d dksJ d| d| |d }| j||d\}}|j||d\}}tj||g|d tj||g|d fS )z:Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`r   r   zExpected even size in dim z, got )rb   )sizer;   rI   catr   )	rl   r  rb   r&  halfw1w3s1s3r   r   r   reorder_w1w3_to_w3w1U  s   
 r-  c                 C  s   ddl m} ddlm}m}	 	 i }
d}| tj|d| |d }|tj|d| |d }|tj|||d }|tj|||d }g }g }g }g }t	|D ]}||
|| tj
|}||| tj
||j   ||
|| tj
|dd}|||| tj
||j   |	|
|| tj
|}||| tj
||j   |	|
|| tj
|dd}|||| tj
||j   qWt|}t|tj|d| |d }t|}t|tj|||d }||||fS )Nr   )nvfp4_block_scale_interleave)'_maybe_get_cached_w3_w1_permute_indices!get_w2_permute_indices_with_cacher  r      )num_elts_per_sf)
flashinferr.  flashinfer.fused_moe.corer/  r0  r   rI   ro   r   rV   r   appendrH   r^   r   stack)gemm1_weightsgemm2_weightsgemm1_scales_linear_fp4_bytesgemm2_scales_linear_fp4_byteshidden_sizeintermediate_sizenum_expertsr.  r/  r0  _cache_permute_indicesepilogue_tile_mgemm1_weights_fp4gemm1_scales_linear_fp4gemm2_weights_fp4gemm2_scales_linear_fp4gemm1_weights_fp4_shuffledgemm1_scales_fp4_shuffledgemm2_weights_fp4_shuffledgemm2_scales_fp4_shuffledrQ   permute_indicespermute_sf_indicesr   r   r   )prepare_static_weights_for_trtllm_fp4_moef  s   	






rJ  )r    r!   r"   r#   r$   r%   r&   r'   )rD   rE   rF   rG   r&   rE   )rN   rE   r&   r'   )rX   rE   rY   rZ   r&   r[   )rl   rE   rX   rE   rY   rZ   r&   r[   )rv   rE   rw   rE   r&   rx   )r{   r|   r   r!   rw   r}   r&   rx   )r   rE   r   rE   )r   r   r    r!   )NN)
r   r   r:   r!   r   r   r   r   r&   r   )r   r   r   r|   r    r!   r   r   r0   )r   rE   r   rE   r   r   r   r   )r   rE   r   r   r   r   r   r   )r   rE   r   r   r   r   r   r   )FF)
r   rE   r   r   r   r   r   r'   r   r'   )
r   rE   r   r   r   r   r	  r'   r   r   )r   rE   r   rE   )r  rE   )r$  )rl   rE   r  rE   rb   r   r&   r%  )2
__future__r   r   copyr   typesr   typingr   r   r   r   r	   r
   r   r   rI   )sglang.srt.layers.quantization.fp8_kernelr   *sglang.srt.layers.quantization.base_configr   r   r   r   rC   rM   rW   rk   ru   rz   r   r   r   r   r   r   r   r   r   r   r  r   r   r
  r  r  r  r#  r-  rJ  r   r   r   r   <module>   sX   $

5






#
!	


%_
"
