o
    پiM                  	   @  s<  d dl mZ d dlZd dlZd dlmZmZmZmZ d dl	Z	d dl
Z
d dl	mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZmZmZ erdd dl m!Z!m"Z" e Z#e Z$e Z%e#rd dl&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 ne$se2d e3e4Z5G dd deZ6d4ddZ7ej8ej9ej:hZ;ej<ej=ej>ej?ej@ejAhZBejCejDejEejFejGhZHejIejJejKejLejMejNejOejPejQh	ZReBeHB eRB ZSeBeHB eRB ZTeBeHB ZUd5ddZVd6d%d&ZW	d7d8d*d+ZXG d,d- d-eZYG d.d/ d/eZZG d0d1 d1eYZ[G d2d3 d3eZ\dS )9    )annotationsN)TYPE_CHECKINGAnyListOptional)GGMLQuantizationType)	ParameterUninitializedParameter)
LinearBase)MoeRunnerConfig)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)UnquantizedLinearMethod)is_cudais_hipis_xpuset_weight_attrs)CombineInputStandardDispatchOutput)gelu_and_mulmoe_align_block_sizemoe_sumsilu_and_mul)ggml_dequantizeggml_moe_a8ggml_moe_a8_vecggml_moe_get_block_sizeggml_mul_mat_a8ggml_mul_mat_vec_a8.Only CUDA support GGUF quantization currently.c                      s   e Zd ZdZd&d' fddZd(d
dZd)ddZd*ddZd+ddZe	d,ddZ
e	d-ddZe	d.ddZd/d$d%Z  ZS )0
GGUFConfigzConfig class for GGUF.Nmodules_to_not_convertlist[str] | NonereturnNonec                   s&   t    trtd |pg | _d S )Nr!   )super__init___is_hipwarningswarnr#   )selfr#   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/gguf.pyr(   7   s   

zGGUFConfig.__init__strc                 C     dS )NzGGUFConfig()r/   r,   r/   r/   r0   __repr__=      zGGUFConfig.__repr__	List[str]c                 C     g S Nr/   r3   r/   r/   r0   get_scaled_act_names@   r5   zGGUFConfig.get_scaled_act_names'str'c                 C  r2   )Nggufr/   r3   r/   r/   r0   get_nameC   r5   zGGUFConfig.get_namelist[torch.dtype]c                 C  s   t jt jt jgS r8   )torchhalfbfloat16float32r3   r/   r/   r0   get_supported_act_dtypesF   s   z#GGUFConfig.get_supported_act_dtypesintc                 C  r2   )N<   r/   clsr/   r/   r0   get_min_capabilityI      zGGUFConfig.get_min_capability	list[str]c                 C  r7   r8   r/   rE   r/   r/   r0   get_config_filenamesM   rH   zGGUFConfig.get_config_filenamesconfigdict[str, Any]'GGUFConfig'c                 C  s   |  |dgd }| |S )Nr#   )get_from_keys_or)rF   rK   r#   r/   r/   r0   from_configQ   s   zGGUFConfig.from_configlayertorch.nn.ModuleprefixOptional['QuantizeMethodBase']c                 C  sd   ddl m} ddlm} t|trt|| jrt S t	| S t||r't
| S t||r0t| S d S )Nr   )FusedMoE)VocabParallelEmbedding)&sglang.srt.layers.moe.fused_moe_tritonrT   *sglang.srt.layers.vocab_parallel_embeddingrU   
isinstancer
   is_layer_skipped_ggufr#   r   GGUFLinearMethodGGUFEmbeddingMethodGGUFMoEMethod)r,   rP   rR   rT   rU   r/   r/   r0   get_quant_methodX   s   


zGGUFConfig.get_quant_methodr8   )r#   r$   r%   r&   )r%   r1   )r%   r6   )r%   r:   )r%   r=   )r%   rC   )r%   rI   )rK   rL   r%   rM   )rP   rQ   rR   r1   r%   rS   )__name__
__module____qualname____doc__r(   r4   r9   r<   rB   classmethodrG   rJ   rO   r]   __classcell__r/   r/   r-   r0   r"   4   s    



r"   rR   r1   r#   rI   c                   s   t  fdd|D S )Nc                 3  s    | ]}| v V  qd S r8   r/   ).0module_namerR   r/   r0   	<genexpr>j   s    z(is_layer_skipped_gguf.<locals>.<genexpr>)any)rR   r#   r/   rf   r0   rY   i   s   rY   xtorch.Tensorqweightqweight_typerC   r%   c           	      C  s4  |t v r|jd dkrdnd}n|jd dkrdnd}| jd dkr3tj| jd |jd | j| jdS |tv r<| |j S | jd |krS|tv rSt	|| ||jd }|S |t
v rct|| ||jd }|S |tv rtj| \}}|jd |jd | | f}t||g|| jR  }| |j }|S t|}td	| )
Nr   i               dtypedevice   $Unsupported GGUF quantization type: )IMATRIX_QUANT_TYPESshaper>   emptyrr   rs   UNQUANTIZED_TYPESTMMVQ_QUANT_TYPESr    MMQ_QUANT_TYPESr   DEQUANT_TYPESr;   GGML_QUANT_SIZESr   
WeightTypeNotImplementedError)	ri   rk   rl   	mmvq_safey
block_size	type_sizerw   weightr/   r/   r0   fused_mul_mat_gguf   s*   "

r   w1w2topk_weightstopk_idsqweight_type2
activationc                   s6  d
 fdd}t | }	|tv rr|tv rr| jd dkrr| j\}
}|j\}}}|jd }t|}t|||\}}}t| ||||||||
	}||}t|||||||jd d|
| 	}||
||jd |	|
|d}t
||	 |	S |tv r|tv r| j\}
}|j\}}}|jd }t| ||||||
}||}t|||d||jd |
| }||
||jd |	|
|d}t
||	 |	S td tt||D ]L\}\}}| | d	| jdd   }d }t||D ],\}}|| }t|||}||}|| }t||||}|d u r|}q|| q||	|< q|	S )Nri   rj   c                   sp   | j d d }| j d d |f }tj|| j| jd} dkr&t||  |S  dkr1t||  |S td  )Nro   rq   silugeluzUnsupported activation: )rw   r>   rx   rr   rs   r   r   
ValueError)ri   doutput_shapeoutr   r/   r0   act   s   

zfused_moe_gguf.<locals>.actr   @   rt   znThere is no support for fast MoE kernel for current quantization method. Falling back to slow implementation. )rt   )ri   rj   )r>   
empty_liker|   rw   r   r   r   reshapemul_viewr   r{   r   loggerwarning_once	enumeratezipr   add_)ri   r   r   r   r   rl   r   r   r   out_hidden_states
num_tokens_ENtop_k
BLOCK_SIZEsorted_token_ids
expert_idsnum_tokens_post_paddedr   tokwidxinpcurrent_hidden_statewwii	expert_upexpert_downcurrent_stater/   r   r0   fused_moe_gguf   s   





(




r   hidden_sizerr   torch.dtype | Nonec           
      C  s   |t v r
t|| S |tv rDtj| \}}|  }||jd | | ks&J tj|d|d}t	||||jd |}	|	j
g | j|R  S t|}td| )Nrt   r   )dimindexru   )ry   r>   	embeddingr}   r;   r~   flattenrw   index_selectr   r   r   r   )
ri   rk   rl   r   rr   r   r   x_flatquantdequantr/   r/   r0   apply_gguf_embedding  s   r   c                   @  sF   e Zd ZdZdddZdddZd ddZd ddZ	d!d"ddZdS )#rZ   z[Linear method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    quant_configr"   c                 C  
   || _ d S r8   r   r,   r   r/   r/   r0   r(   <     
zGGUFLinearMethod.__init__rP   rQ   input_size_per_partitionrC   output_partition_sizes	list[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec              
   K  s   || _ t|}||f}	tdd}
t|
dd|	dg g i d t|
| |d|
 ttjt|tj	ddd}t|ddi dd	 t|| |d
| d S )NFrequires_gradrt   r   T)	input_dim
output_dimtensor_shapeis_gguf_weightdata_containershard_idshard_id_maprk   rr   )is_gguf_weight_typeweight_typeshard_weight_typeignore_warningrl   )
r   sumGGUFUninitializedParameterr   register_parameterr   r>   rx   lenuint8)r,   rP   r   r   r   r   r   extra_weight_attrsoutput_size_per_partitionr   rk   rl   r/   r/   r0   create_weights?  s>   



	zGGUFLinearMethod.create_weightsc                 C  sD   |j j}|tv s|tv st|}td| d| d| | d S )Nz#Unsupported GGUF quantization type z
 in layer .)rl   r   ry   r}   r   r   _create_padded_weight_param)r,   rP   rl   r/   r/   r0   process_weights_after_loadingm  s   z.GGUFLinearMethod.process_weights_after_loadingc                 C  sf  |j }|j}|j}t|j }dkrdd |D }t|dks(J td| tt|}tdd |D }t	dd |D }t
j||f||jd}	ttttttf f  }
|D ]6}|| }t	d	d |d
| D }||| d }|| d}|| |	||d
|f< |||f|
|< qY|j  t|	dd}t|t| t|d|
i |d| d
S d
S )z;Create padded weight parameter for GGUF MergedLinear layer.rt   c                 S  s   h | ]}|j qS r/   r   )rd   datar/   r/   r0   	<setcomp>~  s    z?GGUFLinearMethod._create_padded_weight_param.<locals>.<setcomp>z!Data container has mixed dtypes: c                 s      | ]}| d V  qdS )rt   Nsizerd   ri   r/   r/   r0   rg         z?GGUFLinearMethod._create_padded_weight_param.<locals>.<genexpr>c                 s  r   r   Nr   r   r/   r/   r0   rg     r   rq   c                 s  r   r   r   r   r/   r/   r0   rg     r   Nr   Fr   shard_offset_maprk   )rk   r   r   r   r   r   nextitermaxr   r>   zerosrs   dictr1   tuplerC   r   clearr   r   varsr   )r,   rP   rk   r   r   r   rr   padded_sideconcat_sidepadded_datar   r   id_in_containerstartendr   padded_paramr/   r/   r0   r   x  s8   
z,GGUFLinearMethod._create_padded_weight_paramNri   rj   biastorch.Tensor | Noner%   c              	   C  s   |j j}|rDd|v rg dn|}|j }g }|D ]$}|j j| \}}	}
|jj| }|t||||	d |
f  | qtj	|dd}n|j }|jj
}t|||}|d urZ|| |S )Nq)r   kvrt   )axis)rk   r   r   rl   r   appendr   
contiguousr>   catr   r   )r,   rP   ri   r   r   rk   resultr   r   r   offsetrl   r   r/   r/   r0   apply  s(   
zGGUFLinearMethod.applyr   r"   )rP   rQ   r   rC   r   r   r   rC   r   rC   r   r   )rP   rQ   r8   )rP   rQ   ri   rj   r   r   r%   rj   )	r^   r_   r`   ra   r(   r   r   r   r  r/   r/   r/   r0   rZ   5  s    


.
&rZ   c                   @  s8   e Zd ZdZdddZdddZdddZdddZdS )r\   zXMoE method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    r   r"   c                 C  r   r8   r   r   r/   r/   r0   r(     r   zGGUFMoEMethod.__init__rP   rQ   num_expertsrC   r   intermediate_size_per_partitionr   r   c                 K  s  |d| |f}t dd}t|dd|dg d t|| |d| ttjdtjd	dd}	t|	dddd
 t|	| |d|	 |||f}t dd}
t|
dd|dg d t|
| |d|
 ttjdtjd	dd}t|dddd
 t|| |d| d S )Nro   Fr   rt   r   T)r   r   r   r   r   w13_qweightr   )r   r   r   w13_qweight_type
w2_qweightw2_qweight_type)r   r   r   r   r>   rx   r   )r,   rP   r
  r   r  r   r   r   r  r  r  r  r/   r/   r0   r     sX   	










zGGUFMoEMethod.create_weightsmoe_runner_configr   c                 C  s
   || _ d S r8   )r  )r,   rP   r  r/   r/   r0   create_moe_runner  s   
zGGUFMoEMethod.create_moe_runnerdispatch_outputr   r%   r   c              
   C  sz   | j d u sJ ddlm} | jjdksJ d|j}|j}| j}|\}}}	t||j|j	|||j
j|jj|jd}
||
dS )Nr   )StandardCombineInputr   z"Only SiLU activation is supported.)ri   r   r   r   r   rl   r   r   )hidden_states)fused_experts&sglang.srt.layers.moe.token_dispatcherr  r  r   r  topk_outputr   r  r  r  r   r  )r,   rP   r  r  ri   r  r  r   r   r   outputr/   r/   r0   r    s(   


zGGUFMoEMethod.applyNr	  )
rP   rQ   r
  rC   r   rC   r  rC   r   r   )rP   rQ   r  r   )rP   rQ   r  r   r%   r   )r^   r_   r`   ra   r(   r   r  r  r/   r/   r/   r0   r\     s    


>r\   c                   @  s   e Zd ZdZd
ddZd	S )r[   z^Embedding method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    rP   rQ   ri   rj   r%   c                 C  s,   |j }|jj}|jd }t||||| jdS )Nrt   r   )rk   rl   r   r   r   r   )r,   rP   ri   rk   rl   r   r/   r/   r0   r   -  s   
zGGUFEmbeddingMethod.embeddingN)rP   rQ   ri   rj   r%   rj   )r^   r_   r`   ra   r   r/   r/   r/   r0   r[   &  s    r[   c                   @  s   e Zd ZU eZded< dS )r   zlist[torch.Tensor]r   N)r^   r_   r`   r   cls_to_become__annotations__r/   r/   r/   r0   r   7  s   
 r   )rR   r1   r#   rI   )ri   rj   rk   rj   rl   rC   r%   rj   )ri   rj   r   rj   r   rj   r   rj   r   rj   rl   rC   r   rC   r   r1   r%   rj   r8   )ri   rj   rk   rj   rl   rC   r   rC   rr   r   r%   rj   )]
__future__r   loggingr*   typingr   r   r   r   r;   r>   r   r   torch.nn.parameterr   r	   sglang.srt.layers.linearr
   sglang.srt.layers.moer   *sglang.srt.layers.quantization.base_configr   r   r   r   &sglang.srt.layers.quantization.unquantr   sglang.srt.utilsr   r   r   r   r  r   r   _is_cudar)   _is_xpu
sgl_kernelr   r   r   r   sgl_kernel.quantizationr   r   r   r   r   r    r+   	getLoggerr^   r   r"   rY   F32F16BF16ry   Q4_0Q4_1Q5_0Q5_1Q8_0Q8_1STANDARD_QUANT_TYPESQ2_KQ3_KQ4_KQ5_KQ6_KKQUANT_TYPESIQ1_MIQ1_SIQ2_XXSIQ2_XSIQ2_SIQ3_XXSIQ3_SIQ4_XSIQ4_NLrv   r}   r{   r|   r   r   r   rZ   r\   r[   r   r/   r/   r/   r0   <module>   s|   "	


5	

#p m