o ÔÙ¾ièMã @s<ddlmZddlZddlZddlmZmZmZmZddl Z ddl Z ddl mZddl mZmZddlmZddlmZddlmZmZmZmZdd lmZdd lmZmZmZmZerdddl m!Z!m"Z"eƒZ#eƒZ$eƒZ%e#rŒddl&m'Z'm(Z(m)Z)m*Z*dd l+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1ne$s“e 2d¡e 3e4¡Z5Gdd„deƒZ6d4dd„Z7ej8ej9ej:hZ;ejej?ej@ejAhZBejCejDejEejFejGhZHejIejJejKejLejMejNejOejPejQh ZReBeHBeRBZSeBeHBeRBZTeBeHBZUd5dd„ZVd6d%d&„ZW d7d8d*d+„ZXGd,d-„d-eƒZYGd.d/„d/eƒZZGd0d1„d1eYƒZ[Gd2d3„d3eƒZ\dS)9é)ÚannotationsN)Ú TYPE_CHECKINGÚAnyÚListÚOptional)ÚGGMLQuantizationType)Ú ParameterÚUninitializedParameter)Ú LinearBase)ÚMoeRunnerConfig)ÚFusedMoEMethodBaseÚLinearMethodBaseÚQuantizationConfigÚQuantizeMethodBase)ÚUnquantizedLinearMethod)Úis_cudaÚis_hipÚis_xpuÚset_weight_attrs)ÚCombineInputÚStandardDispatchOutput)Úgelu_and_mulÚmoe_align_block_sizeÚmoe_sumÚsilu_and_mul)Úggml_dequantizeÚggml_moe_a8Úggml_moe_a8_vecÚggml_moe_get_block_sizeÚggml_mul_mat_a8Úggml_mul_mat_vec_a8ú.Only CUDA support GGUF quantization currently.cs€eZdZdZd&d'‡fdd„ Zd(d d„Zd)d d„Zd*dd„Zd+dd„Ze d,dd„ƒZ e d-dd„ƒZe d.dd„ƒZd/d$d%„Z ‡ZS)0Ú GGUFConfigzConfig class for GGUF.NÚmodules_to_not_convertúlist[str] | NoneÚreturnÚNonecs&tƒ ¡trt d¡|pg|_dS)Nr!)ÚsuperÚ__init__Ú_is_hipÚwarningsÚwarnr#)Úselfr#©Ú __class__©úW/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/gguf.pyr(7s zGGUFConfig.__init__ÚstrcCódS)NzGGUFConfig()r/©r,r/r/r0Ú__repr__=ózGGUFConfig.__repr__ú List[str]cCógS©Nr/r3r/r/r0Úget_scaled_act_names@r5zGGUFConfig.get_scaled_act_namesú'str'cCr2)NÚggufr/r3r/r/r0Úget_nameCr5zGGUFConfig.get_nameúlist[torch.dtype]cCstjtjtjgSr8)ÚtorchÚhalfÚbfloat16Úfloat32r3r/r/r0Úget_supported_act_dtypesFsz#GGUFConfig.get_supported_act_dtypesÚintcCr2)Né<r/©Úclsr/r/r0Úget_min_capabilityIózGGUFConfig.get_min_capabilityú list[str]cCr7r8r/rEr/r/r0Úget_config_filenamesMrHzGGUFConfig.get_config_filenamesÚconfigúdict[str, Any]ú'GGUFConfig'cCs| |dgd¡}||ƒS)Nr#)Úget_from_keys_or)rFrKr#r/r/r0Úfrom_configQsÿzGGUFConfig.from_configÚlayerútorch.nn.ModuleÚprefixúOptional['QuantizeMethodBase']cCsdddlm}ddlm}t|tƒrt||jƒrtƒSt |ƒSt||ƒr't |ƒSt||ƒr0t|ƒSdS)Nr)ÚFusedMoE)ÚVocabParallelEmbedding)Ú&sglang.srt.layers.moe.fused_moe_tritonrTÚ*sglang.srt.layers.vocab_parallel_embeddingrUÚ isinstancer Úis_layer_skipped_ggufr#rÚGGUFLinearMethodÚGGUFEmbeddingMethodÚ GGUFMoEMethod)r,rPrRrTrUr/r/r0Úget_quant_methodXs zGGUFConfig.get_quant_methodr8)r#r$r%r&)r%r1)r%r6)r%r:)r%r=)r%rC)r%rI)rKrLr%rM)rPrQrRr1r%rS)Ú__name__Ú __module__Ú__qualname__Ú__doc__r(r4r9r<rBÚclassmethodrGrJrOr]Ú __classcell__r/r/r-r0r"4s r"rRr1r#rIcst‡fdd„|DƒƒS)Nc3s|]}|ˆvVqdSr8r/)Ú.0Úmodule_name©rRr/r0Ú js€z(is_layer_skipped_gguf..)Úany)rRr#r/rfr0rYisrYÚxútorch.TensorÚqweightÚqweight_typerCr%c Cs4|tvr|jddkr dnd}n|jddkrdnd}|jddkr3tj|jd|jd|j|jdS|tvr<||jS|jd|krS|tvrSt ||||jdƒ}|S|t vrct||||jdƒ}|S|tvrt j|\}}|jd|jd||f}t||g|¢|j‘RŽ}||j}|St|ƒ}td |›ƒ‚) Nriéééé©ÚdtypeÚdeviceéú$Unsupported GGUF quantization type: )ÚIMATRIX_QUANT_TYPESÚshaper>ÚemptyrrrsÚUNQUANTIZED_TYPESÚTÚMMVQ_QUANT_TYPESr ÚMMQ_QUANT_TYPESrÚ DEQUANT_TYPESr;ÚGGML_QUANT_SIZESrÚ WeightTypeÚNotImplementedError) rirkrlÚ mmvq_safeÚyÚ block_sizeÚ type_sizerwÚweightr/r/r0Úfused_mul_mat_ggufs*" ò õ þr†Úw1Úw2Útopk_weightsÚtopk_idsÚ qweight_type2Ú activationcs6d ‡fdd„}t |¡} |tvrr|tvrr|jddkrr|j\} }|j\}} }|jd}t|ƒ}t|||ƒ\}}}t||||||| || ƒ }||ƒ}t|||||||jdd| |ƒ }| | ||jd¡ | | |d¡¡}t || ƒ| S|tvrÀ|tvrÀ|j\} }|j\}} }|jd}t|||||| | ƒ}||ƒ}t|||d||jd| |ƒ}| | ||jd¡ | | |d¡¡}t || ƒ| St d¡tt||ƒƒD]L\}\}}|| d |jdd…¡}d}t||ƒD],\}}||}t|||ƒ}||ƒ}||}t|||ƒ |¡}|dur|}qç| |¡qç|| |<qÌ| S)Nrirjcsp|jdd}|jdd…|f}tj||j|jd}ˆdkr&t||ƒ|Sˆdkr1t||ƒ|Stdˆ›ƒ‚)NéÿÿÿÿrorqÚsiluÚgeluzUnsupported activation: )rwr>rxrrrsrrÚ ValueError)riÚdÚoutput_shapeÚout©rŒr/r0Úact½s ü ÿzfused_moe_gguf..actré@rtznThere is no support for fast MoE kernel for current quantization method. Falling back to slow implementation. )rt)rirj)r>Ú empty_liker|rwrrrÚreshapeÚmul_Úviewrr{rÚloggerÚwarning_onceÚ enumerateÚzipr†Úadd_)rir‡rˆr‰rŠrlr‹rŒr•Úout_hidden_statesÚ num_tokensÚ_ÚEÚNÚtop_kÚ BLOCK_SIZEÚsorted_token_idsÚ expert_idsÚnum_tokens_post_paddedr“ÚtokÚwÚidxÚinpÚcurrent_hidden_stateÚwwÚiiÚ expert_upÚexpert_downÚ current_stater/r”r0Úfused_moe_gguf³s– ÿ÷÷ÿ (Ù ÿÿ éÿÿþ r´Úhidden_sizerrútorch.dtype | Nonec Csž|tvr t ||¡S|tvrDtj|\}}| ¡}||jd||ks&J‚tj|d|d}t ||||jd|ƒ} | j g|j¢|‘RŽSt|ƒ}td|›ƒ‚)Nrtr)ÚdimÚindexru) ryr>Ú embeddingr}r;r~ÚflattenrwÚindex_selectrršrr€) rirkrlrµrrrƒr„Úx_flatÚquantÚdequantr/r/r0Úapply_gguf_embeddingsÿr¿c@sFeZdZdZddd„Zddd„Zd dd„Zd dd„Z d!d"dd„ZdS)#rZz[Linear method for GGUF. Args: quant_config: The GGUF quantization config. Úquant_configr"cCó ||_dSr8©rÀ©r,rÀr/r/r0r(<ó zGGUFLinearMethod.__init__rPrQÚinput_size_per_partitionrCÚoutput_partition_sizesú list[int]Ú input_sizeÚoutput_sizeÚparams_dtypeútorch.dtypec Ksš||_t|ƒ}||f} tdd} t| dd| dggidœƒt| |ƒ| d| ¡ttjt|ƒtj ddd}t|ddidd œƒt||ƒ| d |¡dS)NF©Ú requires_gradrtrT)Ú input_dimÚ output_dimÚtensor_shapeÚis_gguf_weightÚdata_containerÚshard_idÚshard_id_maprk©rr)Úis_gguf_weight_typeÚweight_typeÚshard_weight_typeÚignore_warningrl) rÊÚsumÚGGUFUninitializedParameterrÚregister_parameterrr>rxÚlenÚuint8)r,rPrÅrÆrÈrÉrÊÚextra_weight_attrsÚoutput_size_per_partitionrÐrkrlr/r/r0Úcreate_weights?s> ùþ þüþ zGGUFLinearMethod.create_weightscCsD|jj}|tvs|tvst|ƒ}td|›d|›dƒ‚| |¡dS)Nz#Unsupported GGUF quantization type z in layer Ú.)rlr×ryr}rrÚ_create_padded_weight_param)r,rPrlr/r/r0Úprocess_weights_after_loadingmsÿz.GGUFLinearMethod.process_weights_after_loadingcCsf|j}|j}|j}t|j}ƒdkr±dd„|Dƒ}t|ƒdks(Jtd|›ƒƒ‚tt|ƒƒ}tdd„|Dƒƒ}t dd„|Dƒƒ}t j||f||jd} t tttttffƒ} |D]6}||}t d d„|d |…Dƒƒ} | || d¡}|| d¡}||| | |…d |…f<| ||f| |<qY|j ¡t| dd }t|t|ƒƒt|d| iƒ| d|¡d Sd S)z;Create padded weight parameter for GGUF MergedLinear layer.rtcSsh|]}|j’qSr/rÕ)rdÚdatar/r/r0Ú ~sz?GGUFLinearMethod._create_padded_weight_param..z!Data container has mixed dtypes: csó|]}| d¡VqdS)rtN©Úsize©rdrir/r/r0rg„ó€z?GGUFLinearMethod._create_padded_weight_param..csrç©rNrèrêr/r/r0rg…rërqcsrçrìrèrêr/r/r0rgrëNrFrÌÚshard_offset_maprk)rkrÔrÓrÝrÒrÚnextÚiterÚmaxrÚr>ÚzerosrsÚdictr1ÚtuplerCréÚclearrrÚvarsrÜ)r,rPrkrÔrÓrÒrrÚpadded_sideÚconcat_sideÚpadded_datarír¬Úid_in_containerÚstartÚendréÚpadded_paramr/r/r0rãxs8ÿÿ åz,GGUFLinearMethod._create_padded_weight_paramNrirjÚbiasútorch.Tensor | Noner%c Cs¸|jj}|rDd|vrgd¢n|}|j}g}|D]$}|jj|\}} } |jj|}| t|||| …d| …f ¡|ƒ¡qtj |dd}n |j}|jj }t|||ƒ}|durZ| |¡|S)NÚq)rÿÚkÚvrt)Úaxis)rkrÓrírlrØÚappendr†Ú contiguousr>Úcatr×rŸ) r,rPrirýrÓrkÚresultr¬rúrûÚoffsetrlr“r/r/r0Úapplyšs(ÿÿ zGGUFLinearMethod.apply©rÀr")rPrQrÅrCrÆrÇrÈrCrÉrCrÊrË)rPrQr8)rPrQrirjrýrþr%rj) r^r_r`rar(rárärãrr/r/r/r0rZ5s . &ürZc@s8eZdZdZddd„Zddd„Zddd„Zddd„ZdS)r\zXMoE method for GGUF. Args: quant_config: The GGUF quantization config. rÀr"cCrÁr8rÂrÃr/r/r0r(ÀrÄzGGUFMoEMethod.__init__rPrQÚnum_expertsrCrµÚintermediate_size_per_partitionrÊrËcKs|d||f}tdd}t|dd|dgdœƒt||ƒ| d|¡ttjdtjd dd} t| dddd œƒt| |ƒ| d| ¡|||f}tdd} t| dd|dgdœƒt| |ƒ| d| ¡ttjdtjd dd}t|dddd œƒt||ƒ| d |¡dS)NroFrÌrtrT)rÎrÏrÐrÑrÒÚw13_qweightrÕ)rÖr×rÙÚw13_qweight_typeÚ w2_qweightÚw2_qweight_type)rÛrrÜrr>rxrÞ)r,rPr rµrrÊrßrÐrr rrr/r/r0ráÃsX ûþ ÿ þ ûþ ÿ þ zGGUFMoEMethod.create_weightsÚmoe_runner_configrcCs ||_dSr8)r)r,rPrr/r/r0Úcreate_moe_runners zGGUFMoEMethod.create_moe_runnerÚdispatch_outputrr%rc Csz|jdusJ‚ddlm}|jjdksJdƒ‚|j}|j}|j}|\}}} t||j|j |||j j|jj|jd} || dS)Nr)ÚStandardCombineInputrŽz"Only SiLU activation is supported.)rir‡rˆr‰rŠrlr‹rŒ)Ú hidden_states) Ú fused_expertsÚ&sglang.srt.layers.moe.token_dispatcherrrrŒrÚtopk_outputr´rrr r×r)r,rPrrrirrr‰rŠr¢Úoutputr/r/r0rs(ÿ ø zGGUFMoEMethod.applyNr ) rPrQr rCrµrCrrCrÊrË)rPrQrr)rPrQrrr%r)r^r_r`rar(rárrr/r/r/r0r\¹s >r\c@seZdZdZd dd„Zd S)r[z^Embedding method for GGUF. Args: quant_config: The GGUF quantization config. rPrQrirjr%cCs,|j}|jj}|jd}t|||||jdS)NrtrÕ)rkrlr×rÐr¿rÊ)r,rPrirkrlrµr/r/r0r¹-s ÿzGGUFEmbeddingMethod.embeddingN)rPrQrirjr%rj)r^r_r`rar¹r/r/r/r0r[&sr[c@seZdZUeZded<dS)rÛzlist[torch.Tensor]rÒN)r^r_r`rÚ cls_to_becomeÚ__annotations__r/r/r/r0rÛ7s rÛ)rRr1r#rI)rirjrkrjrlrCr%rj)rirjr‡rjrˆrjr‰rjrŠrjrlrCr‹rCrŒr1r%rjr8)rirjrkrjrlrCrµrCrrr¶r%rj)]Ú __future__rÚloggingr*Útypingrrrrr;r>rrÚtorch.nn.parameterrr Úsglang.srt.layers.linearr Úsglang.srt.layers.moerÚ*sglang.srt.layers.quantization.base_configrr rrÚ&sglang.srt.layers.quantization.unquantrÚsglang.srt.utilsrrrrrrrÚ_is_cudar)Ú_is_xpuÚ sgl_kernelrrrrÚsgl_kernel.quantizationrrrrrr r+Ú getLoggerr^r›r"rYÚF32ÚF16ÚBF16ryÚQ4_0ÚQ4_1ÚQ5_0ÚQ5_1ÚQ8_0ÚQ8_1ÚSTANDARD_QUANT_TYPESÚQ2_KÚQ3_KÚQ4_KÚQ5_KÚQ6_KÚKQUANT_TYPESÚIQ1_MÚIQ1_SÚIQ2_XXSÚIQ2_XSÚIQ2_SÚIQ3_XXSÚIQ3_SÚIQ4_XSÚIQ4_NLrvr}r{r|r†r´r¿rZr\r[rÛr/r/r/r0Ús|" 5ú û÷ #pûm