o
    
۾iZ                     @   s  d dl mZ d dlmZ d dlmZ d dlZd dlZd dlmZ	 d dl
mZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZmZmZ d dlmZ d dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ ee,Z-G dd deZ.ei fde/de0e/ dee/e0e/ f fddZ1e	j2e	j3e	j4hZ5e	j6e	j7e	j8e	j9e	j:e	j;hZ<e	j=e	j>e	j?e	j@e	jAhZBe	jCe	jDe	jEe	jFe	jGe	jHe	jIe	jJe	jKh	ZLe<eBB eLB ZMe<eBB eLB ZNe<eBB ZOdejPdejPdeQdejPfddZRdejPdejPdeQdejPfd d!ZSze+deReSd" ejjjRZTW n eUy" ZV zeVdZV[Vww dejPd#ejPd$ejPd%ejPd&ejPdeQd'eQd(e/dejPfd)d*ZWdejPd#ejPd$ejPd%ejPd&ejPdeQd'eQd(e/dejPfd+d,ZXze+d*eWeXd" ejjjWZYW n eUyz ZV zeVdZV[Vww 	d;dejPdejPdeQd-eQd.ejZdB dejPfd/d0Z[	d;dejPdejPdeQd-eQd.ejZdB dejPfd1d2Z\ze+d0e[e\d" ejjj[Z]W n eUy ZV zeVdZV[Vww G d3d4 d4eZ^G d5d6 d6eZ_G d7d8 d8e^Z`G d9d: d:eZadS )<    )Mapping)MappingProxyType)AnyN)GGMLQuantizationType)	ParameterUninitializedParameter)_custom_ops)init_logger)FusedMoEConfigFusedMoEQuantConfig)FusedMoEFusedMoEMethodBase)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)UnquantizedEmbeddingMethodVocabParallelEmbedding)WeightsMapper)set_weight_attrs)current_platform)direct_register_custom_opc                       s   e Zd ZdZddee dB ddf fddZdefddZdefd	d
Z	dee
j fddZedefddZedee fddZedeeef dd fddZde
jjdeddfddZdddZ  ZS )
GGUFConfigzConfig class for GGUF.Nunquantized_modulesreturnc                    s   t    |pg | _d S N)super__init__r   )selfr   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/gguf.pyr   0   s   
zGGUFConfig.__init__c                 C      dS )NzGGUFConfig()r#   r    r#   r#   r$   __repr__4      zGGUFConfig.__repr__c                 C   r%   )Nggufr#   r&   r#   r#   r$   get_name7   r(   zGGUFConfig.get_namec                 C   s0   t drtd tjtjgS tjtjtjgS )Nd   z5GGUF has precision issues with bfloat16 on Blackwell.)r   has_device_capabilityloggerwarning_oncetorchhalffloat32bfloat16r&   r#   r#   r$   get_supported_act_dtypes:   s   

z#GGUFConfig.get_supported_act_dtypesc                 C   r%   )N<   r#   clsr#   r#   r$   get_min_capabilityB      zGGUFConfig.get_min_capabilityc                 C   s   g S r   r#   r5   r#   r#   r$   get_config_filenamesF   r8   zGGUFConfig.get_config_filenamesconfigc                 C   s   |  S r   r#   )r6   r:   r#   r#   r$   from_configJ   s   zGGUFConfig.from_configlayerprefixzQuantizeMethodBase | Nonec                 C   sj   t |trt|| j| jrt S t| S t |tr(t|| j| jr$t S t	| S t |t
r3t| |jS d S r   )
isinstancer   is_layer_skipped_ggufr   packed_modules_mappingr   GGUFLinearMethodr   r   GGUFEmbeddingMethodr   GGUFMoEMethod
moe_config)r    r<   r=   r#   r#   r$   get_quant_methodN   s   




zGGUFConfig.get_quant_methodhf_to_vllm_mapperr   c                 C   s    | j dur|| j | _ dS dS )a   
        Interface for models to update module names referenced in
        quantization configs in order to reflect the vllm model structure

        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
            structure of the qconfig) to vllm model structure
        N)r   
apply_list)r    rF   r#   r#   r$   apply_vllm_mapperb   s
   

zGGUFConfig.apply_vllm_mapperr   )rF   r   )__name__
__module____qualname____doc__liststrr   r'   r   r*   r/   dtyper3   classmethodintr7   r9   dictr   r;   nnModulerE   rH   __classcell__r#   r#   r!   r$   r   -   s(     
r   r=   r   fused_mappingc                    s     dd |v r= fdd| D }d }|D ] tfdd|D }|d u r/|}q||kr;td  dqnt fd	d|D }|d usNJ |S )
N.c                    s   g | ]}  |qS r#   )replace).0shard_proj_name)r=   	proj_namer#   r$   
<listcomp>{   s    
z)is_layer_skipped_gguf.<locals>.<listcomp>c                 3   s    | ]} |v V  qd S r   r#   rZ   module_name)shard_prefixr#   r$   	<genexpr>   s    
z(is_layer_skipped_gguf.<locals>.<genexpr>z$Detected some but not all shards of zF are quantized. All shards of fused layers to have the same precision.c                 3   s    | ]}| v V  qd S r   r#   r^   )r=   r#   r$   ra      s    )splitany
ValueError)r=   r   rV   shard_prefixes
is_skippedis_shard_skippedr#   )r=   r\   r`   r$   r?   p   s*   	
r?   xqweightqweight_typer   c           	      C   s:  |t v r|jd dkrdnd}n|jd dkrdnd}| jd dkr3tj| jd |jd | j| jdS |tv r<| |j S | jd |krT|tv rTt	
|| ||jd }|S |tv ret	|| ||jd }|S |tv rtj| \}}|jd |jd | | f}t	j||g|| jR  }| |j }|S t|}td	| )
Nr   i               rO   device   $Unsupported GGUF quantization type: )IMATRIX_QUANT_TYPESshaper/   emptyrO   rp   UNQUANTIZED_TYPESTMMVQ_QUANT_TYPESopsggml_mul_mat_vec_a8MMQ_QUANT_TYPESggml_mul_mat_a8DEQUANT_TYPESr)   GGML_QUANT_SIZESggml_dequantize
WeightTypeNotImplementedError)	rh   ri   rj   	mmvq_safey
block_size	type_sizert   weightr#   r#   r$   _fused_mul_mat_gguf   s*   "

r   c                 C   s"   t j| jd |jd | j| jdS Nr   ro   )r/   ru   rt   rO   rp   )rh   ri   rj   r#   r#   r$   _fused_mul_mat_gguf_fake   s   "r   )op_nameop_func	fake_implw1w2topk_weightstopk_idsqweight_type2
activationc                    sV  dt jf fdd}ddlm}	 t | }
|tv r|tv r| jd dkr| j\}}|j\}}}|jd }t|}|	|||\}}}t	| ||||||||	}||}t	|||||||jd d|| 	}|
|||jd |||d}t||
 |
S |tv r|tv r| j\}}|j\}}}|jd }t| ||||||}||}t|||d||jd || }|
|||jd |||d}t||
 |
S td tt||D ]L\}\}}| | 
d	| jdd   }d }t||D ],\}}|| }t|||}||}|| }t||||}|d u r|}q|| q||
|< q|
S )
Nrh   c                    s|   | j d d }| j d d |f }tj|| j| jd} dkr)tjj||  |S  dkr7tjj||  |S t	d  )NrX   rm   ro   silugeluzUnsupported activation: )
rt   r/   ru   rO   rp   ry   _Csilu_and_mulgelu_and_mulrd   )rh   doutput_shapeoutr   r#   r$   act   s   z_fused_moe_gguf.<locals>.actr   )moe_align_block_size@   rq   znThere is no support for fast MoE kernel for current quantization method. Falling back to slow implementation. )rq   )r/   Tensor.vllm.model_executor.layers.fused_moe.fused_moer   
empty_liker{   rt   ry   ggml_moe_get_block_sizeggml_moe_a8reshapemul_viewmoe_sumrx   ggml_moe_a8_vecr-   r.   	enumeratezipfused_mul_mat_ggufadd_)rh   r   r   r   r   rj   r   r   r   r   out_hidden_states
num_tokens_ENtop_k
BLOCK_SIZEsorted_token_ids
expert_idsnum_tokens_post_paddedr   tokwidxinpcurrent_hidden_statewwii	expert_upexpert_downcurrent_stater#   r   r$   _fused_moe_gguf   s   





(



r   c                 C   s
   t | S r   )r/   r   )rh   r   r   r   r   rj   r   r   r#   r#   r$   _fused_moe_gguf_fake\  s   

r   hidden_sizerO   c           
      C   s   |t v r
t|| S |tv rEtj| \}}|  }||jd | | ks&J tj|d|d}t	
||||jd |}	|	jg | j|R  S t|}td| )Nrq   r   )dimindexrr   )rv   r/   	embeddingr}   r)   r~   flattenrt   index_selectry   r   r   r   r   )
rh   ri   rj   r   rO   r   r   x_flatquantdequantr#   r#   r$   _apply_gguf_embeddingu  s   r   c                 C   s   t j| jd ||| jdS r   )r/   ru   rt   rp   )rh   ri   rj   r   rO   r#   r#   r$   _apply_gguf_embedding_fake  s   r   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZdejjfddZdejjfddZ	ddejjdejdejdB dejfddZdS )rA   z[Linear method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    quant_configc                 C   s
   || _ d S r   )r   )r    r   r#   r#   r$   r     s   
zGGUFLinearMethod.__init__r<   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec              
   K   s   || _ t|}||f}	tdd}
t|
dd|	dg g i d t|
| |d|
 ttjt|tj	ddd}t|ddi dd	 t|| |d
| d S )NFrequires_gradrq   r   T)	input_dim
output_dimtensor_shapeis_gguf_weightdata_containershard_idshard_id_mapri   rO   )is_gguf_weight_typeweight_typeshard_weight_typeignore_warningrj   )
r   sumGGUFUninitializedParameterr   register_parameterr   r/   ru   lenuint8)r    r<   r   r   r   r   r   extra_weight_attrsoutput_size_per_partitionr   ri   rj   r#   r#   r$   create_weights  s>   



	zGGUFLinearMethod.create_weightsc                 C   sD   |j j}|tv s|tv st|}td| d| d| | d S )Nz#Unsupported GGUF quantization type z
 in layer rW   )rj   r   rv   r}   r   rd   _create_padded_weight_param)r    r<   rj   r#   r#   r$   process_weights_after_loading  s   z.GGUFLinearMethod.process_weights_after_loadingc                 C   sf  |j }|j}|j}t|j }dkrdd |D }t|dks(J td| tt|}tdd |D }t	dd |D }t
j||f||jd}	ttttttf f  }
|D ]6}|| }t	d	d |d
| D }||| d }|| d}|| |	||d
|f< |||f|
|< qY|j  t|	dd}t|t| t|d|
i |d| d
S d
S )z;Create padded weight parameter for GGUF MergedLinear layer.rq   c                 S   s   h | ]}|j qS r#   r   )rZ   datar#   r#   r$   	<setcomp>  s    z?GGUFLinearMethod._create_padded_weight_param.<locals>.<setcomp>z!Data container has mixed dtypes: c                 s       | ]}| d V  qdS )rq   NsizerZ   rh   r#   r#   r$   ra         z?GGUFLinearMethod._create_padded_weight_param.<locals>.<genexpr>c                 s   r   r   Nr   r   r#   r#   r$   ra     r   ro   c                 s   r   r   r   r   r#   r#   r$   ra     r   Nr   Fr   shard_offset_mapri   )ri   r   r   r   r   rd   nextitermaxr   r/   zerosrp   rR   rN   tuplerQ   r   clearr   r   varsr   )r    r<   ri   r   r   r   rO   padded_sideconcat_sidepadded_datar   r   id_in_containerstartendr   padded_paramr#   r#   r$   r     s8   
z,GGUFLinearMethod._create_padded_weight_paramNrh   biasr   c              	   C   s   |j j}|rDd|v rg dn|}|j }g }|D ]$}|j j| \}}	}
|jj| }|t||||	d |
f  | qtj	|dd}n|j }|jj
}t|||}|d urZ|| |S )Nq)r  kvrq   )axis)ri   r   r   rj   r   appendr   
contiguousr/   catr   r   )r    r<   rh   r  r   ri   resultr   r   r  offsetrj   r   r#   r#   r$   apply  s(   
zGGUFLinearMethod.applyr   )rI   rJ   rK   rL   r   r   r/   rS   rT   rQ   rM   rO   r   r   r   r   r  r#   r#   r#   r$   rA     s8    
.&rA   c                       s   e Zd ZdZdedef fddZdejj	de
de
d	e
d
ejf
ddZdejj	dedB fddZdedejdejdejdejeejejf B f
ddZ  ZS )rC   zXMoE method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    r   moec                    s   t  | || _d S r   )r   r   r   )r    r   r  r!   r#   r$   r   -  s   
zGGUFMoEMethod.__init__r<   num_expertsr   intermediate_size_per_partitionr   c                 K   s  |d| |f}t dd}t|dd|dg d t|| |d| ttjdtjd	dd}	t|	dddd
 t|	| |d|	 |||f}t dd}
t|
dd|dg d t|
| |d|
 ttjdtjd	dd}t|dddd
 t|| |d| d S )Nrm   Fr   rq   r   T)r   r   r   r   r   w13_qweightr   )r   r   r   w13_qweight_type
w2_qweightw2_qweight_type)r   r   r   r   r/   ru   r   )r    r<   r  r   r  r   r   r   r  r  r  r  r#   r#   r$   r   5  sX   	










zGGUFMoEMethod.create_weightsr   Nc                 C   s   d S r   r#   )r    r<   r#   r#   r$   get_fused_moe_quant_configs  s   z(GGUFMoEMethod.get_fused_moe_quant_configrh   r   r   c              	   C   sD   |j dks	J d|jrtdt||j|j|||jj|jj|j S )Nr   z"Only SiLU activation is supported.zGApply router weight on input is not supported forfused GGUF MoE method.)	r   apply_router_weight_on_inputr   fused_moe_ggufr  r  r  r   r  )r    r<   rh   r   r   r#   r#   r$   r  x  s   zGGUFMoEMethod.apply)rI   rJ   rK   rL   r   r
   r   r/   rS   rT   rQ   rO   r   r   r  r   r   r   r  rU   r#   r#   r!   r$   rC   &  sD    
>
rC   c                   @   s.   e Zd ZdZdejjdejdejfddZdS )rB   z^Embedding method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    r<   rh   r   c                 C   s,   |j }|jj}|jd }t||||| jdS )Nrq   r   )ri   rj   r   r   apply_gguf_embeddingr   )r    r<   rh   ri   rj   r   r#   r#   r$   r     s   
zGGUFEmbeddingMethod.embeddingN)	rI   rJ   rK   rL   r/   rS   rT   r   r   r#   r#   r#   r$   rB     s    "rB   c                   @   s    e Zd ZU eZeej ed< dS )r   r   N)	rI   rJ   rK   r   cls_to_becomerM   r/   r   __annotations__r#   r#   r#   r$   r     s   
 r   r   )bcollections.abcr   typesr   typingr   r)   r/   r   r   torch.nn.parameterr   r   vllmr   ry   vllm.loggerr	   +vllm.model_executor.layers.fused_moe.configr
   r   *vllm.model_executor.layers.fused_moe.layerr   r   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   3vllm.model_executor.layers.vocab_parallel_embeddingr   r    vllm.model_executor.models.utilsr   vllm.model_executor.utilsr   vllm.platformsr   vllm.utils.torch_utilsr   rI   r-   r   rN   rM   r?   F32F16BF16rv   Q4_0Q4_1Q5_0Q5_1Q8_0Q8_1STANDARD_QUANT_TYPESQ2_KQ3_KQ4_KQ5_KQ6_KKQUANT_TYPESIQ1_MIQ1_SIQ2_XXSIQ2_XSIQ2_SIQ3_XXSIQ3_SIQ4_XSIQ4_NLrs   r}   rx   r{   r   rQ   r   r   r   AttributeErrorerrorr   r   r  rO   r   r   r  rA   rC   rB   r   r#   r#   r#   r$   <module>   sd  F
%	
#
	
m	
	


 l