o
    پi!                     @  s   d dl mZ d dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlmZ erBd dlmZ d dlmZmZ d dlmZ G d	d
 d
eZG dd deZG dd deZG dd deZdddZdS )    )annotationsN)ABCabstractmethod)TYPE_CHECKINGAnyDictListOptionalType)nn)MoeRunnerConfig)CombineInputDispatchOutput)WeightsMapperc                   @  s2   e Zd ZdZdddZeddd	ZdddZdS )QuantizeMethodBasez+Base class for different quantized methods.layertorch.nn.Modulec                 O     t  )zXCreate weights for a layer.

        The weights will be set as attributes of the layer.NotImplementedError)selfr   weight_argsextra_weight_attrs r   ^/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/base_config.pycreate_weights      z!QuantizeMethodBase.create_weightsreturntorch.Tensorc                 O  r   )zxApply the weights in layer to the input tensor.

        Expects create_weights to have been called before on the layer.r   )r   r   argskwargsr   r   r   apply   s   zQuantizeMethodBase.apply	nn.ModuleNonec                 C     dS )zwProcess the weight after loading.

        This can be used for example, to transpose weights for computation.
        Nr   )r   r   r   r   r   process_weights_after_loading#   s   z0QuantizeMethodBase.process_weights_after_loadingN)r   r   )r   r   r   r   )r   r"   r   r#   )__name__
__module____qualname____doc__r   r   r!   r%   r   r   r   r   r      s    
r   c                   @  s,   e Zd ZdZdddZe	ddddZdS )LinearMethodBasez:Base class for different (maybe quantized) linear methods.r   r   input_size_per_partitionintoutput_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  r   )a  Create weights for a linear layer.
           The weights will be set as attributes of the layer.

        Args:
            layer: The layer that is using the LinearMethodBase factory.
            input_size_per_partition: Size of the weight input dim on rank X.
            output_partition_sizes: Sizes of the output dim of each logical
                weight on rank X. E.g., output_partition_sizes for QKVLinear
                is a list contains the width of Wq, Wk, Wv on rank X.
            input_size: Size of the input dim of the weight across all ranks.
            output_size: Size of the output dim of the weight across all ranks.
            params_dtype: Datatype of the parameters.
        r   )r   r   r+   r-   r/   r0   r1   r   r   r   r   r   .   s   zLinearMethodBase.create_weightsNxr   biasOptional[torch.Tensor]r   c                 C  r   )zwApply the weights in layer to the input tensor.
        Expects create_weights to have been called before on the layer.r   )r   r   r3   r4   r   r   r   r!   G      	zLinearMethodBase.apply)r   r   r+   r,   r-   r.   r/   r,   r0   r,   r1   r2   N)r   r   r3   r   r4   r5   r   r   )r&   r'   r(   r)   r   r   r!   r   r   r   r   r*   +   s    
r*   c                   @  s.   e Zd Zdd	d
ZdddZedddZdS )FusedMoEMethodBaser   r   num_expertsr,   hidden_sizeintermediate_size_per_partitionr1   r2   c                 K     t r7   r   )r   r   r9   r:   r;   r1   r   r   r   r   r   U      	z!FusedMoEMethodBase.create_weightsmoe_runner_configr   c                 C  r<   r7   r   )r   r   r>   r   r   r   create_moe_runner`   s   z$FusedMoEMethodBase.create_moe_runnerdispatch_outputr   r   r   c                 C  r<   r7   r   )r   r   r@   r   r   r   r!   e   s   zFusedMoEMethodBase.applyN)
r   r   r9   r,   r:   r,   r;   r,   r1   r2   )r   r   r>   r   )r   r   r@   r   r   r   )r&   r'   r(   r   r?   r   r!   r   r   r   r   r8   S   s
    

r8   c                      s   e Zd ZdZ fddZed.ddZed/d	d
Zeed0ddZ	e
ed1ddZeed2ddZed3ddZed3ddZe
d4ddZe
d5d d!Zed6d&d'Zed1d(d)Zd7d,d-Z  ZS )8QuantizationConfigz$Base class for quantization configs.c                   s   t    t | _d S r7   )super__init__dictpacked_modules_mappingr   	__class__r   r   rC   q   s   
zQuantizationConfig.__init__r   strc                 C  r   )z Name of the quantization method.r   rF   r   r   r   get_namev      zQuantizationConfig.get_nameList[torch.dtype]c                 C  r   )z$List of supported activation dtypes.r   rF   r   r   r   get_supported_act_dtypes{   rK   z+QuantizationConfig.get_supported_act_dtypesr,   c                 C  r   )zMinimum GPU capability to support the quantization method.

        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
        This requirement is due to the custom CUDA kernels used by the
        quantization method.
        r   )clsr   r   r   get_min_capability   r6   z%QuantizationConfig.get_min_capability	List[str]c                   C  r   )z7List of filenames to search for in the model directory.r   r   r   r   r   get_config_filenames      z'QuantizationConfig.get_config_filenamesconfigDict[str, Any]'QuantizationConfig'c                 C  r   )z;Create a config class from the model's quantization config.r   )rN   rS   r   r   r   from_config   rR   zQuantizationConfig.from_configOptional[str]c                 C  r$   )z
        Detects if this quantization method can support a given checkpoint
        format by overriding the user specified quantization method --
        this method should only be overwritten by subclasses in exceptional
        circumstances
        Nr   )rN   hf_quant_cfg
user_quantr   r   r   override_quantization_method   s   z/QuantizationConfig.override_quantization_methodc                 C  sp   |du rdS | dd }|dkr"d|v rdS d|v s d|v r"d	S | d
ddkr,dS | d
dd	kr6d	S dS )z3Shared ModelOpt quantization method override logic.N
quant_algo modeloptFP8modelopt_fp8NVFP4FP4modelopt_fp4quant_method)getupper)rN   hf_quant_configrY   r[   r   r   r   &_modelopt_override_quantization_method   s   z9QuantizationConfig._modelopt_override_quantization_methodkeysr   c                 C  s.   |D ]}|| v r| |   S qt d| d)z1Get a value from the model's quantization config.zCannot find any of z$ in the model's quantization config.)
ValueError)rS   rh   keyr   r   r   get_from_keys   s   
z QuantizationConfig.get_from_keysdefaultc                 C  s&   zt | |W S  ty   | Y S w )z:Get a optional value from the model's quantization config.)rA   rk   ri   )rS   rh   rl   r   r   r   get_from_keys_or   s
   z#QuantizationConfig.get_from_keys_orr   r   prefixOptional[QuantizeMethodBase]c                 C  r   )a7  Get the quantize method to use for the quantized layer.

        Args:
            layer: The layer for the quant method.
            prefix: The full name of the layer in the state dict
        Returns:
            The quantize method. None if the given layer doesn't support quant
            method.
        r   )r   r   rn   r   r   r   get_quant_method   s   z#QuantizationConfig.get_quant_methodc                 C  r   )zvReturns the activation function names that should be post-scaled.

        For now, this is only used by AWQ.
        r   rF   r   r   r   get_scaled_act_names   r   z'QuantizationConfig.get_scaled_act_nameshf_to_sglang_mapper'WeightsMapper'c                 C  r$   )a%  
        Interface for models to update module names referenced in
        quantization configs in order to reflect the sglang model structure
        :param hf_to_sglang_mapper: maps from hf model structure (the assumed
            structure of the qconfig) to sglang model structure
        Nr   )r   rr   r   r   r   apply_weight_name_mapper   r=   z+QuantizationConfig.apply_weight_name_mapper)r   rI   )r   rL   )r   r,   )r   rP   )rS   rT   r   rU   )r   rW   )rS   rT   rh   rP   r   r   )rS   rT   rh   rP   rl   r   r   r   )r   r   rn   rI   r   ro   )rr   rs   )r&   r'   r(   r)   rC   r   rJ   rM   classmethodrO   staticmethodrQ   rV   rZ   rg   rk   rm   rp   rq   rt   __classcell__r   r   rG   r   rA   n   s:    			rA   method_classType[QuantizeMethodBase]r   boolc                 C  s,   t tdd}t | dd}|duo||uS )z
    Not all quant methods have embedding implemented, so we need to check that
    it exists for our given method. We check this by making sure the function
    has been changed from the base implementation.
    	embeddingN)inspectgetattr_staticr   )rx   base_embeddingclass_embeddingr   r   r    method_has_implemented_embedding   s   r   )rx   ry   r   rz   )
__future__r   r|   abcr   r   typingr   r   r   r   r	   r
   torchr    sglang.srt.layers.moe.moe_runnerr   &sglang.srt.layers.moe.token_dispatcherr   r   sglang.srt.models.utilsr   r   r*   r8   rA   r   r   r   r   r   <module>   s     ( 