o
    پi6                     @  s6  d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZmZ d dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZ ertd dl m!Z! d dl"m#Z#m$Z$ d dlm%Z% d dlm&Z&m'Z' e(e)Z*dd Z+dd Z,G dd deZ-G dd deZ.G dd deZ/dS )    )annotationsN)MappingProxyType)	TYPE_CHECKINGAnyDictListMappingOptionalTupleUnioncast)_NPULinearMethodBase)FusedMoEMethodBaseQuantizationConfig)should_ignore_layer)ModelSlimW4A4Int4ModelSlimW4A8Int8MoEModelSlimW8A8Int8ModelSlimW8A8Int8MoE)UnquantizedLinearMethod)apply_module_patch)MoeRunnerConfig)CombineInputStandardDispatchOutput)QuantizeMethodBase)ModelSlimLinearSchemeModelSlimMoESchemec                   s   d fdd}|S )Nhidden_sizeintreturnNonec                   s4    | |fi | d| _ tjjt|dd| _d S )NTF)requires_grad)ignore_antitorchnn	Parameterzerosbias)selfr   
extra_argsfunc f/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/modelslim/modelslim.pyinit+   s   z&npu_wrapper_rmsnorm_init.<locals>.init)r   r   r   r    r,   )r+   r.   r,   r*   r-   npu_wrapper_rmsnorm_init*   s   r/   c                 C  s   	 	 d
ddd	}|S )Nxtorch.TensorresidualOptional[torch.Tensor]post_residual_additionr   6Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]c                 S  s   ddl m} | s| }|d ur0|d ur|| }|||| jj| j| j\}}||j	|fS t
jj|| jj| jd }|| j }||j	S )Nr   )add_rmsnorm_bias)$sgl_kernel_npu.norm.add_rmsnorm_biasr6   is_contiguous
contiguousweightdatar'   variance_epsilontodtyper#   opsnpunpu_rms_norm)r(   r0   r2   r4   r6   outresidual_outr,   r,   r-   _rmsnorm_forward_oot6   s"   
z9npu_wrapper_rmsnorm_forward.<locals>._rmsnorm_forward_oot)NN)r0   r1   r2   r3   r4   r3   r   r5   r,   )r+   rD   r,   r,   r-   npu_wrapper_rmsnorm_forward5   s   rE   c                      s   e Zd ZdZi fd1 fddZd2dd	Zed3ddZed4ddZed5ddZ	ed6ddZ
ed7ddZd8ddZd9d!d"Z	#d:d;d&d'Zd<d)d*Zei fd=d-d.Zd6d/d0Z  ZS )>ModelSlimConfigzT
    Config class for ModelSlim Quantization, a NPU-specific quantization type.
    quant_configDict[str, Any]c                   s   t    || _ttt |dg }|d ur|ng | _|di }|d ur(|ni | _| j	 D ]}d|v rDt
ddtg t
ddtg q0d S )Nignorepacked_modules_mappingz	norm.biasz#sglang.srt.layers.layernorm.RMSNorm__init__forward_npu)superrK   quant_descriptionr   r   strgetrI   rJ   keysr   r/   rE   )r(   rG   rI   rJ   name	__class__r,   r-   rK   X   s*   
zModelSlimConfig.__init__r   ModelSlimLinearMethodc                 C  s   t | S N)rU   r(   r,   r,   r-   get_linear_methodo   s   z!ModelSlimConfig.get_linear_methodList[torch.dtype]c                 C  s   t jt jt jgS rV   )r#   int8float16bfloat16clsr,   r,   r-   get_supported_act_dtypesr   s   z(ModelSlimConfig.get_supported_act_dtypesr   c                 C     dS )Nr   r,   r]   r,   r,   r-   get_min_capabilityv      z"ModelSlimConfig.get_min_capabilityrO   c                 C  r`   )N	modelslimr,   r]   r,   r,   r-   get_namez   rb   zModelSlimConfig.get_name	List[str]c                 C  s
   dg}|S )Nzquant_model_description.jsonr,   )r^   	filenamesr,   r,   r-   get_config_filenames~   s   z$ModelSlimConfig.get_config_filenamesconfigc                 C  s   | |S rV   r,   )r^   rh   r,   r,   r-   from_config   s   zModelSlimConfig.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c           
      C  s   ddl m} ddlm} t||rat|| j| jdrt S d}d|v r&d}nd|v r,d}| j	|i }|}|
dd	 }||v rJ|||| d }| ||rSt S | j||d
}	|	|_t| S t||rq| |||_t| S d S )Nr   )
LinearBase)FusedMoE)rI   fused_mappingmodelvision_modelvisual.)rj   
layer_name)sglang.srt.layers.linearrn   &sglang.srt.layers.moe.fused_moe_tritonro   
isinstancer   rI   rJ   r   rP   splitreplaceis_layer_skippedget_linear_schemeschemerU   get_moe_schemeModelSlimFusedMoEMethod)
r(   rj   rl   rn   ro   keypacked_modules_mapping_subsetprefix_in_quant_config	proj_namer~   r,   r,   r-   get_quant_method   s@   

z ModelSlimConfig.get_quant_methodrv   r   c                 C  sN   | j |d d}|dks|dkrt| j |dS |dkr#t| j |dS td)N.weight W8A8_DYNAMICW8A8)rG   rl   W4A4_DYNAMICz)No modelslim compatible scheme was found.)rN   rP   r   r   NotImplementedError)r(   rv   
quant_typer,   r,   r-   _get_scheme_from_parts   s   z&ModelSlimConfig._get_scheme_from_partsNOptional[str]Optional[ModelSlimLinearScheme]c                 C  s"   | j |d}td|jj| |S )z
        get_scheme method adjusted for modelslim, taken from
        python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
        )rv   zUsing scheme: %s for %s)r   loggerdebugrT   __name__)r(   rj   rv   r~   r,   r,   r-   r}      s
   z!ModelSlimConfig.get_linear_schemeOptional[ModelSlimMoEScheme]c                 C  s~   |d }| j |ddk}| j |ddk}|r!td t| S |r,td t| S td| j |  d|  d S )	Nz.0.down_proj.weightSTATICW4A8_DYNAMICr   zUsing ModelSlimW4A8Int8MoEzUsing ModelSlimW8A8Int8MoEz'Unsupported FusedMoe modelslim scheme: z in layer: )rN   rP   r   	info_oncer   r   warningstrip)r(   rj   rl   r   is_moe_w4a8_dynamicis_moe_w8a8_dynamicr,   r,   r-   r      s.   

zModelSlimConfig.get_moe_schemerp   Mapping[str, List[str]]c           
        s     dd t| ds2i }| j D ]\}}|dd}d|v r'|dd}|||< q|| _d| _|v rh fd	d
| D }d }|D ] }| j|d ddk}	|d u rZ|	}qF|	|krftd  dqFn| j d ddk}|d usyJ |S )Nrt   ru   _quant_description_normalizedzlanguage_model.r   rs   zmodel.Tc                   s   g | ]}  |qS r,   )r{   ).0shard_proj_namerl   r   r,   r-   
<listcomp>   s    
z4ModelSlimConfig.is_layer_skipped.<locals>.<listcomp>r   FLOATz$Detected some but not all shards of zF are quantized. All shards of fused layers to have the same precision.)rz   hasattrrN   itemsr{   r   rP   
ValueError)
r(   rl   rp   rN   prefix_valueshard_prefixes
is_skippedshard_prefixis_shard_skippedr,   r   r-   r|      s:   


z ModelSlimConfig.is_layer_skippedc                 C  s   g S rV   r,   rW   r,   r,   r-   get_scaled_act_names  s   z$ModelSlimConfig.get_scaled_act_names)rG   rH   )r   rU   )r   rY   )r   r   )r   rO   )r   re   )rh   rH   r   rF   )rj   rk   rl   rO   r   rm   )rv   rO   r   r   rV   )rj   rk   rv   r   r   r   )rj   rk   rl   rO   r   r   )rl   rO   rp   r   )r   
__module____qualname____doc__rK   rX   classmethodr_   ra   rd   rg   ri   r   r   r}   r   r   r|   r   __classcell__r,   r,   rS   r-   rF   S   s,    


(
 (rF   c                   @  s8   e Zd ZdddZdd	d
ZdddZ	dd ddZdS )!rU   quantization_configrF   c                 C  
   || _ d S rV   r   r(   r   r,   r,   r-   rK        
zModelSlimLinearMethod.__init__rj   rk   r   r    c                 C     |j | d S rV   r~   process_weights_after_loadingr(   rj   r,   r,   r-   r        z3ModelSlimLinearMethod.process_weights_after_loadinginput_size_per_partitionr   output_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec           	   	   K  s(   | d}|jj|||||||d dS )z
        Use the ModelSlimLinearScheme associated with the layer to create
        the necessary parameters for the layer. See LinearMethodBase for param
        details
        weight_loader)rj   r   r   r   r   r   r   N)rP   r~   create_weights)	r(   rj   r   r   r   r   r   extra_weight_attrsr   r,   r,   r-   r   !  s   

z$ModelSlimLinearMethod.create_weightsNr0   r1   r'   r3   c                 C  s&   |j }|du rtd|j|||dS )z
        Use the output of create_weights and the ModelSlimLinearScheme
        associated with the layer to apply the forward pass with the
        layer input.  See LinearMethodBase for param details

        N'A scheme must be defined for each layer)r'   r~   r   apply_weights)r(   rj   r0   r'   r~   r,   r,   r-   apply;  s   zModelSlimLinearMethod.applyr   rF   rj   rk   r   r    )rj   rk   r   r   r   r   r   r   r   r   r   r   rV   )rj   rk   r0   r1   r'   r3   )r   r   r   rK   r   r   r   r,   r,   r,   r-   rU     s    


rU   c                   @  sF   e Zd ZdddZd d	d
Zd!ddZd"ddZd#ddZdd ZdS )$r   r   rF   c                 C  r   rV   r   r   r,   r,   r-   rK   P  r   z ModelSlimFusedMoEMethod.__init__rj   rk   r   r    c                 C  r   rV   r   r   r,   r,   r-   r   S  r   z5ModelSlimFusedMoEMethod.process_weights_after_loadingnum_expertsr   r   intermediate_size_per_partitionr   r   c                 K  s"   |j jd|||||d| dS )z
        Use the ModelSlimMoEScheme associated with the layer to create
        the necessary parameters for the layer. See FusedMoEMethodBase for param
        details
        )rj   r   r   r   r   Nr,   )r~   r   )r(   rj   r   r   r   r   r   r,   r,   r-   r   V  s   
z&ModelSlimFusedMoEMethod.create_weightsmoe_runner_configr   c                 C  s   |j ||S rV   )r~   create_moe_runner)r(   rj   r   r,   r,   r-   r   m  s   z)ModelSlimFusedMoEMethod.create_moe_runnerdispatch_outputr   r   c                 C  s"   |j }|du rtd|||S )z
        Use the output of create_weights and the ModelSlimMoEScheme
        associated with the layer to apply the forward pass with the
        layer input.  See FusedMoEMethodBase for param details

        Nr   r   )r(   rj   r   r~   r,   r,   r-   r   r  s   zModelSlimFusedMoEMethod.applyc                 C  s   |j ||||||S rV   )r~   apply_without_routing_weights)r(   rj   hidden_stateshidden_states_scalegroup_list_type
group_listoutput_dtyper,   r,   r-   r     s   	z5ModelSlimFusedMoEMethod.apply_without_routing_weightsNr   r   )
rj   rk   r   r   r   r   r   r   r   r   )rj   rk   r   r   )rj   rk   r   r   r   r   )	r   r   r   rK   r   r   r   r   r   r,   r,   r,   r-   r   N  s    




r   )0
__future__r   loggingtypesr   typingr   r   r   r   r   r	   r
   r   r   r#   >sglang.srt.hardware_backend.npu.quantization.linear_method_npur   *sglang.srt.layers.quantization.base_configr   r   7sglang.srt.layers.quantization.compressed_tensors.utilsr   0sglang.srt.layers.quantization.modelslim.schemesr   r   r   r   &sglang.srt.layers.quantization.unquantr   sglang.srt.utilsr   sglang.srt.layers.moer   &sglang.srt.layers.moe.token_dispatcherr   r   r   r   r   	getLoggerr   r   r/   rE   rF   rU   r   r,   r,   r,   r-   <module>   s.    ,
 G5