o
    ei                     @   sx   d dl mZ ddlmZmZmZmZ ddlmZ ddl	m
Z
 e r%d dlZer-ddlmZ eeZG d	d
 d
eZdS )    )TYPE_CHECKING   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                       s   e Zd ZdZdZ fddZdd Zddd	ed
efddZ	ddd	eddd
e
f fddZ		dddZdd Zdd Zed
efddZdd Zdd Z  ZS )FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    Fc                    s   t  j|fi | d S )N)super__init__)selfquantization_configkwargs	__class__ o/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr      s   z"FineGrainedFP8HfQuantizer.__init__c                 O   s
  t  std| jjrd S tj s't s'| jr#t	
d d| j_d S tdtj rStj }|\}}|dk sA|dkrS|dk rSt	
d| d| d	 d| j_d S |d
}|d u rct	
d d S t|tr| jswt|dkrwd| v s}d| v rtdd S d S )NzMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)zUsing FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is availableTzANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.z`. We will default to dequantizing the model to bf16. Feel free to use a different quantization method like bitsandbytes or torchao
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. r   cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   
dequantizetorchcudais_availabler   pre_quantizedloggerwarning_onceRuntimeErrorget_device_capabilityget
isinstancedictlenvalues
ValueError)r   argsr   compute_capabilitymajorminorr   r   r   r   validate_environment   sR   



z.FineGrainedFP8HfQuantizer.validate_environmentmodelr   
param_namereturnc                 K   sF   ddl m}m} t||\}}t|||fr!| js|dkrdS dS dS )Nr   )	FP8Expert	FP8LinearbiasFT)integrations.finegrained_fp8r4   r5   r
   r'   r!   )r   r1   r2   r   r4   r5   moduletensor_namer   r   r   param_needs_quantizationM   s   z2FineGrainedFP8HfQuantizer.param_needs_quantizationparamztorch.Tensorc                    s    |  ||rdS t |||S )z4Return the element size (in bytes) for `param_name`.r   )r:   r   param_element_size)r   r1   r2   r;   r   r   r   r<   X   s   z,FineGrainedFP8HfQuantizer.param_element_sizec                 K   s<   ddl m} | || jj|j| _||| j| j| jd}d S )Nr   )replace_with_fp8_linear)modules_to_not_convertr   r!   )r7   r=   get_modules_to_not_convertr   r>   _keep_in_fp32_modulesr!   )r   r1   r   r=   r   r   r   $_process_model_before_weight_loading_   s   
z>FineGrainedFP8HfQuantizer._process_model_before_weight_loadingc                 C   s8   d|j jv rddddddddddddddd}||_|S )NQwen3colwiserowwise)z layers.*.self_attn.q_proj.weightz*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightz*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_inv)r   __name__base_model_tp_plan)r   config	text_planr   r   r   update_tp_planq   s$   z(FineGrainedFP8HfQuantizer.update_tp_planc                 C      dS )NTr   r   r   r   r   is_serializable   s   z)FineGrainedFP8HfQuantizer.is_serializablec                 C   rJ   )NFr   rK   r   r   r   is_trainable   s   z&FineGrainedFP8HfQuantizer.is_trainablec                 C   s   ddl m} || S )Nr   )Fp8Quantize)r7   rN   )r   rN   r   r   r   get_quantize_ops   s   z*FineGrainedFP8HfQuantizer.get_quantize_opsc                 C   sD   ddl m} ddlm} | jr | jjr |g dd|| gdgS g S )Nr   )WeightConverter)Fp8Dequantize)zweight$weight_scale_invactivation_scaleweight)source_patternstarget_patterns
operations)core_model_loadingrP   r7   rQ   r!   r   r   )r   rP   rQ   r   r   r   get_weight_conversions   s   	z0FineGrainedFP8HfQuantizer.get_weight_conversions)r1   r   )rE   
__module____qualname____doc__requires_calibrationr   r0   strboolr:   floatr<   rA   rI   rL   propertyrM   rO   rY   __classcell__r   r   r   r   r      s     1
r   )typingr   utilsr   r   r   r   baser	   quantizers_utilsr
   r   modeling_utilsr   
get_loggerrE   r"   r   r   r   r   r   <module>   s    
