o
    ei%                     @   s   d dl mZ ddlmZ erddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZ e
 r3d dlZeeZG d	d
 d
eZdS )    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_fbgemm_gpu_availableis_kernels_availableis_torch_availableis_torch_cuda_availableis_torch_xpu_availablelogging)get_module_from_nameNc                       s   e Zd ZdZdZ fddZdd Zd!d
dZddded	e	fddZ
dddeddd	ef fddZ		d"ddZdd Zdd Zdd Zed	e	fddZdd  Z  ZS )#FbgemmFp8HfQuantizerz/
    FP8 quantization using fbgemm kernels
    Fc                    s   t  j|fi | d S )N)super__init__)selfquantization_configkwargs	__class__ j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   /   s   zFbgemmFp8HfQuantizer.__init__c                 O   s   t  s
t s
tdt rt stdt  rt stdt s%tdt  r9tj }|\}}|dk r9t	d|
d}|d u rItd d S t|tra| jscd	| v s]d
| v ret	dd S d S d S )Nz3Using fbgemm fp8 quantization requires a GPU or XPUz@Using FP8 fbgemm on XPU requires kernels (`pip install kernels`)zLoading an FP8 fbgemm quantized model on CUDA requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librarieszWLoading an FP8 quantized model requires accelerate (`pip install --upgrade accelerate`)	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu' or 'auto'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   r   ImportErrorr	   r   r   torchcudaget_device_capability
ValueErrorgetloggerwarning_once
isinstancedictpre_quantizedvalues)r   argsr   compute_capabilitymajor_r   r   r   r   validate_environment2   s>   


z)FbgemmFp8HfQuantizer.validate_environmentdtypetorch.dtypereturnc                 C   s&   |t jkrtd| d t j}|S )NzSetting dtype to zP, but only bfloat16 is supported right now. Overwriting torch_dtype to bfloat16.)r   bfloat16r#   r$   )r   r.   r   r   r   update_dtypeV   s   

z!FbgemmFp8HfQuantizer.update_dtypemodelr   
param_namec                 K   sb   ddl m}m} t||\}}t||r| js|dkrdS dS t||r/| js+|dkr-dS dS dS )Nr   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsbiasFT)integrationsr6   r7   r   r%   r'   )r   r3   r4   r   r6   r7   moduletensor_namer   r   r   param_needs_quantization^   s   

z-FbgemmFp8HfQuantizer.param_needs_quantizationparamztorch.Tensorc                    s    |  ||rdS t |||S )z4Return the element size (in bytes) for `param_name`.r   )r<   r   param_element_size)r   r3   r4   r=   r   r   r   r>   o   s   z'FbgemmFp8HfQuantizer.param_element_sizec                 K   s@   ddl m} | || jj|j| _||| j| j| j|jd}d S )Nr   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r'   tp_plan)r9   r?   get_modules_to_not_convertr   r@   _keep_in_fp32_modulesr'   _tp_plan)r   r3   r   r?   r   r   r   $_process_model_before_weight_loadingv   s   
z9FbgemmFp8HfQuantizer._process_model_before_weight_loadingc                 K   sJ   ddl m}m} | D ]}t|||fr"t|dr"|j| jj	 q|S )z
        Force update the input scale upper bound after weight loading and device dispatch are complete.
        This resolves issues where persistent buffers are zeroed out or overwritten during the loading process.
        r   r5   input_scale_ub)
integrations.fbgemm_fp8r6   r7   modulesr%   hasattrrF   fill_r   activation_scale_ub)r   r3   r   r6   r7   mr   r   r   #_process_model_after_weight_loading   s   
z8FbgemmFp8HfQuantizer._process_model_after_weight_loadingc                 C   s   d|j jv rVi ddddddddddddd	d
dddddddddddddddd
ddddddd
dddd}| d urQ|| _|S ||_|S |S )NLlama4z layers.*.self_attn.q_proj.weightcolwisez&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightrowwisezlayers.*.input_layernorm.weightsequence_parallelz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalepacked_rowwise)z.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__get_text_configbase_model_tp_plan)r   config	text_planr   r   r   update_tp_plan   sb   	
"
z#FbgemmFp8HfQuantizer.update_tp_planc                 C      dS )NTr   r   r   r   r   is_serializable   s   z$FbgemmFp8HfQuantizer.is_serializablec                 C   rY   )NFr   rZ   r   r   r   is_trainable   s   z!FbgemmFp8HfQuantizer.is_trainablec                 C   s   ddl m} || S )Nr   )FbgemmFp8Quantize)rG   r]   )r   r]   r   r   r   get_quantize_ops   s   z%FbgemmFp8HfQuantizer.get_quantize_ops)r.   r/   r0   r/   )r3   r   )rS   
__module____qualname____doc__requires_calibrationr   r-   r2   strboolr<   floatr>   rE   rM   rX   r[   propertyr\   r^   __classcell__r   r   r   r   r   (   s"    
$
,r   )typingr   baser   modeling_utilsr   utilsr   r   r	   r
   r   r   r   quantizers_utilsr   r   
get_loggerrS   r#   r   r   r   r   r   <module>   s   $	
