from typing import TYPE_CHECKING, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["fbgemm-gpu", "accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config
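
    # `self.pre_quantized` is set by the `HfQuantizer` base class and distinguishes the
    # two supported loading modes: reading an already-quantized checkpoint (weights
    # arrive as float8 tensors plus per-row scales) versus quantizing a higher-precision
    # checkpoint on the fly, which is what `create_quantized_param` below implements.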
zFbgemmFp8HfQuantizer.__init__c                 O   s   t  stdt stdtdstdtj stdtj }|\}}|dk r0t	d|
d}|d u r@td	 d S |d ur\| js^t|tr`d
| v sXd| v rbt	dd S d S d S d S )NzUsing fbgemm fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zUsing fbgemm fp8 quantization requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librariesz0.32.2z`Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)z=Using FP8 quantized models with fbgemm kernels requires a GPU	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r
   ImportErrorr	   r   torchcudais_availableRuntimeErrorget_device_capability
ValueErrorgetloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr   r   r   r   validate_environment/   sJ   


z)FbgemmFp8HfQuantizer.validate_environmentdtypetorch.dtypereturnc                 C   s4   |d u rt j}td| |S |t jkrtd|S )NzOverriding dtype=%s with `dtype=torch.bloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp8. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.zYYou cannot use FP8 with dtype=torch.float16.We recommend you passing dtype=torch.bfloat16)r   bfloat16r%   infofloat16r#   )r   r0   r   r   r   update_dtype\   s   
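
    # Illustration of the dtype resolution above (assuming torch is available):
    #
    #     quantizer.update_dtype(None)            # -> torch.bfloat16, with an info log
    #     quantizer.update_dtype(torch.bfloat16)  # -> torch.bfloat16, unchanged
    #     quantizer.update_dtype(torch.float32)   # -> torch.float32, unchanged
    #     quantizer.update_dtype(torch.float16)   # raises ValueError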

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, FbgemmFp8Linear):
            # Biases stay in high precision, and pre-quantized weights need no further work
            if self.pre_quantized or tensor_name == "bias":
                return False
            return True
        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if self.pre_quantized or tensor_name == "bias":
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        **kwargs,
    ):
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        # Consistency checks: a pre-quantized checkpoint must supply fp8 weights, while
        # on-the-fly quantization must not receive pre-computed scales.
        if isinstance(module, FbgemmFp8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
            elif tensor_name == "weight_scale":
                raise ValueError("Expect unquantized weights but got a quantized weight_scale")
        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if not self.pre_quantized and tensor_name in ("gate_up_proj_scale", "down_proj_scale"):
                raise ValueError("Expect unquantized weights but got a quantized weight_scale")

        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if tensor_name == "gate_up_proj":
                # Process each expert separately: transpose so the reduction dimension
                # comes last, flatten to 2D, quantize per row, then restore the layout.
                transposed_param = param_value.transpose(1, 2)
                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])
                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)
                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], 1, original_shape[1])
            elif tensor_name == "down_proj":
                transposed_param = param_value.transpose(1, 2)
                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])
                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)
                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], original_shape[1], 1)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(weight_scale.to(target_device))
        else:
            new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(
                weight_scale.view(weight_scale.shape[0], 1).to(target_device)
            )

        module._parameters[tensor_name] = torch.nn.Parameter(new_value.to(target_device))

        del param_value

    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_fbgemm_fp8_linear

        tp_plan = model._tp_plan
        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        config = model.config
        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
            config=config,
            tp_plan=tp_plan,
        )

        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]
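
    # The filtering above keeps checkpoint loading quiet about parameters that only
    # exist because modules were swapped for their quantized counterparts: scale
    # tensors such as `weight_scale` are dropped from the missing-key report, while
    # genuinely absent `.weight` / `.bias` entries are still surfaced to the user.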

    def update_tp_plan(self, config):
        if "Llama4" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.input_layernorm.weight": "sequence_parallel",
                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
                "norm.weight": "sequence_parallel",
                "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts": "local",
                "layers.*.feed_forward": "gather",
                "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
                "layers.*.feed_forward.experts.down_proj": "local_rowwise",
            }
            if config.get_text_config() is not None:
                config.get_text_config().base_model_tp_plan = text_plan
            else:
                config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False
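

# Minimal usage sketch (assumes a Hopper-class GPU and an installed fbgemm-gpu;
# the checkpoint name is a placeholder):
#
#     from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#     model = AutoModelForCausalLM.from_pretrained(
#         "some-org/some-model",                  # placeholder checkpoint
#         quantization_config=FbgemmFp8Config(),  # selects FbgemmFp8HfQuantizer
#         device_map="cuda",
#     )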