o
    wi5                     @   s   d dl mZmZmZ ddlmZ erddlmZ ddlm	Z	m
Z
mZmZ ddlmZ e r1d dlZeeZG d	d
 d
eZdS )    )TYPE_CHECKINGAnyOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availablelogging)get_module_from_nameNc                       s  e Zd ZdZdZdZddgZ fddZdd	 Zd,ddZ	ddddde
dee
ef fddZ	d-ddddde
dddee
ef deee
  fddZd.ddZ	d-dddeee
  fd d!Zd"ee
 d#e
dee
 fd$d%Zd&d' Zd-d(d)Zedefd*d+Z  ZS )/FbgemmFp8HfQuantizerz/
    FP8 quantization using fbgemm kernels
    TFz
fbgemm-gpu
acceleratec                    s   t  j|fi | || _d S N)super__init__quantization_config)selfr   kwargs	__class__ i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   +   s   
zFbgemmFp8HfQuantizer.__init__c                 O   s   t  stdt stdtdstdtj stdtj }|\}}|dk r0t	d|
dd }|d u rAtd	 d S |d ur]| js_t|trad
| v sYd| v rct	dd S d S d S d S )NzUsing fbgemm fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zUsing fbgemm fp8 quantization requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librariesz0.32.2z`Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)z=Using FP8 quantized models with fbgemm kernels requires a GPU	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   ImportErrorr
   r	   torchcudais_availableRuntimeErrorget_device_capability
ValueErrorgetloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr   r   r   r   validate_environment/   sJ   

z)FbgemmFp8HfQuantizer.validate_environmenttorch_dtypetorch.dtypereturnc                 C   s4   |d u rt j}td| |S |t jkrtd|S )Na  Overriding torch_dtype=%s with `torch_dtype=torch.bloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp8. Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass torch_dtype=torch.bfloat16 to remove this warning.zeYou cannot use FP8 with torch_dtype=torch.float16.We recommend you passing torch_dtype=torch.bfloat16)r   bfloat16r&   infofloat16r$   )r   r1   r   r   r   update_torch_dtype\   s   
z'FbgemmFp8HfQuantizer.update_torch_dtypemodelr   param_valueztorch.Tensor
param_name
state_dictc           
      K   s   ddl m}m} t||\}}	t||r5| js|	dkr+|	dkr)|jtjkr)t	ddS |	dkr3t	dd	S t||rQ| jsA|	dkrCdS |	d
ksK|	dkrOt	dd	S dS )Nr   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsbiasweightz6Expect quantized weights but got an unquantized weightFweight_scalez;Expect unquantized weights but got a quantized weight_scaleTgate_up_proj_scaledown_proj_scale)
integrationsr=   r>   r   r)   r(   dtyper   float8_e4m3fnr$   )
r   r8   r9   r:   r;   r   r=   r>   moduletensor_namer   r   r   check_quantized_paraml   s"   

z*FbgemmFp8HfQuantizer.check_quantized_paramNtarget_deviceztorch.deviceunexpected_keysc                 C   s  ddl m} t||\}}	t||r|	dkrG|dd}
|
j}|
d|d }tjj	
|\}}||}|dd}||d d|d }n4|	dkr{|dd}
|
j}|
d|d }tjj	
|\}}||}|dd}||d |d d}tj|||j|	 d< ntjj	
|\}}tj||jd d||j|	 d< tj|||j|	< |d	ur||v r|| ~d	S )
z@
        Quantizes weights into weight and weight_scale
        r   )r>   gate_up_projr   r   	down_proj_scaleN)rD   r>   r   r)   	transposeshapereshaper   opsfbgemmquantize_fp8_per_rownn	Parameterto_parametersviewremove)r   r8   r9   r:   rJ   r;   rK   r>   rG   rH   transposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valuerA   r   r   r   create_quantized_param   s8   


 
z+FbgemmFp8HfQuantizer.create_quantized_paramc                 K   s   |S r   r   )r   r8   r   r   r   r   #_process_model_after_weight_loading      z8FbgemmFp8HfQuantizer._process_model_after_weight_loadingkeep_in_fp32_modulesc                 K   sT   ddl m} |j}| || jj|| _|j}||| j| j| j||d}| j|j_d S )Nr   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r(   configtp_plan)rD   rf   _tp_planget_modules_to_not_convertr   rg   rh   r(   )r   r8   re   r   rf   ri   rh   r   r   r   $_process_model_before_weight_loading   s   
	z9FbgemmFp8HfQuantizer._process_model_before_weight_loadingmissing_keysprefixc           	         s   ddl m}m} g  | D ]/\}}t||st||r=|D ]}||v s-|| d| v r<|ds<|ds< | qq fdd|D S )Nr   r<   .z.weightz.biasc                    s   g | ]}| vr|qS r   r   ).0knot_missing_keysr   r   
<listcomp>   s    z<FbgemmFp8HfQuantizer.update_missing_keys.<locals>.<listcomp>)rD   r=   r>   named_modulesr)   endswithappend)	r   r8   rm   rn   r=   r>   namerG   missingr   rr   r   update_missing_keys   s   
z(FbgemmFp8HfQuantizer.update_missing_keysc                 C   s   d|j jv rYi ddddddddddddd	d
dddddddddddddddddd
dddddddd
dddd	}| d urT|| _|S ||_|S |S )NLlama4z layers.*.self_attn.q_proj.weightlocal_colwisez&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightlocal_rowwisezlayers.*.self_attngatherzlayers.*.input_layernorm.weightsequence_parallelz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightzlayers.*.feed_forward.expertslocallocal_packed_rowwise)	zlayers.*.feed_forwardz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalez.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__get_text_configbase_model_tp_plan)r   rh   	text_planr   r   r   update_tp_plan   sh   	
%
z#FbgemmFp8HfQuantizer.update_tp_planc                 C      dS )NTr   )r   safe_serializationr   r   r   is_serializable#  rd   z$FbgemmFp8HfQuantizer.is_serializablec                 C   r   )NFr   )r   r   r   r   is_trainable&  s   z!FbgemmFp8HfQuantizer.is_trainable)r1   r2   r3   r2   r   )r8   r   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r0   r7   strr*   r   rI   r   listrb   rc   rl   rz   r   r   propertyboolr   __classcell__r   r   r   r   r   !   sV    
-

%



?


/r   )typingr   r   r   baser   modeling_utilsr   utilsr	   r
   r   r   quantizers_utilsr   r   
get_loggerr   r&   r   r   r   r   r   <module>   s   
