o
    i	%                     @   s|   d dl mZmZ ddlmZmZmZmZ ddlm	Z	 ddl
mZ e r'd dlZer/ddlmZ eeZG d	d
 d
e	ZdS )    )TYPE_CHECKINGOptional   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                       s   e Zd ZdZdZdZdgZ fddZdd Zd+ddZ	ddddde
ddfddZddde
defddZ	d,dddeee
  fddZd-ddZdee
 d e
dee
 fd!d"Zd#d$ Zd,d%d&Zedefd'd(Zd)d* Z  ZS ).FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    TF
acceleratec                    s   t  j|fi | || _d S N)super__init__quantization_config)selfr   kwargs	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr      s   
z"FineGrainedFP8HfQuantizer.__init__c                 O   s  t  stdt std|dds|ddrtdtj s*t s*t	dtj rOtj
 }|\}}|dk sD|dkrO|d	k rOtd
| d| d|d}|d u r_td d S |d ur{| js}t|trd| v swd| v rtdd S d S d S d S )NzxUsing fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)from_tfF	from_flaxzConverting into FP8 weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.zANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.`
device_mapzYou have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set your model on a GPU or XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or 'xpu'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   get
ValueErrortorchcudais_availabler   RuntimeErrorget_device_capabilityloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr   r   r   r   validate_environment   sP   


z.FineGrainedFP8HfQuantizer.validate_environmentdtypetorch.dtypereturnc                 C   s   |d u rt d tj}|S )NzKSetting dtype to torch.float32 as no dtype was specified in from_pretrained)r*   infor%   float32)r   r5   r   r   r   update_dtypeN   s   
z&FineGrainedFP8HfQuantizer.update_dtypemodelr   param_valueztorch.Tensor
param_nametarget_deviceztorch.devicec              
   K   s  ddl m} ddlm} t||\}}	t||r6| js|	dkr.|	dkr-|jtj	kr-t
dn|	dkr6t
d||}ttj	j}
ttj	j}| jj\}}|jd	d  \}}|| d
ksd|| d
krut
d| d| d| d| d	|j}|d|| ||| |d
dddd}tjt|dd}|| }|j}|dd}tj|| |
|dtj	}|d
dddd}||}||  }|||| |||ddd
 d | d S )Nr   	FP8Linear)_load_parameter_into_modelbiasweightz6Expect quantized weights but got an unquantized weightweight_scale_invz;Expect unquantized weights but got a quantized weight_scaler   zMatrix dimensions (z, z$) must be divisible by block sizes ()r	         )rG   rE   )dim)minmaxr   z.weight_scale_inv)integrations.finegrained_fp8r@   modeling_utilsrA   r   r-   r,   r5   r%   float8_e4m3fnr$   tofinforK   rL   r   weight_block_sizeshapereshapepermuteamaxabs	unsqueezeclampsqueeze
reciprocalrsplit)r   r;   r<   r=   r>   r   r@   rA   moduletensor_namefp8_minfp8_maxblock_size_mblock_size_nrowscolsparam_value_orig_shapemax_absscalescale_orig_shapequantized_paramr   r   r   create_quantized_paramT   sF   


 z0FineGrainedFP8HfQuantizer.create_quantized_paramc                 K   s>   ddl m} t||\}}t||r| js|dkrdS dS dS )Nr   r?   rB   FT)rM   r@   r   r-   r,   )r   r;   r=   r   r@   r]   r^   r   r   r   param_needs_quantization   s   
z2FineGrainedFP8HfQuantizer.param_needs_quantizationNkeep_in_fp32_modulesc                 K   s@   ddl m} | || jj|| _||| j| jd}| j|j_d S )Nr   )replace_with_fp8_linear)modules_to_not_convertr   )rM   rm   get_modules_to_not_convertr   rn   config)r   r;   rl   r   rm   r   r   r   $_process_model_before_weight_loading   s   
z>FineGrainedFP8HfQuantizer._process_model_before_weight_loadingc                 K   s   |S r   r   )r   r;   r   r   r   r   #_process_model_after_weight_loading      z=FineGrainedFP8HfQuantizer._process_model_after_weight_loadingmissing_keysprefixc                    s   ddl m} g  | D ]*\}}t||r6|D ]}||v s&|| d| v r5|ds5|ds5 | qq fdd|D S )Nr   r?   r   z.weightz.biasc                    s   g | ]}| vr|qS r   r   ).0knot_missing_keysr   r   
<listcomp>   s    zAFineGrainedFP8HfQuantizer.update_missing_keys.<locals>.<listcomp>)integrationsr@   named_modulesr-   endswithappend)r   r;   rt   ru   r@   namer]   missingr   rx   r   update_missing_keys   s   

z-FineGrainedFP8HfQuantizer.update_missing_keysc                 C   sz   d|j jv r;i ddddddddddddd	d
dd
dddddddddddd
dd
dd}||_|S )NQwen3z layers.*.self_attn.q_proj.weightlocal_colwisez*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightlocal_rowwisez*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.self_attngatherzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_invzlayers.*.mlp)r   __name__base_model_tp_plan)r   rp   	text_planr   r   r   update_tp_plan   sH   	
z(FineGrainedFP8HfQuantizer.update_tp_planc                 C      dS )NTr   )r   safe_serializationr   r   r   is_serializable   rs   z)FineGrainedFP8HfQuantizer.is_serializablec                 C   r   )NFr   r   r   r   r   is_trainable      z&FineGrainedFP8HfQuantizer.is_trainablec                 C   r   )Nr   r   r   r   r   r   get_accelerator_warm_up_factor   r   z8FineGrainedFP8HfQuantizer.get_accelerator_warm_up_factor)r5   r6   r7   r6   r   )r;   r   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r4   r:   strrj   boolrk   r   listrq   rr   r   r   r   propertyr   r   __classcell__r   r   r   r   r      s>    
/
=



r   )typingr   r   utilsr   r   r   r   baser
   quantizers_utilsr   r%   rN   r   
get_loggerr   r*   r   r   r   r   r   <module>   s    
