o
    wi%                     @   s   d dl mZmZmZ ddlmZmZmZmZ ddl	m
Z
 ddlmZ e r)d dlZer1ddlmZ eeZG d	d
 d
e
ZdS )    )TYPE_CHECKINGAnyOptional   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                       s
  e Zd ZdZdZdZdgZ fddZdd Zd-ddZ		d.ddddde
dddee
ef deee
  fddZddddde
dee
ef fddZ	d.dddeee
  fddZd/dd Zd!ee
 d"e
dee
 fd#d$Zd%d& Zd.d'd(Zedefd)d*Zd+d, Z  ZS )0FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    TF
acceleratec                    s   t  j|fi | || _d S N)super__init__quantization_config)selfr   kwargs	__class__ n/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr      s   
z"FineGrainedFP8HfQuantizer.__init__c                 O   s  t  stdt std|dds|ddrtdtj s*t s*t	dtj rOtj
 }|\}}|dk sD|dkrO|d	k rOtd
| d| d|dd }|d u r`td d S |d ur|| js~t|trd| v sxd| v rtdd S d S d S d S )NzxUsing fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)from_tfF	from_flaxzConverting into FP8 weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.zANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.`
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r   ImportErrorr   get
ValueErrortorchcudais_availabler   RuntimeErrorget_device_capabilityloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr    r   r   r   validate_environment   sP   

z.FineGrainedFP8HfQuantizer.validate_environmenttorch_dtypetorch.dtypereturnc                 C   s   |d u rt d tj}|S )NzWSetting torch_dtype to torch.float32 as no torch_dtype was specified in from_pretrained)r+   infor&   float32)r   r6   r   r   r   update_torch_dtypeM   s   
z,FineGrainedFP8HfQuantizer.update_torch_dtypeNmodelr   param_valueztorch.Tensor
param_nametarget_deviceztorch.device
state_dictunexpected_keysc              
   C   s`  ddl m} ||}ttjj}ttjj}	| jj	\}
}|j
dd \}}||
 dks4|| dkrEtd| d| d|
 d| d		|j
}|d
||
 |
|| |ddddd}tjt|dd}|	| }|j
}|d
d
}tj|| ||	dtj}|ddddd}||}||  }|||| |||ddd d | dS )zO
        Quantizes weights to FP8 format using Block-wise quantization
        r   )_load_parameter_into_modelNr   zMatrix dimensions (z, z$) must be divisible by block sizes ()r
         )rE   rC   )dim)minmaxr   z.weight_scale_inv)modeling_utilsrB   tor&   finfofloat8_e4m3fnrI   rJ   r   weight_block_sizeshaper%   reshapepermuteamaxabs	unsqueezeclampsqueeze
reciprocalrsplit)r   r<   r=   r>   r?   r@   rA   rB   fp8_minfp8_maxblock_size_mblock_size_nrowscolsparam_value_orig_shapemax_absscalescale_orig_shapequantized_paramr   r   r   create_quantized_paramS   s4   

 z0FineGrainedFP8HfQuantizer.create_quantized_paramc           	      K   sj   ddl m} t||\}}t||r3| js|dkr)|dkr'|jtjkr'tddS |dkr1tdd	S dS )
Nr   	FP8Linearbiasweightz6Expect quantized weights but got an unquantized weightFweight_scale_invz;Expect unquantized weights but got a quantized weight_scaleT)	integrations.finegrained_fp8rg   r   r.   r-   dtyper&   rN   r%   )	r   r<   r=   r>   r@   r   rg   moduletensor_namer   r   r   check_quantized_param   s   
z/FineGrainedFP8HfQuantizer.check_quantized_paramkeep_in_fp32_modulesc                 K   s@   ddl m} | || jj|| _||| j| jd}| j|j_d S )Nr   )replace_with_fp8_linear)modules_to_not_convertr   )rk   rq   get_modules_to_not_convertr   rr   config)r   r<   rp   r   rq   r   r   r   $_process_model_before_weight_loading   s   
z>FineGrainedFP8HfQuantizer._process_model_before_weight_loadingc                 K   s   |S r   r   )r   r<   r   r   r   r   #_process_model_after_weight_loading      z=FineGrainedFP8HfQuantizer._process_model_after_weight_loadingmissing_keysprefixc                    s   ddl m} g  | D ]*\}}t||r6|D ]}||v s&|| d| v r5|ds5|ds5 | qq fdd|D S )Nr   rf   r   z.weightz.biasc                    s   g | ]}| vr|qS r   r   ).0knot_missing_keysr   r   
<listcomp>   s    zAFineGrainedFP8HfQuantizer.update_missing_keys.<locals>.<listcomp>)integrationsrg   named_modulesr.   endswithappend)r   r<   rx   ry   rg   namerm   missingr   r|   r   update_missing_keys   s   

z-FineGrainedFP8HfQuantizer.update_missing_keysc                 C   sz   d|j jv r;i ddddddddddddd	d
dd
dddddddddddd
dd
dd}||_|S )NQwen3z layers.*.self_attn.q_proj.weightlocal_colwisez*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightlocal_rowwisez*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.self_attngatherzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_invzlayers.*.mlp)r   __name__base_model_tp_plan)r   rt   	text_planr   r   r   update_tp_plan   sH   	
z(FineGrainedFP8HfQuantizer.update_tp_planc                 C      dS )NTr   )r   safe_serializationr   r   r   is_serializable   rw   z)FineGrainedFP8HfQuantizer.is_serializablec                 C   r   )NFr   r   r   r   r   is_trainable      z&FineGrainedFP8HfQuantizer.is_trainablec                 C   r   )Nr   r   r   r   r   r   get_cuda_warm_up_factor   r   z1FineGrainedFP8HfQuantizer.get_cuda_warm_up_factor)r6   r7   r8   r7   r   )r<   r   )r   
__module____qualname____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r5   r;   strr/   r   r   listre   ro   ru   rv   r   r   r   propertyboolr   r   __classcell__r   r   r   r   r      sX    
.


6





r   )typingr   r   r   utilsr   r   r   r	   baser   quantizers_utilsr   r&   rK   r   
get_loggerr   r+   r   r   r   r   r   <module>   s    
