from typing import TYPE_CHECKING, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_kernels_available,
    is_torch_available,
    is_triton_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    FP4 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config
        self.triton_kernels_hub = None

    def _lazy_import_kernels(self):
        """Lazy import and initialize kernels only when needed"""
        if self.triton_kernels_hub is None:
            try:
                from kernels import get_kernel

                self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
            except ImportError:
                raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. Please install the latest version of torch (`pip install --upgrade torch`)"
            )
        if self.quantization_config.dequantize:
            return

        if not torch.cuda.is_available() and not torch.xpu.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        if torch.xpu.is_available():
            gpu_is_supported = True
            kernels_available = is_triton_available("3.5.0") and is_kernels_available()
        else:
            compute_capability = torch.cuda.get_device_capability()
            gpu_is_supported = compute_capability >= (7, 5)
            kernels_available = is_triton_available("3.4.0") and is_kernels_available()

        if self.pre_quantized:
            # For pre-quantized checkpoints we fall back to bf16 instead of failing.
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g. T4, A100, L4, H100, or B200) "
                    "or XPUs (e.g. Intel® Data Center GPU Max Series). We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, "
                    "XPU requires Triton >= 3.5.0. We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
        else:
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g. T4, A100, L4, H100, or B200) "
                    "or XPUs (e.g. Intel® Data Center GPU Max Series)"
                )
            if not kernels_available:
                raise ValueError(
                    "MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0"
                )

        if not self.pre_quantized:
            self._lazy_import_kernels()

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model "
                "on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'."
            )
        elif (
            not self.pre_quantized
            and isinstance(device_map, dict)
            and ("cpu" in device_map.values() or "disk" in device_map.values())
        ):
            raise ValueError(
                "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                "or remove the CPU or disk device from the device_map."
            )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype is None:
            logger.info(
                "Overriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model "
                "loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers, or pass "
                "dtype=torch.bfloat16 to remove this warning.",
                dtype,
            )
            dtype = torch.bfloat16
        return dtype

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # "_blocks" and "_scales" have the same length, so stripping len("_blocks") handles both suffixes
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        **kwargs,
    ):
        from ..integrations import (
            Mxfp4GptOssExperts,
            dequantize,
            load_and_swizzle_mxfp4,
            quantize_to_mxfp4,
            swizzle_mxfp4,
        )
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # Quantize bf16 expert weights on the fly, then swizzle them into the layout the triton kernels expect.
            triton_kernels_hub = self._lazy_import_kernels()
            module, _ = get_module_from_name(model, param_name)
            with torch.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    triton_weight_tensor, weight_scale = quantize_to_mxfp4(param_value, triton_kernels_hub)
                    matmul_ogs = triton_kernels_hub.matmul_ogs
                    PrecisionConfig, FlexCtx, InFlexData = (
                        matmul_ogs.PrecisionConfig,
                        matmul_ogs.FlexCtx,
                        matmul_ogs.InFlexData,
                    )
                    triton_weight_tensor, weight_scale = swizzle_mxfp4(
                        triton_weight_tensor, weight_scale, triton_kernels_hub
                    )
                    proj = "gate_up_proj" if "gate_up_proj" in param_name else "down_proj"
                    setattr(module, proj, triton_weight_tensor)
                    setattr(
                        module,
                        proj + "_precision_config",
                        PrecisionConfig(weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())),
                    )
                    delattr(module, proj + "_blocks")
                    delattr(module, proj + "_scales")
            return

        empty_param = kwargs.get("empty_param")
        casting_dtype = kwargs.get("casting_dtype")
        to_contiguous = kwargs.get("to_contiguous")
        rank = kwargs.get("rank")
        device_mesh = kwargs.get("device_mesh")

        if "blocks" in param_name or "scales" in param_name:
            if self.quantization_config.dequantize:
                module, _ = get_module_from_name(model, param_name[: -len("_blocks")])
            else:
                module, _ = get_module_from_name(model, param_name)
            shard_kwargs = {
                "empty_param": empty_param,
                "casting_dtype": casting_dtype,
                "to_contiguous": to_contiguous,
                "rank": rank,
                "device_mesh": device_mesh,
                "model": model,
            }
            if isinstance(module, Mxfp4GptOssExperts) or (
                isinstance(module, GptOssExperts) and self.quantization_config.dequantize
            ):
                if self.quantization_config.dequantize:
                    dq_param_name = param_name[: -len("_blocks")]
                    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                else:
                    load_and_swizzle_mxfp4(
                        module, param_name, param_value, target_device, self._lazy_import_kernels(), **shard_kwargs
                    )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        if self.quantization_config.dequantize:
            # The weights were dequantized to bf16 on load, so drop the quantization config
            # from the model (helper name reconstructed; only a self-method call is recoverable here).
            self.remove_quantization_config(model)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.xpu.is_available():
            torch.xpu.empty_cache()

    def update_expected_keys(self, model, expected_keys: list[str], checkpoint_keys: list[str]) -> list[str]:
        new_expected_keys = []
        for key in expected_keys:
            if key.endswith(".mlp.experts.gate_up_proj"):
                base = key[: -len("gate_up_proj")]
                new_expected_keys.append(base + "gate_up_proj_blocks")
                new_expected_keys.append(base + "gate_up_proj_scales")
            elif key.endswith(".mlp.experts.down_proj"):
                base = key[: -len("down_proj")]
                new_expected_keys.append(base + "down_proj_blocks")
                new_expected_keys.append(base + "down_proj_scales")
            elif not self.pre_quantized:
                # On-the-fly quantization: the checkpoint holds bf16 weights, so *_blocks keys
                # map back to the plain parameter names and *_scales keys have no counterpart.
                if key.endswith(".mlp.experts.down_proj_blocks"):
                    new_expected_keys.append(key[: -len("_blocks")])
                elif key.endswith(".mlp.experts.gate_up_proj_blocks"):
                    new_expected_keys.append(key[: -len("_blocks")])
                elif key.endswith("_scales"):
                    continue
                else:
                    new_expected_keys.append(key)
            else:
                new_expected_keys.append(key)
        return new_expected_keys

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_mxfp4_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        use_kernels = kwargs.get("use_kernels", False)
        if use_kernels:
            logger.warning_once(
                "You are using full precision kernels, we will dequantize the model to bf16. "
                "To use the quantized model with quantization kernels, please set use_kernels=False"
            )
            self.quantization_config.dequantize = True

        config = model.config
        model = replace_with_mxfp4_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            config=config,
        )
        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import Mxfp4GptOssExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_tp_plan", None) is not None:
            config.base_model_tp_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def update_ep_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_ep_plan", None) is not None:
            config.base_model_ep_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def get_param_name(self, param_name: str) -> str:
        if self.quantization_config.dequantize:
            if "_blocks" in param_name:
                return param_name.replace("_blocks", "")
            if "_scales" in param_name:
                return param_name.replace("_scales", "")
        elif not self.pre_quantized:
            if param_name.endswith("gate_up_proj"):
                return param_name.replace("gate_up_proj", "gate_up_proj_blocks")
            if param_name.endswith("down_proj"):
                return param_name.replace("down_proj", "down_proj_blocks")
        return param_name

    def get_state_dict_and_metadata(self, model, safe_serialization: bool = False):
        from ..integrations import Mxfp4GptOssExperts

        state_dict = model.state_dict()
        for name, module in model.named_modules():
            if (
                isinstance(module, Mxfp4GptOssExperts)
                and hasattr(module, "gate_up_proj")
                and hasattr(module, "down_proj")
            ):
                # Un-swizzle the kernel-side tensors back into the checkpoint layout.
                # The constants 90, 16, and 2880 (90 MXFP4 blocks of 16 packed bytes per
                # 2880-wide row) match the gpt-oss geometry visible in the compiled
                # constants; the leading dimensions are inferred and left as -1.
                state_dict[name + ".gate_up_proj_blocks"] = (
                    module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(-1, 90, 16)
                )
                state_dict[name + ".gate_up_proj_scales"] = module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                    module.gate_up_proj_precision_config.weight_scale.storage.data
                ).transpose(-1, -2)
                state_dict[name + ".down_proj_blocks"] = (
                    module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(-1, 2880, 90, 16)
                )
                state_dict[name + ".down_proj_scales"] = module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                    module.down_proj_precision_config.weight_scale.storage.data
                ).transpose(-1, -2)
        metadata = {}
        return state_dict, metadata

    def is_serializable(self, safe_serialization=None) -> bool:
        return True

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization doesn't support training; consider dequantizing the model first by passing "
            "quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False
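
# A minimal usage sketch (illustrative, not part of the module): the quantizer above
# is selected automatically by `from_pretrained` when a checkpoint's
# quantization_config has quant_method="mxfp4", as with the gpt-oss checkpoints.
# The `Mxfp4Config(dequantize=True)` escape hatch mirrors the fallback paths in
# `validate_environment` for hardware without MXFP4 kernel support.
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     # Load pre-quantized MXFP4 weights on a supported GPU (triton + kernels installed):
#     model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", device_map="cuda")
#
#     # Or explicitly dequantize to bf16, e.g. on GPUs below compute capability 7.5:
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         device_map="cuda",
#     )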