o
    *i&                     @   s  d dl mZ d dlmZ d dlmZmZmZ d dlmZ d dlm	Z	 d dl
Z
d dlmZ d dlmZmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) ddgZ*d dl+m,Z, d dl"m-Z- 		d/de)dee. dee/ fddZ0	d0de)de	edf de/fddZ1de
j2j)ded efd!d"Z3d#e.d$e.d%e)d&efd'd(Z4d)ee.ef d*ee. d+e.d,efd-d.Z5dS )1    )OrderedDict)deepcopy)DictListOptional)UnionN)CompressionFormat)initialize_hooked_attentioninitialize_hooked_kv_cache)"initialize_module_for_quantizationis_attention_module)QuantizationArgs)QuantizationConfigQuantizationStatus)QuantizationScheme)replace_module)is_narrow_matchmatch_named_modulesmatch_targets)update_parameter_data)get_safetensors_folder)logger)	safe_open)Module'load_pretrained_quantization_parametersapply_quantization_config)is_module_quantized)*get_quantization_parameter_to_path_mappingFmodelmodel_name_or_pathload_weight_qparamsc                 C   s   t |}t|}|  D ]9\}}t|sq|jjdur%d}t||||d |jjdur5d}t||||d |rE|jjrEd}t||||d qdS )az  
    Loads the quantization parameters (scale and zero point) from model_name_or_path to
    a model that has already been initialized with a quantization config.

    NOTE: Will always load inputs/output parameters. Will conditioanlly load weight
    parameters, if load_weight_qparams is set to True.

    :param model: model to load pretrained quantization parameters to
    :param model_name_or_path: Hugging Face stub or local folder containing a quantized
        model, which is used to load quantization parameters
    :param load_weight_qparams: whether or not the weight quantization parameters
        should be loaded
    Ninput)	base_namemodule_namemodulemappingoutputweight)	r   r   named_modulesr   quantization_schemeinput_activations_load_quant_args_from_mappingoutput_activationsweights)r   r   r    
model_pathr%   name	submoduler"    r1   l/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/compressed_tensors/quantization/lifecycle/apply.pyr   =   s>   configrun_compressedc                 C   s$  ddl m} t|}|du rt S |jtjk}|jdur$t| |j|j t	 }|j
 D ]}|jD ]}|||< q1q,t| ||jddD ]M\}}	t||	|}
t||
|}||	_|rut|	tjjru|jtjjkru|j|	||jd}t| || nt|	rt| |j|rt| |	 t|	|d |j|	_qBdS )an  
    Initializes the model for quantization in-place based on the given config.
    Optionally coverts quantizable modules to compressed_linear modules

    :param model: model to apply quantization config to
    :param config: quantization config
    :param run_compressed: Whether the model will be run in compressed mode or
        decompressed fully on load
    r   )CompressedLinearNT)warn_on_fail)r)   quantization_formatforce_zero_point) +compressed_tensors.linear.compressed_linearr5   r   dictquantization_statusr   
COMPRESSEDkv_cache_scheme_apply_kv_cache_schemer   config_groupsvaluestargetsr   ignorer   _scheme_from_targetsr)   
isinstancetorchnnLinearformatr   densevaluefrom_linearr   r   r   r	   r   )r   r3   r4   r5   r9   target_to_schemeschemetargetr/   r0   matched_targetscompressed_linearr1   r1   r2   r   p   sR   







r>   statusc                 C   sZ   |j stdtdg|d}|  D ]}t|r*||_t| | t|dd ||_	qd S )Nz6vLLM does not support asymmetric kv cache quantizationz.*self_attn$)rB   r*   Fr8   )
	symmetricr   warningr   modulesr   r)   r
   r   r<   )r   r>   rR   rN   r0   r1   r1   r2   r?      s   

r?   r"   r#   r$   r%   c                 C   sz  |  d}|  d}|  d}| | d| d}| | d| d}| | d| d}	|	dur[t|	ddd}
|
| d| }W d   n1 sPw   Y  t||| |durt|ddd}
|
| d| }W d   n1 s{w   Y  t||| |du rtj|dd	}n!t|ddd}
|
| d| }W d   n1 sw   Y  t||| dS dS )
a  
    Loads scale and zero point from a state_dict into the specified module

    :param base_name: quantization target, one of: weights, input_activations or
    output_activations
    :param module_name: pytorch module name to look up in state_dict
    :module: pytorch module associated with module_name
    :mapping: mapping to search fetch paths on disk for a given parameter
    _scale_zero_point_g_idx.Nptcpu)	frameworkdevice)r]   )getr   
get_tensorr   rF   
zeros_like)r"   r#   r$   r%   
scale_namezp_name
g_idx_namestate_dict_scale_pathstate_dict_zp_pathstate_dict_g_idx_pathfstate_dict_g_idxstate_dict_scalestate_dict_zpr1   r1   r2   r+      s.   


r+   rM   rB   r/   returnc                 C   s   | |d  S )Nr   r1   )rM   rB   r/   r1   r1   r2   rD      s   rD   )NF)F)6collectionsr   copyr   typingr   r   r   OrderedDictTyper   rF   compressed_tensors.configr   compressed_tensors.modelingr	   r
   4compressed_tensors.quantization.lifecycle.initializer   r   *compressed_tensors.quantization.quant_argsr   ,compressed_tensors.quantization.quant_configr   r   ,compressed_tensors.quantization.quant_schemer    compressed_tensors.utils.helpersr   compressed_tensors.utils.matchr   r   r    compressed_tensors.utils.offloadr   )compressed_tensors.utils.safetensors_loadr   logurur   safetensorsr   torch.nnr   __all__-compressed_tensors.quantization.utils.helpersr   r   strboolr   r   rG   r?   r+   rD   r1   r1   r1   r2   <module>   s   
4

J

,
