o
    *iw/                     @   sD  d dl Z d dlmZmZmZ d dlZd dlmZmZm	Z	m
Z
 d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZm Z  g d	Z!e "e#Z$		
ddedee de%fddZ&defddZ'	
ddede(dedeee)df  dej*de%fddZ+dedede%fddZ,defddZ-dS )    N)OptionalTupleUnion)	IMPL_ATTRKV_CACHE_ATTRQuantizedAttentionImplQuantizedKVCache)ActivationOrderingDynamicTypeQuantizationArgsQuantizationMetadataQuantizationSchemeQuantizationStatusQuantizationStrategy)wrap_module_forward_quantized)strategy_cdiv)disable_hf_hookget_execution_deviceget_head_dimget_num_attn_headsget_num_kv_headsregister_offload_parameter)Module	Parameter)"initialize_module_for_quantizationis_attention_moduleinitialize_qparamsinitialize_attn_qparamsTmoduleschemeforce_zero_pointc                 C   sb  |pt | dd}|du rdS t|  t| rt| || nt| tjjs/t	
dt|   t| dr@| j}t|tjs?J nt	
dt|  dt|   dS |jdurgt| d|j|jdd |j|d	 |jduryt| d|j|j|j|d	 |jdurt| d
|j|jdd |j|d	 t|  t| | W d   n1 sw   Y  || _tj| _dS )aa  
    Attaches appropriate scales, zero points, and observers to a layer
    given its target quantization scheme.

    Previously initialized scales and zero points will be removed from
    module if they no longer apply to the scheme

    :param module: module to set for calibration
    :param scheme: scheme to use for quantization. if None is provided,
        will attempt to use scheme stored in the module under `quantization_scheme`,
        if not provided, the layer will be skipped
    :param force_zero_point: whether to force initialization of a zero point for
        symmetric quantization
    quantization_schemeNz&Attempting to quantize module of type weightzmodule type zR targeted for quantization but has no attribute weight, skipping quantization for inputobserved_shapeobserved_dtyper    output)getattrr   clear_all_qparamsr   r   
isinstancetorchnnLinear_LOGGERwarningtypehasattrr"   Tensorinput_activationsr   shapedtypeweightsoutput_activationsr   r   r!   r   INITIALIZEDquantization_status)r   r   r    r"    r;   q/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/compressed_tensors/quantization/lifecycle/initialize.pyr   =   sb   



	
	
	r   c                 C   s.   d| j j v ot| dpt| dpt| dS )N	attentionk_projv_projqkv_proj)	__class____name__lowerr2   )r   r;   r;   r<   r      s   
r   	base_namequantization_argsr&   r'   c                 C   s  |j }|j}|j}t| }	|du rdS |tjkr.ttjdtj	|	ddd}
t
| | d|
 |tjkr5dS |tjkr=d}n|tjkrFtd	|tjkr\t|d
k rUtd|d df}n|tjtjfv r|jduskJ t|dk rutd|j}t|d ||}g |dd |R }|tjkrttj|d fd|	tjddd}t
| | d| nP|tjkr|jdusJ t|d
k rtd|j}t|d |d |}t|d |d |}||f}n|tjkrt|dk rtd|d ddf}nJ d| |}|tjtjtj	tjfvrtj}ttj|||	ddd}t
| | d| |s)|jsAttj||	|j ddd}t
| | d| dS dS )a  
    Initialize quantization parameters for a given basename according to the passed
    quantization args. The shape and dtype of the observed weight/activation must also
    be provided.

    Scales will always be initialized. Global scales are initialized depending on args.
    Zero points will be initialized if not symmetric or if `force_zero_point` is True.

    :param module: module to register qparams to
    :param base_name: base name of qparams, for example "input", "weight", "k", "v"
    :param quantization_args: arguments for quantization
    :param observed_shape: last (right-most) known dimensions of the observed weight/act
    :param observed_dtype: dtype of the observed weight/actt
    :param force_zero_point: force the zero_point parameter to be initialized
    TN   )r6   deviceF)requires_grad_global_scale)rF   z(Cannot perform static token quantization   z5Channel quant requires at least 2 observed dimensionsz2Group quant requires at least 1 observed dimensionr$   )rG   r6   _g_idxz3Block quant requires at least 2 observed dimensions   z7Attention quant requires at least 3 observed dimensionszUnknown strategy _scale_zero_point)!strategydynamicactorderr   r   TENSOR_GROUPr   r,   emptyfloat32r   r
   LOCALTENSORTOKEN
ValueErrorCHANNELlenGROUP
group_sizer   r	   fullintBLOCKblock_structure	ATTN_HEADfloat16bfloat16float64	symmetriczeroszp_dtype)r   rD   rE   r&   r'   r    rQ   rR   rS   rG   init_global_scaleexpected_shaper^   
num_groups
init_g_idxrb   num_rowsnum_colsscale_dtype
init_scaleinit_zero_pointr;   r;   r<   r      s   








r   c                 C   s   t | td}t | td}|du r|du rtdt dt dt| |j}t|}t|}t|}|d|f}	|d|f}
t	| 
 j}|durRt| d|j|	||d |durnt| d|j|
||d t| d|j|
||d dS dS )	z(Initlaize k_scale, v_scale for self_attnNz0Attention module has quantization scheme but no z or zc attributes. Please ensure that these attributes are initialized using `apply_quantization_config`.qr%   kv)r)   r   r   rZ   _validate_attention_schemeconfigr   r   r   next
parametersr6   r   r4   )r   r   r    implkv_cacherw   num_attn_headsnum_kv_headshead_dimq_observed_shapekv_observed_shaper'   r;   r;   r<   r     sV   

	
r   c                 C   s:   | j d ur	td| jd u rtd| jd urtdd S )NzmCannot apply weight quantization to attention. Instead, target the (q|k|v)_proj submodule layers of attentionzHCannot apply attention quantization without specifying input activationsz-Cannot apply output quantization to attention)r7   rZ   r4   r8   )r   r;   r;   r<   rv   N  s   


rv   )NT)T).loggingtypingr   r   r   r,   compressed_tensors.modelingr   r   r   r   compressed_tensors.quantizationr	   r
   r   r   r   r   r   1compressed_tensors.quantization.lifecycle.forwardr   %compressed_tensors.quantization.utilsr   compressed_tensors.utilsr   r   r   r   r   r   torch.nnr   r   __all__	getLoggerrB   r/   boolr   r   strr`   r6   r   r   rv   r;   r;   r;   r<   <module>   sV   $	 

U
{
9