o
    *ih?                     @   s  U d dl Z d dlZd dlmZmZmZ d dlZd dlmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ g d
ZdgZe  e!Z"e j#e$d< 	dEdedededee deeef f
ddZ%	dEdededej&jdee fddZ'dede(defddZ)dede*fddZ+dede*fdd Z,dede(fd!d"Z-ed#d$dedeee(ef ddf fd%d&Z.ed#d$	'	(	(dFded)e*d*e*d+e*deee(ef ddf f
d,d-Z/dejde0fd.d/Z1dejd0d1de*fd2d3Z2e d4ede*fd5d6Z3e	eej4fd7ejd8ejd9ee
 d:ee
 d;eej5 f
d<d=Z6	(dGde0d>e0d?ee d@e*de0f
dAdBZ7d;ej5de8fdCdDZ9dS )H    N)	GeneratorOptionalTuple)FP4_E2M1_DATAFP8_E4M3_DATA	FloatArgsQuantizationArgsQuantizationStrategyQuantizationTyperound_to_quantized_type_dtype)QuantizationScheme)generate_mxfp4_scalesmaybe_convert_from_mxfp4_expshould_generatre_mxfp4_scales)
deprecated)logger)FloatTensor	IntTensorTensor)Module)is_module_quantizedis_model_quantizedmodule_typeget_torch_bit_depthcan_quantizeKV_CACHE_TARGETSis_kv_cache_quant_schemeiter_named_leaf_modulesiter_named_quantizable_modulescompute_dynamic_scales_and_zpcalculate_rangecalculate_qparamsgenerate_gparamstrategy_cdivzre:.*self_attn$_LOGGERmin_valsmax_valsquantization_argsglobal_scalereturnc                 C   s  t | t | } t |t |}| j}t||\}}|| }|jrMt t | t |}t|dr:t	|d}	n|t
|d  }	t j|	j|| jd}
n$|jdkr\|jtjkr\td||  t
| }	|| |	  }
t |
||}
|dury||	 }	|jdurt|	|jd}	t||	}	t|jdur|jn|	jd}t |	d	kt j||	j|d
|	}	t|
|jdd}
|	jd	kr|	d}	|
d}
|	|
fS )a  
    :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
        from
    :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s)
        from
    :param quantization_args: settings to quantization
    :param global_scale: additional global scale to scale the locally generated scale
        currently only applied/supported for Fp4

    :return: tuple of the calculated scale(s) and zero point(s). For FP4, the calculated
        scale is of dtype FP8
    )args)x   )devicedtype   z0Asymmetric Quantization is not supported for FP4Nr.   r   )r.   r-   F)r.   cast_to_original_dtype   )torchmin
zeros_likemaxr-   r    	symmetricabsr   r   floatzerosshaper.   num_bitstyper
   FLOATNotImplementedErrorclampscale_dtyper   r   _get_dtype_epswheretensorzp_dtypendimreshape)r%   r&   r'   r(   r-   bit_minbit_max	bit_rangemax_val_posscaleszero_pointseps rO   j/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/compressed_tensors/quantization/utils/helpers.pyr!   A   sV   







r!   valuer*   modulec           
         s   d}|j tjkrddh t fddt| jD }n;|j tjkr$d}n2|j tjtjfv rFd}d}t	
| jd |j |jf}| d|} ntjtjtjtjf}td	| |s`t| \}}	ntj| ||d
}tj| ||d
}	t||	||dS )a  
    Returns the computed scales and zero points for dynamic activation
    quantization.

    :param value: tensor to calculate quantization parameters for
    :param args: quantization args
    :param reduce_dims: optional tuple of dimensions to reduce along,
        returned scale and zero point will be shaped (1,) along the
        reduced dimensions
    :return: tuple of scale and zero point derived from the observed tensor
    Tr   r2   c                 3   s    | ]	}| vr|V  qd S NrO   ).0idxdimrO   rP   	<genexpr>   s    z0compute_dynamic_scales_and_zp.<locals>.<genexpr>NFz+Dynamic quantization is only supported for )rW   keepdims)r(   )strategyr	   TOKENtuplerangerF   TENSORTENSOR_GROUPGROUPmathceilr;   
group_size	unflatten
ValueErrorr3   aminmaxaminamaxr!   )
rQ   r*   rR   r(   	keep_dimsreduce_dimsreshaped_dimssupported_strategiesmin_valmax_valrO   rV   rP   r      s<   r   r-   c                 C   s   | j tjkr$d| j }tj|d d |d}tj| d |d}||fS | j tjkr`| jdkrCtjtj|d}tjtj	|d}||fS | jdkr\tjt
j|d}tjt
j	|d}||fS tdtd| j  )a  
    Calculated the effective quantization range for the given Quantization Args

    :param quantization_args: quantization args to get range of
    :param device: device to store the range to
    :return: tuple endpoints for the given quantization range
    r,   r2   )r-      r/   z1Range calculation only supported for 4 and 8 bitszInvalid quantization type )r=   r
   INTr<   r3   rD   r>   r   r6   r4   r   r?   rf   )r'   r-   rJ   q_maxq_minrO   rO   rP   r       s$   


r    c                 C   sB   t | dsdS | jjdurdS | jjdurdS | jjdurdS dS )z
    Check if a module is quantized, based on the existence of a non-empty quantization
    scheme

    :param module: pytorch module to check
    :return: True if module is quantized, False otherwise
    quantization_schemeFNT)hasattrrt   weightsinput_activationsoutput_activationsrR   rO   rO   rP   r      s   
r   modelc                 C   s   t dd |  D S )z
    Check if any modules in a model are quantized, based on the existence of a non-empty
    quantization scheme in at least one module

    :param model: pytorch model
    :return: True if model is quantized, False otherwise
    c                 s   s    | ]}t |V  qd S rS   )r   )rT   	submodulerO   rO   rP   rX     s    z%is_model_quantized.<locals>.<genexpr>)anymodules)rz   rO   rO   rP   r     s   r   c                 C   s
   t | jS )z
    Gets a string representation of a module type

    :module: pytorch module to get type of
    :return: module type as a string
    )r=   __name__ry   rO   rO   rP   r     s   
r   zThis function will be removed in a future release. Please use `model.named_modules()` and filter by compressed_tensors.InternalModule if neceessary)messagec                 c   s    |   D ]F\}}t| }t|dkrd|v r||fV  qt|dkr/tt|  \}}d}tt|D ]}|| }d|vrCd}q7|sK||fV  qdS )z
    Yields modules that do not have any submodules except observers. The observers
    themselves are not yielded
    :param model: model to get leaf modules of
    :returns: generator tuple of (name, leaf_submodule)
    r   observerFTN)named_moduleslistchildrenlenzipnamed_childrenr^   )rz   namer{   r   r   has_non_observer_childreni
child_namerO   rO   rP   r     s"   
r   TFinclude_childreninclude_attninclude_mlpc                 c   s    |   D ]`\}}|rMt| }t|dkr!d|vr!||fV  n,t|dkr1tt|  \}}d}tt|D ]}	||	 }
d|
vrEd}q9|sM||fV  |rY|drY||fV  |re|dre||fV  qdS )aU  
    Yield name and submodule of
    - leaf modules, set by include_children
    - attention modyles, set by include_attn
    :param model: model to get leaf modules of
    :param include_children: flag to get the leaf modules
    :param inlcude_attn: flag to get the attention modules
    :returns: generator tuple of (name, submodule)
    r   r   FT	self_attnmlpN)r   r   r   r   r   r   r^   endswith)rz   r   r   r   r   r{   r   r   r   r   r   rO   rO   rP   r   :  s0   




r   c                 C   s8   z
t | jj}W |S  ty   t | jj}Y |S w )z
    Determine the number of bits used to represent the dtype of a tensor

    :param value: tensor to check bit depth of
    :return: bit depth of each element in the value tensor
    )r3   finfor.   bits	TypeErroriinfo)rQ   	bit_depthrO   rO   rP   r   h  s   r   
quant_argsr   c                 C   s:   t | }|j}||jk rtd| d| d ||jkS )aI  
    Checks if value can be quantized by quant_args.

    :param value: tensor to check for quantization
    :param quant_args: QuantizationArgs to use for quantization
    :return: False if value is already quantized to quant_args or value is incompatible
    with quant_args, True if value can be quantized with quant_args
    z%Can't quantize tensor with bit depth z to zH.The QuantizationArgs provided are not compatible with the input tensor.)r   r<   r$   warn)rQ   r   r   requested_depthrO   rO   rP   r   w  s   	

r   schemec                 C   s   | j D ]	}|tv r dS qdS )a
  
    Check whether the QuantizationScheme targets the kv cache.
    It does if all the following criteria are met:
    - the scheme targets either exactly match the KV_CACHE_TARGETS
        or the match KV_CACHE_TARGETS regex pattern
    - the scheme quantizes output_activations (we want to quantize the
        outputs from the KV_CACHE_TARGETS, as their correspond to the
        keys and values that are to be saved in the cache)

    :param scheme: The QuantizationScheme to investigate
    :return: boolean flag
    TF)targetsr   )r   targetrO   rO   rP   r     s
   
r   updated_min_valupdated_max_val
scale_data
quant_datar.   c           	      C   s^   t | t | }t |t |}t t |t |}|j|j | }||dgS )ah  
    Generate a global scale for an entire tensor (input_tensor).
    Goal of the scale is to ensure that the quantization (local) scale
    falls into the approproiate dtype range.

    E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
    attempts to use the entire FP8 dtype range while mapping a per-group max
    to the FP4 max.
    r2   )r3   r4   r5   r6   r8   torG   )	r   r   r   r   r.   r%   r&   rK   r(   rO   rO   rP   r"     s
   r"   divisorr[   strictc                 C   sV   t | | }|| | kr)| d|  d| d| }|r t|tjdd| |S )NzJ quantization strategy requires strict division of weight/activation size z and group/block size z[. consider reducing the group/block size or ignoring modules with weights not divisible by T)log_once)rb   rc   rf   r   bindwarning)rQ   r   r[   r   dividendr   rO   rO   rP   r#     s   r#   c                 C   s@   | t jkrdS | tjkrdS ttjg | drt| jS dS )Ng      ?g      ?r0   r2   )r   r.   r   r3   is_floating_pointrD   r   rN   r0   rO   rO   rP   rB     s   

rB   rS   )TFF)F):loggingrb   typingr   r   r   r3   *compressed_tensors.quantization.quant_argsr   r   r   r   r	   r
   r   ,compressed_tensors.quantization.quant_schemer   1compressed_tensors.quantization.utils.mxfp4_utilsr   r   r   compressed_tensors.utilsr   logurur   r   r   r   torch.nnr   __all__r   	getLoggerr~   r$   Logger__annotations__r!   nnr   strr    boolr   r   r   r   r   intr   r   r   float32r.   r"   r#   r9   rB   rO   rO   rO   rP   <module>   s   
$	

Z
;
&)

