o
    *iH                     @   sL  d dl Z d dlmZmZmZmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ g dZdZG dd deZdedee deeef deee eeef f fddZdedefddZ dedeeegee f defddZ!dedeeegee f defddZ"dS )    N)AnyCallableDictListOptionalTuple)ReferenceTyperef)forward_quantize)getattr_chain)InternalModule)Tensor)Module)RemovableHandle)CachePretrainedConfigPreTrainedModel)QuantizedKVCacheinitialize_hooked_kv_cacheregister_key_hookregister_value_hookKV_CACHE_ATTRkv_cachec                       sp   e Zd ZdZdedef fddZdeeef fddZ	d	ed
edeeef fddZ
dee fddZ  ZS )r   ai  
    QuantizedKVCache module which wraps the functionality of any existing kvcache args.
    Unlike transform Cache instances, this cache is a `torch.nn.Module` which can be
    hooked to trigger transforms and calibration hooks.

    This module works by being registered as a submodule to attention modules via
    `initialize_hooked_kv_cache`, then adding a hook which replaces `past_key_values`
    kwargs with this module. This module adopts the functionality of the replaced cache,
    preserving caching functionality such as sliding window attention, ect.

    :param attn_module: parent attention module
    configattn_modulec                    s$   t    || _t|| _d | _d S N)super__init__r   r	   r   past_key_values)selfr   r   	__class__ `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/compressed_tensors/modeling/kvcache.pyr   6   s   


zQuantizedKVCache.__init__returnc                 O   s   | |i |S r   r"   )r   argskwargsr"   r"   r#   update<   s   zQuantizedKVCache.update
key_statesvalue_statesc           
      O   s   |   }d}t||d }t|dd}|d ur&|r&t||d|}t||d|}| jd ur;|  j||g|R i |}	n||f}	d | _|	S )Nz%quantization_scheme.input_activationsquantization_enabledTkv)r   r   getattrr
   r   r'   )
r   r(   r)   r%   r&   modulequant_args_attr
quant_argsquant_enabledretr"   r"   r#   forward?   s$   
zQuantizedKVCache.forwardr   c                 C   s    |d urt || _d S d | _d S r   )r	   r   )r   r   r"   r"   r#   add_past_key_valuesZ   s   
z$QuantizedKVCache.add_past_key_values)__name__
__module____qualname____doc__r   r   r   r   r   r'   r3   r   r   r4   __classcell__r"   r"   r    r#   r   (   s    

r   r.   r%   r&   r$   c                 C   sJ   dt | jjv rdnd}||d}t| t}|| |||< ||fS )aZ  
    Hook which should be called before each quantized attention forward pass.
    This hook dynamically replaces the `past_key_values` kwarg to the attention
    forward function.

    The original kvcache object is assigned to QuantizedKVCache().past_key_values
    as a weakref to maintain original cache functionality and compute savings
    r   past_key_valueN)inspect	signaturer3   
parametersgetr-   r   r4   )r.   r%   r&   _past_kv_namer   cacher"   r"   r#   _kv_cache_attention_hookd   s   

rA   modelc                 C   s4   t |ts|tt| j| |jtdd dS dS )z
    Initialize a `QuantizedKVCache` instance attached to attention

    :param model: parent model of attention module
    :param module: attention module to initialize with
    Twith_kwargsN)hasattrr   register_moduler   r   register_forward_pre_hookrA   )rB   r.   r"   r"   r#   r   }   s   
r   hookc                    ,   t t}dtf fdd}|j|ddS )z
    Register a hook which takes post-rope key states as an argument and
    returns the modified key states or `None`

    :param module: attention module to add hook to
    :param hook: key hook function
    r@   c                    F   t | jj|i |} |jd }|d ur||jd< |j|jfS )Nr(   r;   r<   r3   bind	argumentsr%   r&   r@   r%   r&   boundvaluerH   r.   r"   r#   _hook   
   
z register_key_hook.<locals>._hookTrC   r-   r   r   rG   r.   rH   r   rR   r"   rQ   r#   r         

r   c                    rI   )z
    Register a hook which takes value states as an argument and
    returns the modified value states or `None`

    :param module: attention module to add hook to
    :param hook: value hook function
    r@   c                    rJ   )Nr)   rK   rN   rQ   r"   r#   rR      rS   z"register_value_hook.<locals>._hookTrC   rT   rU   r"   rQ   r#   r      rV   r   )#r;   typingr   r   r   r   r   r   weakrefr   r	   1compressed_tensors.quantization.lifecycle.forwardr
   compressed_tensors.utilsr   !compressed_tensors.utils.internalr   torchr   torch.nnr   torch.utils.hooksr   transformersr   r   r   __all__r   r   strrA   r   r   r   r"   r"   r"   r#   <module>   sJ    	<


