o
    Tio!                     @   s   d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z d dlmZ d dlmZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZmZ dejdeejejf fddZG dd dZdS )    N)reduce)AnyIterableOptionalTuple)ReduceOp)get_accelerator   )	elem_size)inference_logger   )BlockedAllocator)AllocationModeKVCacheConfigMemoryConfigkv_cachereturnc                 C   s`   | j dkrtd| j  d| dddddddddf | dddddddddf fS )a  
    Split a KV cache instance into its key and value components.

    Parameters:
        kv_cache (torch.Tensor): The KV-cache to split. This should be a 5D tensor with the
            following shape: [num_blocks, block_size, 2, num_heads, head_size]

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The key and value components of the KV-cache. Both
            tensors will have the shape [num_blocks, block_size, num_heads, head_size].
       z%KV-cache must have 5 dimensions, got .Nr   r   )ndim
ValueError)r    r   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/ragged/kv_cache.pysplit_kv   s   
Dr   c                   @   s(  e Zd ZU eejdf ed< 	 eedf ed< 	 eedf ed< 	 		d!deedf de	d	e
e d
eddf
ddZd"dededejfddZd"dee deddfddZd"dee dedejfddZd"dee dedejfddZd"dededejfddZedejfddZedefdd ZdS )#BlockedKVCache._caches_allocators_configsNFconfigsmemory_configmp_groupoffloadr   c              
   C   s  || _ || _|| _| jrtdt| jjtju rd}| j D ]}ttj	|j
|j}|d9 }||t|j 7 }qtj|ddkrTtjdtjt  d}tj|tj|d t   t  | jj }	t  }
t d|
 d	|	 d
|  |	|k rtd| d|	 |	| }tj|ddkrtj|tjt  d}tj|tj|d | }~t   n| jj}g }g }t | j D ]B\}}|j
d }|j
d }|j
d }|||jd||f}t !d| d| d| d |"tj#||jt  d |"t$| qt%|| _&t%|| _'dS )a  
        Create a container that will maintain the storage and allocations for a set of
        blocked KV-caches.

        Parameters:
            config (KVCacheConfig): The configuration of the KV-cache.
            slack (int): The amount of slack space to reserve in GPU memory for the cache.
            enable_offload (bool): Whether to enable offloading of the cache to the host.
            blocks (int): The number of blocks to pre-allocate for the cache. If this is set,
                slack will be ignored.
        z-Offloading of KV-caches is not yet supported.r   r	   )groupr   )dtypedevice)opr"   z6Memory usage before KV-cache allocation: total_memory=z, available_kv_memory=z, total_per_block_footprint=z5Insufficient memory to allocate KV-caches. Required: z, Available: zAllocating KV-cache z with shape: z consisting of z blocks.N)(r   _memory_config_enable_offloadNotImplementedErrorr   modeRESERVEr   operatormulcache_shape
block_sizer
   cache_dtypedistget_world_sizetorchtensorint32r   current_device
all_reducer   MINempty_cacheavailable_memorysizetotal_memoryr   debugr   item	enumerateinfoappendemptyr   tupler   r   )selfr   r   r    r!   total_per_block_footprintconfigper_block_footprintdummy_tensoravailable_kv_memoryr;   
num_blocksreduce_tensorcaches
allocatorscache_group_id
num_caches	num_heads	head_sizealloc_shaper   r   r   __init__<   sb   







zBlockedKVCache.__init__r   rI   cache_groupc                 C   s   | j | |S )a3  
        Reserve a number of blocks from the cache. This will return a 1D tensor of
        block_ids that have been marked as reserved.

        Parameters:
            num_blocks (int): The number of blocks to reserve.
            cache_group (int): The cache group to reserve from. Default is 0.
        )r   allocate)rC   rI   rS   r   r   r   reserve   s   	zBlockedKVCache.reserveblocksc                 C   s   | j | | dS )a
  
        Free a set of blocks from the cache. This will mark the blocks as free in the
        allocator.

        Parameters:
            blocks (Iterable[int]): The blocks to free.
            cache_group (int): The cache group to free from. Default is 0.
        N)r   freerC   rV   rS   r   r   r   rW      s   	zBlockedKVCache.freec                 C      t d)z
        Offload KV-cache blocks from accelerator memory to the host.

        Parameters:
            blocks (Iterable[int]): The blocks to offload.
            cache_group (int): The cache group to offload from. Default is 0.
         Offloading is not yet supported.r(   rX   r   r   r   r!         zBlockedKVCache.offloadc                 C   rY   )z
        Restore KV-cache blocks from the host to accelerator memory.

        Parameters:
            blocks (Iterable[int]): The blocks to restore.
            cache_group (int): The cache group to restore to. Default is 0.
        rZ   r[   rX   r   r   r   restore   r\   zBlockedKVCache.restorecache_idc                 C   s   | j | | S )z
        Get the tensor associated with the given cache ID.

        Parameters:
            cache_id (int): The ID of the cache tensor to get.
            cache_group (int): The cache group to get from. Default is 0.
        )r   )rC   r^   rS   r   r   r   	get_cache   s   zBlockedKVCache.get_cachec                 C   s   dd | j D S )z@
        Return the number of free blocks in each cache
        c                 S   s   g | ]}|j qS r   )free_blocks).0	allocatorr   r   r   
<listcomp>   s    z.BlockedKVCache.free_blocks.<locals>.<listcomp>)r   rC   r   r   r   r`      s   zBlockedKVCache.free_blocksc                 C   s
   t | jS )z-
        Return the number of caches
        )lenr   rd   r   r   r   rN      s   
zBlockedKVCache.num_caches)NF)r   )__name__
__module____qualname__r   r2   Tensor__annotations__r   r   r   r   r   boolrR   intrU   r   rW   r!   r]   r_   propertyr`   rN   r   r   r   r   r   (   s:   
 	

T


r   )r+   	functoolsr   typingr   r   r   r   r2   deepspeed.commcommr0   deepspeed.comm.reduce_opr   deepspeed.acceleratorr   inference_utilsr
   loggingr   blocked_allocatorr   manager_configsr   r   r   ri   r   r   r   r   r   r   <module>   s    