o
    TÃi  ã                   @   s„   d dl mZmZmZmZmZ d dlZddlmZ d dl	m
Z
 ddlmZ ddlmZ dd	lmZ G d
d„ deƒZG dd„ deƒZdS )é    )ÚAnyÚDictÚOptionalÚTupleÚTypeNé   )ÚRaggedBatchWrapper)ÚDeepSpeedConfigModelé   )ÚDSModuleBase)ÚDSModuleRegistryBase©ÚDSSelfAttentionConfigc                       sÒ   e Zd ZdZedee fdd„ƒZdede	e
ef ddf‡ fdd	„Zedefd
d„ƒZedefdd„ƒZdeddfdd„Z			ddejdejdedeej deej deej deejejf fdd„Z‡  ZS )ÚDSSelfAttentionBasea  
    Base mixin for all attention modules. The interface represented by this module
    is broadly:

    output = attention(query_key_value,
                       Optional[kv_cache],
                       Optional[attention_mask],
                       Optional[attention_bias])
    Úreturnc                   C   ó   t S ©Nr   © r   r   úl/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/modules/interfaces/attention_base.pyÚconfig_class   ó   z DSSelfAttentionBase.config_classÚconfigÚimplementation_configNc                    s   t ƒ  ||¡ d S r   )ÚsuperÚ__init__)Úselfr   r   ©Ú	__class__r   r   r       s   zDSSelfAttentionBase.__init__c                 C   ó   t ƒ ‚)zS
        Return preferred granulatity for blocked KV-cache implementation.
        ©ÚNotImplementedError©r   r   r   r   Úkv_block_size#   s   z!DSSelfAttentionBase.kv_block_sizec                 C   r   )zÛ
        Property to calculate blocking granularity for the query dimension.
        This has no impact on the KV-cache structure, but will  affect the
        number of attention atoms associated with a batch.
        r   r!   r   r   r   Úq_block_size*   s   z DSSelfAttentionBase.q_block_sizeÚragged_batchc                 C   s   dS )z¥
        Build the atoms for this module. This is not a strict requirement for the class,
        so this method is a no-op by default rather than abstract.
        Nr   )r   r$   r   r   r   Úbuild_atoms3   s   zDSSelfAttentionBase.build_atomsÚq_k_vÚkv_cacheÚbatchÚattention_maskÚattention_biasÚ	inv_freqsc                 C   r   )aÕ  
        Parameters:
            q_k_v (torch.Tensor): Query, key, and value tensors. Expected shape is:
                [
                    batch,
                    seq_len,
                    2 * self._config.n_heads_kv + self._config.n_heads_q,
                    self._config.head_size
                ].
            kv_cache (Optional[torch.Tensor]): Key and value cache tensor. Expected shape is
                [
                    2,
                    batch,
                    kv_cache_len,
                    self._config.n_heads_kv,
                    self._config.head_size
                ]. If None, cache is disabled. The `kv_cache_len` dimension does not need to
                be contiguous (it should expand stride by `max_out_tokens`).
            batch (RaggedBatchWrapper): Ragged batch metadata.
            attention_mask (Optional[torch.Tensor]): Attention mask tensor. If None, masking is
                disabled. This will defer to the config in the case of conflicting information.
                This means if the config class is implying causal attention, the mask will be ignored.
            attention_bias (Optional[torch.Tensor]): Attention bias tensor. If None, bias is disabled.
        r   )r   r&   r'   r(   r)   r*   r+   r   r   r   Úforward:   s   zDSSelfAttentionBase.forward)NNN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ústaticmethodr   r	   r   r   r   Ústrr   r   ÚpropertyÚintr"   r#   r   r%   ÚtorchÚTensorr   r   r,   Ú__classcell__r   r   r   r   r      s8    
"úÿþýüûúúr   c                   @   s0   e Zd ZU i Zeed< edee fdd„ƒZ	dS )ÚDSSelfAttentionRegistryÚregistryr   c                   C   r   r   )r   r   r   r   r   Úassociated_class_   r   z(DSSelfAttentionRegistry.associated_classN)
r-   r.   r/   r9   r   Ú__annotations__r1   r   r   r:   r   r   r   r   r8   \   s   
 r8   )Útypingr   r   r   r   r   r5   Úraggedr   Údeepspeed.runtime.config_utilsr	   Ú	ds_moduler   Úmodule_registryr   Úconfigsr   r   r8   r   r   r   r   Ú<module>   s   K