o
    ̳i38                     @   sr   d dl Z d dlmZ d dlZd dlm  mZ d dlmZ d dlm	Z	 d dl
mZ e eZG dd dejZdS )    N)Optional)nn)	_MaskType)KVCachec                '       s  e Zd ZdZddddddddddd
ded	ed
ededejdejdejdejdeej deej deej dee dede	de
dee dee
 dee ddf& fddZdedejdeddfdd Zd!d" Z	d*ddd#d$ejd%eej d&ee d'eej dejf
d(d)Z  ZS )+Gemma2Attentionaz
  
    Adapated from official Google Pytorch Implementation:
    https://github.com/google/gemma_pytorch/blob/80881c2e6e797ef1913a4a705d4b40394791cc58/gemma/model.py#L213
    to match torchtune style.
    A new attention had to be added since nn.functional.scaled_dot_product_attention does allow soft capping
    Args:
        embed_dim (int): embedding dimension for the model
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``,
            for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``.
        head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``.
        q_proj (nn.Module): projection layer for query.
        k_proj (nn.Module): projection layer for key.
        v_proj (nn.Module): projection layer for value.
        output_proj (nn.Module): projection layer for output.
        pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings.
        q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied
            before updating from kv_cache. This means it will only support token wide normalization and not
            batch or sequence wide normalization.
        k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is.
        kv_cache (Optional[KVCache]): KVCache object used to cache key and value
        max_seq_len (int): maximum sequence length supported by the model.
            This is needed to compute the RoPE Cache. Default: 4096.
        is_causal (bool): sets the default mask to causal when no mask is provided
        attn_dropout (float): dropout value passed onto the
            scaled_dot_product_attention function. This argument is ignored if the
            self.training is False. Default value is 0.0.
        sliding_window_size (Optional[int]): size of the sliding window if None no sliding window is applied
        softcapping (Optional[float]): capping value used for soft caping, if None no capping is performed
        query_pre_attn_scalar (Optional[int]): value used for pre attention normalisation, if None head_dim is used instead
    Raises:
        ValueError:
            If ``num_heads % num_kv_heads != 0``, **or**
            if ``embed_dim % num_heads != 0``, **or**
            if ``attn_dropout < 0`` or ``attn_dropout > 1``, **or**
            if ``q_norm`` is defined without k_norm or vice versa
    Ni   Tg        g      I@)
pos_embeddingsq_normk_normkv_cachemax_seq_len	is_causalattn_dropoutsliding_window_sizesoftcappingquery_pre_attn_scalar	embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_projr   r   r	   r
   r   r   r   r   r   r   returnc                   s  t    || dkrtd| d| d|| dkr'td| d| d|dk s/|dkr7td| d	t|
t|A rCtd
|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _|| _|| _|
| _|| _|	| _|| _|| _|d ur|d | _n| jd | _d| _d S )Nr   znum_heads (z%) must be divisible by num_kv_heads ()zembed_dim (z") must be divisible by num_heads (   zattn_dropout (z) must be between 0.0 and 1.0z!q and k norm must be set togetherg      F)super__init__
ValueErrorboolr   r   r   r   r   r   r   r
   r   r   r   r   r   r	   r   r   r   scalingcache_enabled)selfr   r   r   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/gemma2/_attention.pyr   <   sL   

zGemma2Attention.__init__
batch_sizedtypec                 C   s:   | j durtd dS t||| j| j|d| _ d| _dS )aQ  Setup key value caches for attention calculation. If called
        after kv_cache is already setup, this will be skipped.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            max_seq_len (int): maximum sequence length model will be run with.
        NzWKey value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping.)r'   r   r   r   r(   T)r
   loggerwarningr   r   r   r!   )r"   r'   r(   r   r%   r%   r&   setup_cache   s   

zGemma2Attention.setup_cachec                 C   s    | j du r	td| j   dS )zzReset the key value caches.

        Raises:
            RuntimeError: if key value caches are not already setup.
        Nz>Key value caches are not setup. Call ``setup_caches()`` first.)r
   RuntimeErrorreset)r"   r%   r%   r&   reset_cache   s
   
zGemma2Attention.reset_cache)mask	input_posxyr/   r0   c                C   s8  |durt |tjstd|j\}}}|dur|jd nd}| |}	| j| j }
|	||| j|
 | j	}	| j
durB| j
|	|d}	|	dd}	| jdurR| |	}	|du rk| jdu s^| jsbtd| jj}| jj}n| |}| |}|||d| j	}| j
dur| j
||d}|||| jd| j	}|||| jd| j	}| j| jkr|||| j|
| j	}|||| j|
| j	}|||d| j	}|||d| j	}|dd}|dd}| jdur| |}| jdur| jr| j||\}}|	| j t|	|dd	}|du rttj||ftjd
|j}|j tjkr+t!|" dd}| j#durSt$|}t%|d| j# d t|| j#d  }t!|dk|d}|& d	kr_|'d}| j(durt|| j( }t)|}|| j( }|| }t*j+|, dd-|	}t||}|dd. ||d}| /|S )a  
        Args:
            x (torch.Tensor): input tensor with shape [b x s_x x d] for the query
            y (Optional[torch.Tensor]): second input tensor with shape [b x s_y x d], is the input
                for k and v. For self attention, x=y. Optional only with kv_cache enabled.
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b x s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Raises:
            NotImplementedError: If ``mask`` is provided, but mask is not an instance of ``torch.Tensor``.
            ValueError: If no ``y`` input and ``kv_cache`` is not enabled.

        Returns:
            torch.Tensor: output tensor with attention applied

        Notation used for tensor shapes:
            - b: batch size
            - s_x: sequence length for x
            - s_y: sequence length for y
            - n_h: num heads
            - n_kv: num kv heads
            - d: embed dim
            - h_d: head dim
        Nz5Block masks are not implemeted yet, use packed=False.r   r   )r0      zAMust provide y input or use kv_cache to enable streaming decoding   )sizer(   g<ff)dim)0
isinstancetorchTensorNotImplementedErrorshaper   r   r   viewr   r   	transposer   r
   r!   r   k_cachev_cacher   r   expandreshaper	   updatemul_r    matmultrilonesr   todevicer(   wherelogical_notr   	ones_liketriur7   	unsqueezer   tanhFsoftmaxfloattype_as
contiguousr   )r"   r1   r2   r/   r0   bs_x_s_yqq_per_kvkvoutputall_onessliding_maskr%   r%   r&   forward   s   1

















zGemma2Attention.forward)N)__name__
__module____qualname____doc__intr   Moduler   r   r   rR   r   r9   r(   r+   r.   r:   r   r`   __classcell__r%   r%   r#   r&   r      s    3	
I
r   )loggingtypingr   r9   torch.nn.functionalr   
functionalrP   !torchtune.modules.attention_utilsr   torchtune.modules.kv_cacher   	getLoggerra   r)   rf   r   r%   r%   r%   r&   <module>   s   
