o
    ©Ì³iþE  ã                   @   sÄ   d dl Z d dlmZ d dlZd dlm  mZ d dlmZ d dl	m
Z d dlmZmZ d dlm
Z
 e  e¡ZG dd„ dejƒZG dd	„ d	ejƒZd
ejjddfdd„Zd
ejjdejjfdd„ZdS )é    N)ÚOptional)Únn)ÚKVCache)Ú	_MaskTypeÚ_sdpa_or_flex_attentionc                !       sì   e Zd ZdZddddddddœdeded	ed
edejdejdejdejdeej deej deej dee dede	de
ddf ‡ fdd„Zdedejdeddfdd„Zdd„ Zdddœd ejd!ejd"ee d#eej dejf
d$d%„Z‡  ZS )&ÚMultiHeadAttentionuN  
    NOTE: torch.export.export() friendly MultiHeadAttention, modified from
    torchtune.modules.attention.MultiHeadAttention
    Major differences:
    - Rewrite `if y is None` to torch.cond().
      - Logic becomes `if all values of y are NaN`, to make torch.compile() happy.
      - No input mutations in both false and true branches, so we need to copy kv
        values back into kv cache after torch.cond().
    - Added a SDPA module
      - SDPA module includes transpose and expanding kv dimensions.
      - Makes it easy to swap with custom SDPAs that are needed by the users of exported
        program.
    - Uses new kv cache
      - This potentially can be merged with torchtune.modules.kv_cache.
      - Changed += to .add_ to avoid mutating module attributes.
      - Added clone() method.

    Multi-headed attention layer with support for grouped query
    attention (GQA) introduced in https://arxiv.org/abs/2305.13245v1.

    GQA is a version of multiheaded attention (MHA) which uses fewer
    key/value heads than query heads by grouping n query heads for each
    key and value head. Multi-Query Attention is an extreme
    version where we have a single key and value head shared by all
    query heads.

    Following is an example of MHA, GQA and MQA with num_heads = 4

    (credit for the documentation:
    `litgpt.Config <https://github.com/Lightning-AI/litgpt/blob/eda1aaaf391fd689664f95487ab03dc137e213fd/litgpt/config.py>`_).


    ::

        â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”     â”Œâ”€â”€â”€â”    â”Œâ”€â”€â”€â”             â”Œâ”€â”€â”€â”
        â”‚ v â”‚â”‚ v â”‚â”‚ v â”‚â”‚ v â”‚     â”‚ v â”‚    â”‚ v â”‚             â”‚ v â”‚
        â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜     â””â”€â”€â”€â”˜    â””â”€â”€â”€â”˜             â””â”€â”€â”€â”˜
        â”‚    â”‚    â”‚    â”‚         â”‚        â”‚                 â”‚
        â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”     â”Œâ”€â”€â”€â”    â”Œâ”€â”€â”€â”             â”Œâ”€â”€â”€â”
        â”‚ k â”‚â”‚ k â”‚â”‚ k â”‚â”‚ k â”‚     â”‚ k â”‚    â”‚ k â”‚             â”‚ k â”‚
        â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜     â””â”€â”€â”€â”˜    â””â”€â”€â”€â”˜             â””â”€â”€â”€â”˜
        â”‚    â”‚    â”‚    â”‚      â”Œâ”€â”€â”´â”€â”€â”  â”Œâ”€â”€â”´â”€â”€â”      â”Œâ”€â”€â”€â”€â”¬â”€â”€â”´â”€â”¬â”€â”€â”€â”€â”
        â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”  â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”  â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”â”Œâ”€â”€â”€â”
        â”‚ q â”‚â”‚ q â”‚â”‚ q â”‚â”‚ q â”‚  â”‚ q â”‚â”‚ q â”‚â”‚ q â”‚â”‚ q â”‚  â”‚ q â”‚â”‚ q â”‚â”‚ q â”‚â”‚ q â”‚
        â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜  â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜  â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜â””â”€â”€â”€â”˜
        â—€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â–¶  â—€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â–¶  â—€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â–¶
                MHA                    GQA                   MQA
        n_kv_heads =4          n_kv_heads=2           n_kv_heads=1

    Args:
        embed_dim (int): embedding dimension for the model
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``,
            for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``.
        head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``.
        q_proj (nn.Module): projection layer for query.
        k_proj (nn.Module): projection layer for key.
        v_proj (nn.Module): projection layer for value.
        output_proj (nn.Module): projection layer for output.
        pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings.
        q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied
            before updating from kv_cache. This means it will only support token wide normalization and not
            batch or sequence wide normalization.
        k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is.
        kv_cache (Optional[KVCache]): KVCache object used to cache key and value
        max_seq_len (int): maximum sequence length supported by the model.
            This is needed to compute the RoPE Cache. Default: 4096.
        is_causal (bool): sets the default mask to causal when no mask is provided
        attn_dropout (float): dropout value passed onto the scaled_dot_product_attention function.
            Default value is 0.0.

    Raises:
        ValueError: If ``num_heads % num_kv_heads != 0``
        ValueError: If ``embed_dim % num_heads != 0``
        ValueError: If ``attn_dropout < 0`` or ``attn_dropout > 1``
        ValueError: if q_norm is defined without k_norm or vice versa
    Ni   Tç        )Úpos_embeddingsÚq_normÚk_normÚkv_cacheÚmax_seq_lenÚ	is_causalÚattn_dropoutÚ	embed_dimÚ	num_headsÚnum_kv_headsÚhead_dimÚq_projÚk_projÚv_projÚoutput_projr	   r
   r   r   r   r   r   Úreturnc             	      s"  t ƒ  ¡  || dkrtd|› d|› dƒ‚|| dkr'td|› d|› dƒ‚|dk s/|dkr7td|› d	ƒ‚t|
ƒt|ƒA rCtd
ƒ‚|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|| _|| _|| _|
| _|| _|	| _tƒ | _t| j| j| j| jr| jnd| j
| j| jd| _d| _d S )Nr   znum_heads (z%) must be divisible by num_kv_heads (ú)zembed_dim (z") must be divisible by num_heads (é   zattn_dropout (z) must be between 0.0 and 1.0z!q and k norm must be set togetherr   )r   r   r   r   r   Úattention_fnr   F)ÚsuperÚ__init__Ú
ValueErrorÚboolr   r   r   r   r   r   r   r   r   r   r   r   r
   r   r	   r   Ú_attention_callÚSDPAÚtrainingÚ_sdpaÚcache_enabled)Úselfr   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   ©Ú	__class__© úW/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/modules/_export/attention.pyr   f   sV   
ÿÿÿÿù
zMultiHeadAttention.__init__Ú
batch_sizeÚdtypec                 C   sF   | j durt d¡ dS t||| j| j|dd| _ | j | j_ d| _dS )aQ  Setup key value caches for attention calculation. If called
        after kv_cache is already setup, this will be skipped.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            max_seq_len (int): maximum sequence length model will be run with.
        NzWKey value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping.F)r*   r   r   r   r+   Útranspose_cacheT)r   ÚloggerÚwarningÚInferenceKVCacher   r   r#   r$   )r%   r*   r+   r   r(   r(   r)   Úsetup_cache°   s   
ÿú

zMultiHeadAttention.setup_cachec                 C   s    | j du r	tdƒ‚| j  ¡  dS )zReset the key value caches.Nz>Key value caches are not setup. Call ``setup_caches()`` first.)r   ÚRuntimeErrorÚreset©r%   r(   r(   r)   Úreset_cacheÌ   s
   
ÿzMultiHeadAttention.reset_cache)ÚmaskÚ	input_posÚxÚyr5   r6   c                   s(  |j \‰ }}ˆ |¡}ˆjˆj }| ˆ |ˆj| ˆj¡}ˆjdur)ˆj|ˆd}ˆjdur3ˆ |¡}‡ ‡‡fdd„‰‡fdd„}	‡‡fdd„}
ˆjdu r\|dusUJ d	ƒ‚ˆ|ƒ\}}n(t	 
t	 |¡ ¡  ¡ |	|
|f¡\}}}ˆjj |¡ ˆjj |¡ ˆjj |¡ ˆj|||ˆ ||d
}ˆ |¡S )a*  
        Args:
            x (torch.Tensor): input tensor with shape [b x s_x x d] for the query
            y (torch.Tensor): second input tensor with shape [b x s_y x d], is the input
                for k and v. For self attention, x=y. If all values are NaN, we read from kv cache.
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b x s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Returns:
            torch.Tensor: output tensor with attention applied

        Notation used for tensor shapes:
            - b: batch size
            - s_x: sequence length for x
            - s_y: sequence length for y
            - n_h: num heads
            - n_kv: num kv heads
            - d: embed dim
            - h_d: head dim
        N©r6   c                    sv   | j d }ˆ | ¡}ˆ | ¡}| ˆ |dˆj¡}| ˆ |dˆj¡}ˆjd ur-ˆj|ˆd}ˆjd ur7ˆ |¡}||fS )Nr   éÿÿÿÿr9   )Úshaper   r   Úviewr   r	   r   )r8   Ús_yÚkÚv)Úbr6   r%   r(   r)   Úcalculate_kv  s   





z0MultiHeadAttention.forward.<locals>.calculate_kvc                    s   ˆ j  ¡ }|j|j|jfS ©N)r   ÚcloneÚk_cacheÚv_cacheÚ	cache_pos)r8   r   r3   r(   r)   Útrue_fn'  s   
z+MultiHeadAttention.forward.<locals>.true_fnc                    s2   ˆ | ƒ\}}ˆj  ¡ }| ||¡ |j|j|jfS rB   )r   rC   ÚupdaterD   rE   rF   )r8   r>   r?   r   )rA   r%   r(   r)   Úfalse_fn+  s   
z,MultiHeadAttention.forward.<locals>.false_fnzAMust provide y input or use kv_cache to enable streaming decoding)r5   )r;   r   r   r   r<   r   r	   r
   r   ÚtorchÚcondÚisnanÚallÚitemrD   Úcopy_rE   rF   r#   r   )r%   r7   r8   r5   r6   Ús_xÚ_ÚqÚq_per_kvrG   rI   r>   r?   rF   Úoutputr(   )r@   rA   r6   r%   r)   ÚforwardÔ   s0   .





ÿ
ÿ
zMultiHeadAttention.forward)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úintr   ÚModuler   r   r   Úfloatr   rJ   r+   r0   r4   ÚTensorr   rU   Ú__classcell__r(   r(   r&   r)   r      s‚    [ïýüûúùø	÷
öõôóòñðïîJÿÿÿ
þúþýûúùr   c                       sp   e Zd ZdZdedededededdf‡ fd	d
„Z	ddej	dej	dej	dedede
e dej	fdd„Z‡  ZS )r!   zr
    TorchTune's SDPA which can be optimized and can be swapped
    out for a more efficient implementations.
    r   r   r   r   r   r   Nc                    sF   t ƒ  ¡  || _|| _|| _| j| j | _|| _|| _|| _|| _	d S rB   )
r   r   r   r   r   rS   r   r   Ú_attention_fnr   )r%   r   r   r   r   r   r   r   r&   r(   r)   r   M  s   


zSDPA.__init__rR   r>   r?   ÚbszÚseq_lenr5   c           	      C   s¶   |  dd¡}|  dd¡}|  dd¡}| j| jkr8dd| jddf}| d¡ |¡ dd¡}| d¡ |¡ dd¡}| j||||| j| j	d u oK|d u oK| j
d}|  dd¡ ¡  ||d¡S )Nr   é   r:   )r5   Ú	dropout_pr   )Ú	transposer   r   rS   Ú	unsqueezeÚexpandÚflattenr_   r   r   r   Ú
contiguousr<   )	r%   rR   r>   r?   r`   ra   r5   Úexpand_shaperT   r(   r(   r)   rU   a  s    ú	zSDPA.forwardrB   )rV   rW   rX   rY   rZ   r\   r   r   rJ   r]   r   r   rU   r^   r(   r(   r&   r)   r!   G  s>    þýüûú	÷ùþýüûúùør!   Úmoduler   c                 C   sx   |   ¡ D ]5\}}t|tjƒr5t| |t|j|j|j|j|j	|j
|j|j|j|j|j|j|j|j|jdƒ qt|ƒ qd S )N)r   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   )Únamed_childrenÚ
isinstanceÚTorchTuneAttentionr   Úsetattrr   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   Úreplace_mha_with_inference_mha)rj   ÚnameÚchildr(   r(   r)   Ú_replace_mha_with_inference_mha…  s2   ñý
èrr   c                 C   s   t | ƒ | S )z˜
    Replace TorchTune's MHA with an inference friendly version of MHA that
    separates out the inference-related parts for further optimization.
    )rr   )rj   r(   r(   r)   ro   ¡  s   ro   )ÚloggingÚtypingr   rJ   Útorchtune.modules.attentionÚmodulesÚ	attentionrm   r   Ú"torchtune.modules._export.kv_cacher   r/   Ú!torchtune.modules.attention_utilsr   r   Útorchtune.modules.kv_cacheÚ	getLoggerrV   r-   r[   r   r!   rr   ro   r(   r(   r(   r)   Ú<module>   s   
  4>