o
    ̳i                     @   s6   d dl mZ d dlZd dlmZ G dd dejZdS )    )OptionalN)nnc                	       sx   e Zd ZdZ		ddedededdf fd	d
Zdd ZddeddfddZ	dde	j
dee	j
 de	j
fddZ  ZS )Qwen2RotaryPositionalEmbeddingsau  
    RoPE Embeddings used in the Qwen2 model.
    Ref: https://huggingface.co/Qwen/Qwen2-7B-Instruct

    This class is not numerically equivalent to the RoPE Embedding module
    used by Llama2 and Llama3.

    Args:
        dim (int): Embedding dimension. This is usually set to the dim of each
            head in the attention module computed as ``embed_dim`` // ``num_heads``
        max_seq_len (int): Maximum expected sequence length for the
            model, if exceeded the cached freqs will be recomputed
        base (float): The base for the geometric progression used to compute
            the rotation angles
           .Adimmax_seq_lenbasereturnNc                    s(   t    || _|| _|| _|   d S N)super__init__r   r	   r   	rope_init)selfr   r   r	   	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/qwen2/_positional_embeddings.pyr      s
   
z(Qwen2RotaryPositionalEmbeddings.__init__c                 C   sR   d| j td| jdd | jd   | j   }| jd|dd | | j d S )Ng      ?r      thetaF
persistent)r	   torcharanger   floatregister_bufferbuild_rope_cacher   )r   r   r   r   r   r   +   s   &z)Qwen2RotaryPositionalEmbeddings.rope_initc                 C   sl   t j|| jj| jjd}t d|| j }t j||gdd}t j| |	 gdd}| j
d|dd d S )N)dtypedevicez
i, j -> ijr   cacheFr   )r   r   r   r   r   einsumr   catcossinr   )r   r   seq_idx	idx_thetafreqsr!   r   r   r   r   3   s   z0Qwen2RotaryPositionalEmbeddings.build_rope_cachex	input_posc                 C   s   | d}| d}|du r| jd| n| j| }|d|d|d }|dd|f |j}|d|df |j}|dd|jd d f }|d|jd d df }	tj|	 |fdd}
|| |
|  }||S )ae  
        Args:
            x (torch.Tensor): input tensor with shape
                [b, s, n_h, h_d]
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b, s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Returns:
            Tensor: output tensor with RoPE applied

        Notation used for tensor shapes:
            - b: batch size
            - s: sequence length
            - n_h: num heads
            - h_d: head dim

        TODO: The implementation below can be made more efficient
        for inference.
           r   Nr   .r    )	sizer!   viewtor   shaper   r#   type_as)r   r)   r*   seq_lenhead_dim
rope_cacher$   r%   x1x2rotatedx_outr   r   r   forwardD   s   


z'Qwen2RotaryPositionalEmbeddings.forward)r   r   )r   r   )__name__
__module____qualname____doc__intr   r   r   r   r   Tensorr   r8   __classcell__r   r   r   r   r      s.    r   )typingr   r   r   Moduler   r   r   r   r   <module>   s   