o
    ̳is'                     @   s0  U d dl Z d dlmZmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ e Ze jed< e	rfd dlmZmZmZ dd	 Ze Zejjd
ddejdejdejdedejf
ddZeejef ZnejZdeej dejfddZdeej dejfddZdeej defddZdefddZdS )    N)CallableListOptionalUnion)nn)_SUPPORTS_FLEX_ATTENTION)
get_loggerlog_once_log)	BlockMaskcreate_block_maskflex_attentionc                  C   s   zt jtddW S  tyC }  z/td|  d zt jtdddW W  Y d } ~ S  ty> }  z
td|  d  d } ~ ww d } ~ ww )	NF)dynamicz,Compiling flex_attention failed with error 'z%'. Retrying with mode='max-autotune'.zmax-autotune)r   modez-Compiling flex_attention failed with error: 'z', Updating your pytorch version to nightlies may solve it, or you can setin your config dataset.packed=False to avoid using flex attention.)torchcompiler   	Exceptionr
   info)e r   U/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/modules/attention_utils.pycompile_flex_attention   s"   

r   F)	recursiveqkv
block_maskreturnc                 C   s   t | |||dS )Nr   )flex_attention_compiled)r   r   r   r   r   r   r   compile_friendly_flex_attention2   s   r    seq_lensc                 C   sN   t | }g }t|D ]}tdd t| | D }|| q
t|}|S )a  
    Convert a batch tensor of seq lens into integer IDs denoting sample ownership.
    For example, seq_lens = [2, 3, 1] would return [0, 0, 1, 1, 1, 2].

    Args:
        seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
            shape (batch_size, n), where n is the max number of sequences in a pack and can vary
            across packs.

    Returns:
        Tensor: Document IDs of shape (batch_size, max_seq_len).
    c                 S   s(   g | ]\}}t j|f|t j|jd qS )dtypedevice)r   fulllongr$   .0iseq_lenr   r   r   
<listcomp>U   s    z3_get_document_ids_from_seq_lens.<locals>.<listcomp>)lenranger   cat	enumerateappendstack)r!   
batch_sizebatch_document_ids
sample_idxdocument_idsr   r   r   _get_document_ids_from_seq_lens@   s   

r6   c                 C   sJ   g }t | }t|D ]}dd t| | D }|tj|  q
t|S )a  
    Given a batch tensor of seq lens defining the lengths of samples in each pack,
    Construct a 2D block causal mask for each pack in the batch. For example, if
    a single sample's seq_lens is [3, 2, 1], the mask would be::

        mask = [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1],
        ]

    Args:
        seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
            shape (batch_size, n), where n is the max number of sequences in a pack and can vary
            across packs.


    Returns:
        Tensor: Block causal mask of shape (batch_size, max_seq_len, max_seq_len).
    c              
   S   s,   g | ]\}}t t j||t j|jd qS r"   )r   trilonesboolr$   r'   r   r   r   r+   z   s    z,create_block_causal_mask.<locals>.<listcomp>)r,   r-   r/   r0   r   
block_diagr1   )r!   batch_block_attn_masksr2   r4   block_attn_masksr   r   r   create_block_causal_mask_   s   

r=   c                    sJ   t r t|   j\}} d  fdd}t||d||ddS t| dS )a  
    Create a block causal document mask for a batch of packed sequences. If
    flex attention is supported by the current hardware, block causal logic and
    passing this into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full block causal
    mask. If on an older version, a standard 2D block causal mask is created and returned.

    Args:
        seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
            shape (batch_size, n), where n is the max number of sequences in a pack and can vary
            across packs.

    Returns:
        _MaskType: BlockMask or Tensor if torch version < 2.5.0.
    cudac                    s(   ||k} | |f  | |f k}||@ S )a  
            Defines the logic of a block causal mask by combining both a standard causal mask
            and a block diagonal document mask.

            See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
            for an illustration.
            r   )bhq_idxkv_idxcausal_maskdocument_maskr5   r   r   mask_mod   s   z*packed_block_causal_mask.<locals>.mask_modN)r$   )r!   )r   r6   shapetocreate_block_causal_mask_flexr=   )r!   r2   max_seq_lenrF   r   rE   r   packed_block_causal_mask   s   


	rK   c                  C   sp   t rdtjdtjdtjdtt dtdtdtjfdd	} | S dtjdtjdtjdtt dtdtdtjfd
d	} | S )aE  
    Helper function to decide when to call flex attention or SDPA. It will use
    flex attention if ALL of the following conditions are met, otherwise it will
    default to SDPA:
    - torch version >= 2.5.0
    - we are sample packing, therefore mask is a BlockMask
    - torch.cuda.get_device_capability() >= (7, 5)
    r   r   r   mask	dropout_p	is_causalr   c                 S   sv   t |trttdtjd |dkrtdt| |||dS |d ur/|d d d d d d d f }tj	j
| |||||dS )NzOUsing flex attention for attention computation since a BlockMask was passed in.)levelg        zCFlex attention does not support dropout. Please set dropout to 0.0.r   	attn_maskrM   rN   )
isinstancer   r	   r
   loggingDEBUG
ValueErrorr    r   
functionalscaled_dot_product_attentionr   r   r   rL   rM   rN   r   r   r   _attention_call   s4   
	z0_sdpa_or_flex_attention.<locals>._attention_callc                 S   s<   |d ur|d d d d d d d f }t jj| |||||dS )NrP   )r   rV   rW   rX   r   r   r   rY      s   	)r   r   Tensorr   	_MaskTypefloatr9   )rY   r   r   r   _sdpa_or_flex_attention   sB   

G
r]   ) rS   typingr   r   r   r   r   r   torchtune.utils._import_guardr   torchtune.utils._loggingr   r	   r
   Logger__annotations__!torch.nn.attention.flex_attentionr   r   rI   r   r   r   compilerdisablerZ   r    r[   r6   r=   rK   r]   r   r   r   r   <module>   sJ   

&
4