o
     i&                     @   s   d dl Z d dlm  mZ d dlmZmZ G dd de jj	Z
e
jZG dd de jj	ZejZG dd de jj	ZejZdd	d
Zdd Zdd ZdS )    N)	rearrangerepeatc                   @   $   e Zd Zedd Zedd ZdS )IndexFirstAxisc              	   C   sh   |  | |jdksJ |jd |jdd  | _}| }tt|ddt|d|dj	dg|R  S )N   r      b ... -> b (...)z -> z dd)
save_for_backwardndimshapefirst_axis_dimnumeltorchgatherr   r   reshape)ctxinputindicesother_shape
second_dim r   U/home/ubuntu/.local/lib/python3.10/site-packages/xformers/_flash_attn/bert_padding.pyforward	   s   
zIndexFirstAxis.forwardc                 C   s   | j \}|jdksJ |jdd  }t|d}tj| j|jd g|j|jd}|	dt
|d|jd d| |j| jg|R  d fS )Nr   r   r   devicedtyper   r	   r
   )saved_tensorsr   r   r   r   zerosr   r   r   scatter_r   r   )r   grad_outputr   r   
grad_inputr   r   r   backward   s   
zIndexFirstAxis.backwardN__name__
__module____qualname__staticmethodr   r%   r   r   r   r   r      s
    
r   c                   @   r   )IndexPutFirstAxisc                 C   sZ   |  | |jdksJ |jdksJ tj|g|jdd  R |j|jd}|||< |S )Nr   r   r   )r   r   r   r!   r   r   r   )r   valuesr   r   outputr   r   r   r   *   s   
zIndexPutFirstAxis.forwardc                 C   s   | j \}|| }|d d fS N)r    )r   r#   r   grad_valuesr   r   r   r%   7   s   
zIndexPutFirstAxis.backwardNr&   r   r   r   r   r+   )   
    
r+   c                   @   r   )IndexFirstAxisResidualc                 C   sP   |  | |jdksJ |jd |jdd  | _}| }|| }|| fS )Nr   r   r   )r   r   r   r   r   detach)r   r   r   r   r   r-   r   r   r   r   D   s   
zIndexFirstAxisResidual.forwardc                 C   s   | j \}|jdksJ |jdd  }|jdd  |ksJ |}|j|jd gd|jd  R  }||}|d|| |j| jg|R  d fS )Nr   r   r   )r   )r    r   r   r   	expand_asscatter_add_r   )r   r#   grad_residualr   r   r$   r   r   r   r%   Q   s   "
zIndexFirstAxisResidual.backwardNr&   r   r   r   r   r1   C   r0   r1   c           	      C   s   |dur|| n|}|j dtjd}|j dtjd}tj| dd }|  }ttj	|dtjdd}t
t| d|||||fS )	a  
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    Nr   dimr   Fas_tupler   r   r   b s ... -> (b s) ...)sumr   int32nonzeroflattenmaxitemFpadcumsumindex_first_axisr   )	hidden_statesattention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr   max_seqlen_in_batch
cu_seqlensr   r   r   unpad_inputb   s   rN   c           
      C   s   |j dd}|d}tj||j|jdt|||dk }tj	|
 dd
 }|
 | }tj	|
 dd
 }|  }ttj|dtjdd	}	tt| d
|||	|fS )a  
    Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model).
    The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286).
    
    For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is:
        ```
        [
          [2, 3, 0, 0, 0, 0],
          [3, 2, 0, 0, 0, 0],
          [6, 0, 0, 0, 0, 0]
        ]
        ```
    , which refers to the 3D-attention mask:
        ```
        [
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [0, 0, 1, 0, 0, 0],
            [0, 0, 1, 1, 0, 0],
            [0, 0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
          ],
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [0, 0, 0, 1, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]
          ],
          [
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 1, 0, 0, 0],
            [1, 1, 1, 1, 0, 0],
            [1, 1, 1, 1, 1, 0],
            [1, 1, 1, 1, 1, 1]
          ]
        ]
        ```.

    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
    r   )r7   r   r   Fr8   r   r6   r:   r;   )r<   sizer   aranger   r   expandlen	unsqueezer>   r?   r@   rA   rB   rC   rD   r=   rE   r   )
rF   attention_mask_in_lengthlengthseqlenattention_mask_2dreal_indices_idxrJ   r   rL   rM   r   r   r   &unpad_input_for_concatenated_sequences   s   4
*rY   c                 C   s(   | j d }t| ||| }t|d|dS )a  
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    r   z(b s) ... -> b s ...)b)r   index_put_first_axisr   )rF   r   batchrV   r7   r-   r   r   r   	pad_input   s   

r]   r.   )r   torch.nn.functionalnn
functionalrB   einopsr   r   autogradFunctionr   applyrE   r+   r[   r1   index_first_axis_residualrN   rY   r]   r   r   r   r   <module>   s   
!I