o
    wiQd                  (   @   sN  d dl Z d dlZd dlZd dlmZmZ d dlZd dlm  m	Z
 ddlmZmZmZmZmZmZ eeZdZdd ZdLddZd	d
 ZdZe rmd dlmZ d dlmZ d dlmZ  d dlm!Z" d dl#m$Z$ dZ%dZn2e rddl&m'Z$ ddl&m(Z ddl&m)Z ddl&mZ  ddl&m!Z" dZ%dZndZdZdZ dZ"dZ$dZ%e rd dl*mZ+ d dl*mZ, eZ-eZ.dZ/dZn
dZ+dZ,dZ-dZ.dZ/ere0 de d Ze0 de d Ze0 de  Z!e0 de  ZdZ1erde2e 3ej4v Z1dd Z5dd  Z6d!ej7d"e8ej7ej7e9f fd#d$Z:d%ej7d&ej7d'ej7d!ej7d(e9f
d)d*Z;d+d, Z<d-d. Z=	dLd/ej7d0ej7d1ej7d2eej> fd3d4Z?ed5Z@daA	6												dMd7ej7d8ej7d9ej7d!eej7 d(e9d:eBd;eCd<eej7 d=eeC d>ee9 d?eBd@eeC dAeeB dBeejD dCeejD dDee9 dEee9 d2eej> dFeeE f&dGdHZFG dIdJ dJeddKZGdS )N    N)Optional	TypedDict   )is_flash_attn_2_availableis_flash_attn_3_availableis_flash_attn_greater_or_equal#is_flash_attn_greater_or_equal_2_10is_torch_npu_availableloggingc                 C   s$   | j dg| jdd R  }|| S )a  
    A local implementation of the PyTorch indexing operation `tensor[indices]` on the first axis,
    after flattening the first two dimensions of the tensor. This is functionally equivalent to
    FA2's `index_first_axis` and replaces the need to import it.
       N)reshapeshape)tensorindicesreshaped_tensor r   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/modeling_flash_attention_utils.py_index_first_axis%   s   r   c           	      C   s   |dur|| n|}|j dtjd}|j dtjd}tj| dd }|  }ttj	|dtjdd}t
| |||||fS )a  
    FA3-compatible unpad_input function.

    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    Nr   dimdtypeFas_tupler   r   r   )sumtorchint32nonzeroflattenmaxitemFpadcumsumr   )	hidden_statesattention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr   max_seqlen_in_batch
cu_seqlensr   r   r   _fa3_unpad_input1   s   r-   c                 C   sL   | j dd }tj|| g|R | j| jd}| ||< |j||g|R  S )a  
    FA3-compatible pad_input function.

    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    r   Ndevicer   )r   r   zerosr/   r   view)r%   r   batchseqlenr   outputr   r   r   _fa3_pad_inputP   s   "r5   )flash_attn_func)flash_attn_varlen_func)	pad_input)unpad_input)apply_rotary_embTr   )npu_apply_rotary_emb)npu_flash_attn_func)npu_flash_attn_varlen_funcF   flash_attn__func_varlen_funcunpad_input_fapad_input_fawindow_sizec                   C   s"   t  rdS t r
dS t rdS dS )z5Determine whether flash-attention can be used or not.TF)r   r   r	   r   r   r   r   is_flash_attn_available   s   rE   c                  C   s4   t  rdS t rt  S t rddlm}  |  S dS )zBDetermine whether flash-attention uses top-left or down-right maskFr   'is_npu_fa2_top_left_aligned_causal_mask)r   r   r   r	    integrations.npu_flash_attentionrG   rF   r   r   r   !flash_attn_supports_top_left_mask   s   rI   r&   returnc                 C   sV   | j dtjd}tj|  dd }|  }ttj	|dtjdd}|||fS )aq  
    Retrieves indexing data required to repad unpadded (ragged) tensors.

    Arguments:
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        indices (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input sequence.
        cu_seqlens (`torch.Tensor`):
            The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        max_seqlen_in_batch (`int`):
            Maximum sequence length in batch.
    r   r   Fr   r   r   )
r   r   r   r   r   r    r!   r"   r#   r$   )r&   r)   r   r+   r,   r   r   r   _get_unpad_data   s   rK   query_layer	key_layervalue_layerquery_lengthc                 C   s(  t |\}}}|jd |jd  }	kr4|ddd|	ddddf |ddd|	ddddf }}|j\}
}}}t||}t||}||krUt| |} |}|}|}n3|dkrsd}tj|
d tj| jd}|dd }| d} n|dd| df }|| |^} }}}}| |||||f||ffS )a   
    Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.

    This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
    tensors for query, key, value tensors.

    Arguments:
        query_layer (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
        query_length (`int`):
            Target length.
        unpad_input_func:
            The function to use for unpadding the input tensors.

    Return:
        query_layer (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    r   r   N)r   r/   )rK   r   r   r   aranger   r/   squeeze)rL   rM   rN   r&   rO   unpad_input_func	indices_kcu_seqlens_kmax_seqlen_in_batch_kseq_len
batch_size
kv_seq_lennum_key_value_headshead_dimcu_seqlens_qmax_seqlen_in_batch_q	indices_q_r   r   r   _upad_input   s6   *B


r_   c                 C   s   |  d| d| d} |  d|d|d}|  d|d|d}| }tj|d|jtjd}t||dk tj	| |jtjdf}|
 d }| |||||f||ffS )aI  
    This function returns necessary arguments to call `flash_attn_varlen_func`.
    All three query, key, value states will be flattened.
    Cumulative lengths of each examples in the batch will be extracted from position_ids.

    NOTE: ideally cumulative lengths should be prepared at the data collator stage

    Arguments:
        query (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        position_ids (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        query (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    r   r   r.   r   )r1   size
contiguousr   r   rP   r/   r   catr   r    )querykeyvalueposition_idsr]   cu_seq_lens
max_lengthr   r   r   *_prepare_flash_attention_from_position_ids,  s    
rj   c                  O   s   t dt t| i |S )NzThe function `prepare_fa2_from_position_ids` in `transformers.modeling_flash_attention_utils` is deprecated and will be removed in a future version. Please use `_prepare_flash_attention_from_position_ids` instead.)warningswarnFutureWarningrj   )argskwargsr   r   r   prepare_fa2_from_position_ids^  s
   rp   rd   re   rf   target_dtypec                 C   s\   |du r	| ||fS | j }|tjkr)td| d | |} ||}||}| ||fS )aG  
    PEFT usually casts the layer norms in float32 for training stability reasons
    therefore the input hidden states gets silently casted in float32. Hence, we need
    cast them back in float16 / bfloat16 just to be sure everything works as expected.
    This might slowdown training & inference so it is recommended to not cast the LayerNorms!

    Args:
        query (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        target_dtype (`torch.dtype`, *optional*):
            The dtype to convert the attention tensors to. Conversion can be ignored by
            not providing the target dtype.
    NzThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)r   r   float32loggerwarning_onceto)rd   re   rf   rq   input_dtyper   r   r   fa_peft_integration_checkf  s   





rx   z2.4.1        query_states
key_statesvalue_states	is_causaldropoutrg   softmax_scalesliding_windowuse_top_left_masksoftcapdeterministiccu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kattn_implementationc           (   	   K   s  |du rt }t}t}t}t}n|dkrt}t}t}t}d}n|dkr,t	}t
}t}t}d}|
s1|}n|o6|dk}toC|	duoC|jd |	k}|rLd|	|	fini }|rZ|dkrYtd	 n||d
< trw|du rstdu rqtjdddkat}||d< |dur||d< t| |||\} }}|duo| jd dko|dup|dkotj|dddk  }tdd ||||fD }|dur| jd }t| |||||\} }}}} }!| \}"}#|!\}$}%|| ||f|"|#|$|%||d|}&||&|||}'n~|s|r_| d}|du s|du rt| |||\} }}}} }!| \}}|!\}}n'| d| d| d} |d|d|d}|d|d|d}|| ||f||||||d|}'|'|d|'d|'d}'n|| ||f||d|}'t|'t rv|'d S |'S )aq  
    Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
    first unpad the input, then computes the attention scores and pad the final attention scores.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`, *optional*):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens.
        dropout (`float`):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        use_top_left_mask (`bool`, defaults to `False`):
            flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
        softcap (`float`, *optional*):
            Softcap for the attention logits, used e.g. in gemma2.
        deterministic (`bool`, *optional*):
            Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
        attn_implementation (`str`, *optional*):
            The attention implementation to use. If None, will default to the one based on the environment.
    Nflash_attention_3Tflash_attention_2Fr   rD   ry   zCFlash Attention 3 does not support dropout. Setting dropout to 0.0.	dropout_pFLASH_ATTENTION_DETERMINISTIC01r   r   r   r   )r   c                 s   s    | ]}|d uV  qd S Nr   ).0kwargr   r   r   	<genexpr>  s    
z+_flash_attention_forward.<locals>.<genexpr>)r[   rT   max_seqlen_qmax_seqlen_kr   causalr`   )r   r   )!r7   r6   r8   r9   HAS_FA3flash_attn_3_varlen_funcflash_attn_3_funcpad_input_fa3unpad_input_fa3flash_attn_2_varlen_funcflash_attn_2_funcpad_input_fa2unpad_input_fa2_flash_supports_window_sizer   rt   ru   	flash_241deterministic_gosenvirongetrx   r   diffallr_   ra   rj   r   r1   
isinstancetuple)(rz   r{   r|   r&   rO   r}   r~   rg   r   r   r   r   r   r   r   r   r   rq   r   ro   _flash_attn_varlen_func_flash_attn_func
_pad_input_unpad_input_is_fa3r   use_sliding_windowsflash_kwargsis_fa2_with_position_idsis_fa2_with_varlen_kwargsrW   r]   rh   max_seq_lensr[   rT   r\   rU   attn_output_unpadattn_outputr   r   r   _flash_attention_forward  s   1

&






r   c                   @   sF   e Zd ZU dZeej ed< eej ed< ee ed< ee ed< dS )FlashAttentionKwargsa  
    Keyword arguments for Flash Attention with Compile.

    Attributes:
        cumulative_seqlens_q (`torch.LongTensor`, *optional*)
            Gets cumulative sequence length for query state.
        cumulative_seqlens_k (`torch.LongTensor`, *optional*)
            Gets cumulative sequence length for key state.
        max_length_q (`int`, *optional*):
            Maximum sequence length for query state.
        max_length_k (`int`, *optional*):
            Maximum sequence length for key state.
    cumulative_seqlens_qcumulative_seqlens_kr   r   N)	__name__
__module____qualname____doc__r   r   
LongTensor__annotations__intr   r   r   r   r   I  s   
 r   )totalr   )ry   NNNFNNNNNNNN)Hinspectr   rk   typingr   r   r   torch.nn.functionalnn
functionalr"   utilsr   r   r   r   r	   r
   
get_loggerr   rt   r6   r   r-   r5   
FA_VERSION
flash_attnr   r7   r   flash_attn.bert_paddingr8   r   r9   r   flash_attn.layers.rotaryr:   HAS_FA2rH   r;   r<   r=   flash_attn_interfacer   r   r   r   r   globalsr   list	signature
parametersrE   rI   Tensorr   r   rK   r_   rj   rp   r   rx   r   r   boolfloatr   strr   r   r   r   r   r   <module>   s   


"
P2
)
	

 7