import os

import torch
import torch.nn.functional as F

from ..utils.import_utils import is_torch_npu_available


if is_torch_npu_available():
    import math

    import torch_npu
    from einops import rearrange, repeat
    from torch_npu import npu_rotary_mul


# Causal-mask alignment modes supported by the Ascend NPU fused-attention kernel.
TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE = 2
DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODE = 3

SPARSE_MODE = int(os.getenv("NPU_FA2_SPARSE_MODE", default=DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODE))
if SPARSE_MODE not in [TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE, DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODE]:
    raise ValueError(
        "Environment variable `NPU_FA2_SPARSE_MODE` can only be set as 2 (top-left aligned causal mask) "
        "or 3 (down-right aligned causal mask)."
    )
d Zdd ZG dd dejjZejZG dd dejjZejZdd ZdddZ			dddZ 					d ddZ!dd Z"dS )!    N   )is_torch_npu_available)	rearrangerepeat)npu_rotary_mul   NPU_FA2_SPARSE_MODE)defaultzEnvironment variable `NPU_FA2_SPARSE_MODE` can only be set as 2 (top-left aligned causal mask) or 3 (down-right aligned causal mask).c                 C   s4   | t vrtjtjddg| ddd t | < t |  S )z6Get or create attention mask for the specified device.i   device   )diagonal)ATTN_MASK_NPU_CACHEtorchtriuonesboolr
    r   j/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/integrations/npu_flash_attention.pyget_attn_mask_npu,   s   $r   c                   C   s   t  rttkS dS )NF)r   SPARSE_MODE!TOP_LEFT_ALIGNED_CAUSAL_MASK_MODEr   r   r   r   'is_npu_fa2_top_left_aligned_causal_mask3   s   r   c                   @   $   e Zd Zedd Zedd ZdS )IndexFirstAxisc              	   C   sh   |  | |jdksJ |jd |jdd  | _}| }tt|ddt|d|dj	dg|R  S )Nr   r   r   b ... -> b (...)z -> z dd)
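
# Cache behaviour, sketched (the device string is hypothetical):
#
#     mask_a = get_attn_mask_npu("npu:0")
#     mask_b = get_attn_mask_npu("npu:0")
#     assert mask_a is mask_b                      # second call is a cache hit
#     assert mask_a.shape == (2048, 2048) and mask_a.dtype == torch.bool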

# Adapted from flash-attn's bert_padding helpers: gather rows of the first axis
# with an explicit autograd backward.
class IndexFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # Flatten trailing dims, gather the requested rows, then restore the trailing shape.
        return torch.gather(
            rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
        ).reshape(-1, *other_shape)

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        grad_output = rearrange(grad_output, "b ... -> b (...)")
        grad_input = torch.zeros(
            [ctx.first_axis_dim, grad_output.shape[1]], device=grad_output.device, dtype=grad_output.dtype
        )
        # Scatter the incoming gradients back to their original row positions.
        grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis = IndexFirstAxis.apply
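
# Worked sketch of index_first_axis (small hypothetical shapes):
#
#     x = torch.arange(12.0).reshape(4, 3)
#     index_first_axis(x, torch.tensor([0, 2]))    # rows 0 and 2 -> shape (2, 3)
#
# Plain x[indices] would compute the same values; the gather/scatter formulation
# supplies an explicit custom backward.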
    

# Inverse of IndexFirstAxis: scatter rows into a zero-initialised first axis.
class IndexPutFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim >= 2
        output = torch.zeros(first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype)
        output[indices] = values
        return output

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        grad_values = grad_output[indices]
        return grad_values, None, None


index_put_first_axis = IndexPutFirstAxis.apply
    


def pad_input(hidden_states, indices, batch, seqlen):
    """
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    """
    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(output, "(b s) ... -> b s ...", b=batch)
t| d|||||fS )	a  
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
    Nr   )dimr.   F)as_tupler   )r   r   zb s ... -> (b s) ...)sumr   int32nonzeroflattenmaxitemFpadcumsumindex_first_axisr   )	rA   attention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr)   max_seqlen_in_batch
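
# Round-trip sketch for the two padding helpers (hypothetical shapes):
#
#     x = torch.randn(2, 8, 16)
#     mask = torch.ones(2, 8, dtype=torch.bool)
#     mask[1, 4:] = False                         # second sequence has only 4 valid tokens
#     unpadded, indices, cu_seqlens, max_len, seqused = unpad_input(x, mask)
#     # unpadded: (12, 16), cu_seqlens: tensor([0, 8, 12]), max_len: 8
#     repadded = pad_input(unpadded, indices, batch=2, seqlen=8)
#     # repadded equals x at valid positions; padded positions are zero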

def npu_flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False, **kwargs):
    keep_prob = 1.0 - dropout_p
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(q.shape[-1])

    if not causal:
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(q, k, v, head_num, "BSND", keep_prob=keep_prob, scale=softmax_scale)[0]
    else:
        attn_mask_npu = get_attn_mask_npu(q.device)
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(
            q,
            k,
            v,
            head_num,
            "BSND",
            keep_prob=keep_prob,
            scale=softmax_scale,
            atten_mask=attn_mask_npu,
            sparse_mode=SPARSE_MODE,
        )[0]

    return output
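
# Call sketch (requires an Ascend NPU; "BSND" layout is batch, seq, heads, head_dim):
#
#     q = k = v = torch.randn(2, 1024, 8, 128, dtype=torch.float16, device="npu")
#     out = npu_flash_attn_func(q, k, v, causal=True)   # (2, 1024, 8, 128)
#
# The causal path reuses the cached 2048x2048 mask together with sparse_mode=SPARSE_MODE.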

def npu_flash_attn_varlen_func(
    q,
    k,
    v,
    cu_seqlens_q=None,
    cu_seqlens_k=None,
    max_seqlen_q=None,
    max_seqlen_k=None,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    **kwargs,
):
    keep_prob = 1.0 - dropout_p
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(q.shape[-1])

    if not causal:
        head_num = q.shape[1]
        output = torch_npu.npu_fusion_attention(
            q,
            k,
            v,
            head_num,
            pse=None,
            atten_mask=None,
            scale=softmax_scale,
            keep_prob=keep_prob,
            input_layout="TND",
            actual_seq_qlen=tuple(cu_seqlens_q[1:].cpu().numpy().tolist()),
            actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
        )[0]
    else:
        attn_mask_npu = get_attn_mask_npu(q.device)
        head_num = q.shape[1]
        output = torch_npu.npu_fusion_attention(
            q,
            k,
            v,
            head_num,
            pse=None,
            padding_mask=None,
            atten_mask=attn_mask_npu,
            scale=softmax_scale,
            keep_prob=keep_prob,
            input_layout="TND",
            actual_seq_qlen=tuple(cu_seqlens_q[1:].cpu().numpy().tolist()),
            actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
            sparse_mode=SPARSE_MODE,
        )[0]

    return output
 


def npu_apply_rotary_emb(x, cos, sin, **kwargs):
    # cos/sin arrive as (seqlen, head_dim / 2) after chunking; repeat them back to the
    # full head dim and unsqueeze to 4-dim, e.g. (1, S, 1, D), before the fused kernel.
    if len(cos.shape) == 2 and cos.shape[-1] == x.shape[-1] // 2:
        cos = cos.repeat(1, 2)
        cos = cos.unsqueeze(0).unsqueeze(2)

    if len(sin.shape) == 2 and sin.shape[-1] == x.shape[-1] // 2:
        sin = sin.repeat(1, 2)
        sin = sin.unsqueeze(0).unsqueeze(2)

    return npu_rotary_mul(x, cos, sin)
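
# Rotary sketch (hypothetical shapes): half-dim tables are broadcast against x.
#
#     x = torch.randn(2, 1024, 8, 128, device="npu")
#     cos = torch.randn(1024, 64, device="npu")   # (S, D/2) -> repeated to (1, 1024, 1, 128)
#     sin = torch.randn(1024, 64, device="npu")
#     out = npu_apply_rotary_emb(x, cos, sin)     # (2, 1024, 8, 128)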