o
    
۾i7                     @   sP  d Z ddlZddlZddlm  mZ ddlmZ ddl	m
Z
 			d dejdejdejded	ed
edB dedB dejdB dejdB dejfddZ			d dejdejdejded	ed
edB dedB dejdB dejdB dejfddZe
deed 			d dejdejdejded	ed
edB dedB dejdB dejdB dejfddZ		d!dejdejdejdedB dedejfddZ			d"dejdejdejdedB dejdB dedejfddZ	d#dejdejdejdedB dejdB dedejfddZe
deed 			d"dejdejdejdedB dejdB dedejfddZdS )$a  
This file contains ops for ViT attention to be compatible with torch.compile
as there are operations here not supported by torch.compile (for instance,
`.item()` in flash attention)

Using these ops and wrapping vision blocks with `torch.compile` can speed up
throughput in vision models by ~5% relative on H100, and improve token
latencies by ~7% (see qwen2_5_vl for example usage)

To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0)
    N)current_platform)direct_register_custom_opqkv
batch_sizeis_rocm_aiter
fa_versionscale
cu_seqlens
max_seqlenreturnc	              
   C   s   i }	|rddl m}
 nddlm}
 t s|d ur||	d< | d}|d u r6tjd|d | |tj| j	d}|d u r<|n|
 }dd | ||fD \} }}|
| ||f||||dd	|d
|	}tj|d|d}|S )Nr   )flash_attn_varlen_funcr	      )stepdtypedevicec                 s       | ]	}t |d V  qdS )zb s ... -> (b s) ...Neinops	rearrange.0x r   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/ops/vit_attn_wrappers.py	<genexpr>2       z/flash_attn_maxseqlen_wrapper.<locals>.<genexpr>        F)cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_k	dropout_pcausalsoftmax_scalez(b s) h d -> b s h d)b)aiterr   #vllm.v1.attention.backends.fa_utilsr   is_rocmsizetorcharangeint32r   itemr   r   )r   r   r   r   r   r	   r
   r   r   kwargsr   q_lenoutputcontext_layerr   r   r   flash_attn_maxseqlen_wrapper   s<   
r3   c	           	      C   
   t | S Nr+   
empty_like	r   r   r   r   r   r	   r
   r   r   r   r   r   !flash_attn_maxseqlen_wrapper_fakeD   s   
r9   )op_nameop_func	fake_implc	           	      C   s   t jj| ||||||||	S r5   )r+   opsvllmr3   r8   r   r   r   vit_flash_attn_wrapperY   s   r?   F
enable_gqac                 C   s@   dd | ||fD \} }}t j| ||d||d}t|d}|S )zI
    Input shape:
    (batch_size x seq_len x num_heads x head_size)
    c                 s   r   )zb s h d -> b h s dNr   r   r   r   r   r   |   r   zapply_sdpa.<locals>.<genexpr>r   )r#   r
   r@   zb h s d -> b s h d )Fscaled_dot_product_attentionr   r   )r   r   r   r
   r@   r1   r   r   r   
apply_sdpaq   s   rC   c                 C   s   t  r|  } | }| }|d u rt| ||||dS g }|dd  |d d   }tj| |dd}tj||dd}	tj||dd}
t||	|
D ]\}}}t|||||d}|| qKtj	|dd}|S )N)r
   r@   r   )dim)
r   r)   
contiguousrC   tolistr+   splitzipappendcat)r   r   r   r
   r   r@   outputslensq_chunksk_chunksv_chunksq_ik_iv_ioutput_ir2   r   r   r   torch_sdpa_wrapper   s    
rU   c                 C   r4   r5   r6   r   r   r   r
   r   r@   r   r   r   torch_sdpa_wrapper_fake   s   
rW   c                 C   s   t jjj| |||||dS )N)r@   )r+   r=   r>   rU   rV   r   r   r   vit_torch_sdpa_wrapper   s   rX   )NNN)NF)NNF)F)__doc__r   r+   torch.nn.functionalnn
functionalrA   vllm.platformsr   vllm.utils.torch_utilsr   Tensorintboolfloatr3   r9   r?   rC   rU   rW   rX   r   r   r   r   <module>   s6  
	

4	

	



%
