o
    پi:                     @  sr   d Z ddlmZ ddlZddlmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZ G dd dZdS )	zViT CUDA Graph Runner class.    )annotationsN)DictHashableListOptionalTuple)VisionAttention)get_global_server_argsc                   @  s   e Zd ZdZd*ddZed+d	d
Zed,ddZd-ddZd.ddZ				d/d0ddZ
		d1d2d"d#Z				d3d4d%d&Z			d/d5d(d)ZdS )6ViTCudaGraphRunnera  Generic ViT CUDA Graph Runner.

    This runner captures the "blocks + merger + deepstack merger (optional)" part
    of a vision transformer into a CUDA graph and replays it for identical shapes.

    Optional for Qwen2.5 windowed attention:
      - vit.fullatt_block_indexes: Sequence[int]
      - run() provides both cu_seqlens and cu_window_seqlens

    Optional for Qwen3 deepstack:
      - vit.deepstack_vision_indexes: Sequence[int]
      - vit.deepstack_merger_list: nn.ModuleList (same length as deepstack_vision_indexes)
    vit	nn.ModulereturnNonec                 C  s   || _ i | _i | _i | _i | _i | _i | _i | _i | _d | _	t
|dd | _tt
|dd| _tt
|dg p6g | _t
|dd | _|jd }dt|jjv | _t
|dd | _t
| jd	d | _d S )
Nmax_context_lenfullatt_block_indexes deepstack_visual_indexesdeepstack_merger_listr   	output_wsattnqkv_backend)r   block_inputblock_wsblock_graphsblock_outputcu_full_lencu_window_lencu_full_len_kkcu_window_len_kk
sin_cos_wsgetattrr   set_fullatt_block_indexeslist_deepstack_visual_indexes_deepstack_merger_listblocksinspect	signatureforward
parameters_blk_accepts_output_ws_attn_attn_backend)selfr   	first_blkr   r   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/vit_cuda_graph_runner.py__init__,   s*   
zViTCudaGraphRunner.__init__torch.devicec                 C     | j jS N)r   devicer.   r   r   r0   r5   S      zViTCudaGraphRunner.devicetorch.dtypec                 C  r3   r4   )r   dtyper6   r   r   r0   r9   W   r7   zViTCudaGraphRunner.dtypeseq_leninthead_dimc                 C  s   | j d u r,| jp	|}t||}tj||| j| jd}tj||| j| jd}||f| _ d S | j d d|k r`t| j d dd |}tj||| j| jd}tj||| j| jd}||f| _ d S d S )N)r9   r5   r      )r   r   maxtorchemptyr9   r5   size)r.   r:   r<   	max_shapecos_wssin_wsr   r   r0   _ensure_sin_cos_ws[   s(   


z%ViTCudaGraphRunner._ensure_sin_cos_wsx_3dtorch.Tensorc                 C  s
   |j d S )Nr   )shape)r.   rF   r   r   r0   _get_graph_keyq   s   
z!ViTCudaGraphRunner._get_graph_keyN	graph_keyposition_embeddings+Optional[Tuple[torch.Tensor, torch.Tensor]]rotary_pos_emb_cosOptional[torch.Tensor]rotary_pos_emb_sinc              	   C  s4  t j }| j}| jr| j| }| j| }t| 	 }	| j
| }
| j| }t| 	 }t j}t j| d }g }d}t|jD ]\}}| jr_||jv rX|
}|}|}n|}|}|	}n|
}|}|}|dkro|||g}n|dkrx||g}ntd|d ur|dkr|| j| ||| j| d}n4||||| j| d}n(|d ur|d ur|dkr|| j| |||| j| d}n|||||| j| d}| jr|| jv r| jd u rtd| j| |}|| |d7 }qE||}|rt j|g| dd	| j|< n|| j|< W d    n	1 sw   Y  || j|< d S )
Nr   triton_attnfa3z#Not supported ViT attention backend)
cu_seqlensrK   r   )rR   rM   rO   r   zEdeepstack_visual_indexes exists but deepstack_merger_list is missing.   )dim)r?   cuda	CUDAGraphr   r"   r   r   r;   r>   itemr   r   r	   mm_attention_backendgraph	enumerater&   r   RuntimeErrorr   r   r$   r%   appendmergercatr   r   )r.   rJ   rK   rM   rO   rY   r   	cu_windowcu_window_kkmax_window_lencu_full
cu_full_kkmax_full_lenoverride_backendydeepstack_outsdeepstack_capture_idx	layer_numblkcu_seqlens_nowcu_seqlens_kk_nowmax_lencu_seq_len_wsdeepstack_outmain_outr   r   r0   _create_graphu   s   














Tz ViTCudaGraphRunner._create_graphrR   cu_window_seqlensc                 C  s:  | j }| |}|| jv r|S |jd j}	|	j}
|	j}|| jvrItj	|| j
d | j|< tj	|| j
d | j|< tj||
|| j
| jd| j|< | jrz|| jvry|| j|< || j|< |dd  |d d  | j|< |dd  |d d  | j|< n|| jvr|| j|< |dd  |d d  | j|< |d ur|d jd }| || | jd d |d d f }| jd d |d d f }||d  ||d  ||f}| j||d |S |d ur|d ur|jd }| || | jd d |d d f }| jd d |d d f }|| || | j|d ||d |S )Nr   )r5   )r5   r9   rS   )rJ   rK   )rJ   rK   rM   rO   )r   rI   r   r&   r   !num_attention_heads_per_partition	head_sizer   r?   
empty_liker5   
contiguousr   r@   r9   r   r"   r   r   r   r   rH   rE   r   copy_rq   )r.   rF   rR   rr   rK   rM   rO   r   rJ   attn_module	num_headsattn_head_dimr<   used_cos_wsused_sin_wspersist_position_embeddingsr   r   r0   create_graph   s|   


	







zViTCudaGraphRunner.create_graphoutput_indicesc                 C  s"  |d ur:|d j d }| || | jd d |d d f }| jd d |d d f }	||d  |	|d  n7|d urq|d urq|j d }| || | jd d |d d f }| jd d |d d f }	|| |	| | j| | | j|   | j| }
|d ur|
d|}
|
S )Nr   rS   )	rH   rE   r   rx   r   r   replayr   index_select)r.   rJ   rF   rK   rM   rO   r   r<   r|   r}   outr   r   r0   r   6  s(   




zViTCudaGraphRunner.replayxc           
      C  sJ   | d}| |}	|	| jvr| j||||||d | j|	|||||dS )NrS   )rF   rK   rR   rr   rM   rO   )rJ   rF   rK   rM   rO   r   )	unsqueezerI   r   r   r   )
r.   r   rR   rr   rK   rM   rO   r   rF   rJ   r   r   r0   run_  s&   


	zViTCudaGraphRunner.run)r   r   r   r   )r   r2   )r   r8   )r:   r;   r<   r;   )rF   rG   r   r;   )NNN)rJ   r;   rK   rL   rM   rN   rO   rN   )NN)rF   rG   rR   rG   rr   rG   rK   rL   rM   rN   rO   rN   r   r;   )NNNN)rJ   r;   rF   rG   rK   rL   rM   rN   rO   rN   r   rN   r   rG   )r   rG   rR   rG   rr   rG   rK   rL   rM   rN   rO   rN   r   rN   r   rG   )__name__
__module____qualname____doc__r1   propertyr5   r9   rE   rI   rq   r   r   r   r   r   r   r0   r
      s2    
'

	wV/r
   )r   
__future__r   r'   typingr   r   r   r   r   r?   torch.nnnn"sglang.srt.layers.attention.visionr   sglang.srt.server_argsr	   r
   r   r   r   r0   <module>   s   