o
    .i5                     @   s  d dl Z d dlmZ d dlmZmZ ejdejdejdejfddZG d	d
 d
Z		d6de j
de j
dede	dedee j
e j
f fddZ		d7de j
de j
dede	dB fddZ			d8de j
de j
dede	dB def
ddZ			d8de j
de j
dede	dB def
ddZejdejd ejd!ejd"ejd#ejd$ejfd%d&Zed' d(d(fd)e j
d*e j
d+ed,ed-ede j
fd.d/Zejd0ejd!ejd ejd#ejd$ejf
d1d2Z	(	(d9d3e j
d*e j
d,ed-ede j
f
d4d5ZdS ):    N)GroupCoordinator)tltritonHEAD_DIM	N_ROUNDED	IS_BASE_Ec                 C   s  t jddt j}t jddt j}t d|}t d|}|| ||  ||	  }t || }t ||k|tdkB td |}t j|dd}t |td kd|}||8 }|rpt 	|}t j
|dd}t |}nt |}t j
|dd}t |}||7 }|| ||	  }t || | || ||  ||  }|
| ||  ||	  }t || }|| }t ||k|tdkB td |}|rt 	|nt |}t | | }|| }t || | dS )aB  
    Apply the all-gathered lses to correct each local rank's attention
    output. we still need perform a cross-rank reduction to obtain the
    final attention output.

    Args:
        outputs_ptr (triton.PointerType):
            Pointer to input tensor of shape [ B, H, D ]
        lses_ptr (triton.PointerType):
            Pointer to input tensor of shape [ N, B, H ]
        new_output_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H, D ]
        vlse_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H ]
    r   )axis   infN)r   
program_idtoint64arangeloadwherefloatmaxexpsumlogexp2log2store)outputs_ptrnew_output_ptrlses_ptrvlse_ptroutputs_stride_Boutputs_stride_Houtputs_stride_Dlses_stride_Nlses_stride_Blses_stride_Hlse_idxr   r   r   	batch_idxhead_idx	d_offsetsnum_n_offsetslse_offsetslselse_maxlse_explse_accoutput_offsets
lse_offsetlse_tmplse_finallyfactoroutput r3   Y/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/ops/common.py_correct_attn_cp_out_kernel	   sX    $


r5   c                   @   s    e Zd ZdZdd Zdd ZdS )CPTritonContextzEThe CPTritonContext is used to avoid recompilation of the Triton JIT.c                 C   s
   d | _ d S Ninner_kernel)selfr3   r3   r4   __init__d   s   
zCPTritonContext.__init__c                 O   s4   | j d u r|| |i || _ d S | j | |  d S r7   r8   )r:   kernelgridregular_args
const_argsr3   r3   r4   call_kernelg   s   
zCPTritonContext.call_kernelN)__name__
__module____qualname____doc__r;   r@   r3   r3   r3   r4   r6   a   s    r6   Toutlsescp_rankctxis_lse_base_on_ereturnc                 C   sR  |du rt  }| jdkr| jd dkr| d} | jdks'J dt| j |jdkr8|jd dkr8|d}|jdkrI|jd dkrI|d}|jdksXJ dt|j | j\}}}|jd }|  \}	}
}| \}}}tj||f||f|j|j	d	}||df}| | |||	|
|||||f}|||d
}|j
t|g|R i | | |fS )ae  Correct the attention output using the all-gathered lses.

    Args:
        out: Tensor of shape [ B, H, D ]
        lses: Tensor of shape [ N, B, H ]
        cp_rank: Current rank in the context-parallel group
        ctx: Triton context to avoid recompilation

    Returns:
        Tuple of (out, lse) with corrected attention and final log-sum-exp.
    N   r	      z'expected out [B,H,D] or [B,1,H,D], got zAexpected lses [N,B,H] (optionally with a 1-sized extra dim), got r   devicedtype)r   r   r   )r6   ndimshapesqueezetuplestridetorchempty_stridedrO   rP   r@   r5   )rE   rF   rG   rH   rI   BHDNo_sBo_sHo_sDl_sNl_sBl_sHr)   r=   r>   r?   r3   r3   r4   correct_attn_outn   sJ   




rb   cp_attn_outcp_attn_lsecp_groupc                 C   sv   |j dkr| S |du rt }tj|j f|j |j|jd}| }|j|dd	|}t
| ||j||d\}}||fS )<
    cp_attn_out: [ B, H, D ]
    cp_attn_lse: [ B, H ]
    r	   N)rP   rO   r   dim)rI   )
world_sizer6   rV   emptyrR   rP   rO   
contiguous
all_gatherview_asrb   rank_in_group)rc   rd   re   rH   rI   rF   rE   r)   r3   r3   r4   _cp_lse_common   s&   

ro   F
return_lsec           
      C   sj   t | ||||d\}}|j|dd}|r3|jd |j }|j}	|dd||	 ||	d  f }||fS |S )rf   rH   rI   r	   rg   N)ro   reduce_scatterrR   ri   rn   )
rc   rd   re   rH   rp   rI   rE   r)   cp_num_headsrG   r3   r3   r4   cp_lse_ag_out_rs   s   

 rt   c                 C   s0   t | ||||d\}}||}|r||fS |S )rf   rq   )ro   
all_reduce)rc   rd   re   rH   rp   rI   rE   r)   r3   r3   r4   cp_lse_ag_out_ar   s   


rv   r[   rZ   Lmax	PAD_VALUEBLOCK_TBLOCK_Dc	                 C   sv  t d}	t d}
t d}|
| t d| }|| t d| }d}t|	D ]}|t || 7 }q)t ||	 }||k }|| }||k |@ }| |d d d f |  |d d d f  }||	| | d d d f |  |d d d f  }|d d d f |k }t ||g|t j}t j|||d d d f |@ d t j||d d d f |@ d}t j|||d d d f |@ d d S Nr   r	      )mask)r   r   r   ranger   fullfloat32r   )x_ptrout_ptrlengths_ptrr[   rZ   rw   rx   ry   rz   pid_bpid_tpid_doff_toff_din_startiseq_lent_maskin_row	valid_row	x_row_ptrout_row_ptrd_maskpad_valsx_valsr3   r3   r4   _pack_seq_kernel  s&   


(0 $r   r
   @   xlengths	pad_valueblock_tblock_dc                 C   s   | j }t|dkr|d }| |d}|j d }n| j \}}| }| }	t|  }
tj|	|
|f| j	| j
d}|	t|
|t||f}t| ||| |||
t|||ddd t|dkrp|	|
f|dd  }||}|S )	a  
    Pack sequences of different lengths into a batched tensor.

    Args:
        x: [N, ...] - input tensor where N is total number of tokens
        lengths: [B] - sequence lengths for each batch
        pad_value: value to use for padding
        block_t: block size for time dimension
        block_d: block size for feature dimension

    Returns:
        packed: [B, Lmax, ...] - packed tensor
    r|   r   rM   r	   rN   rK   )rx   ry   rz   	num_warps
num_stagesN)rR   lenreshapenumelintr   itemrV   rj   rO   rP   r   cdivr   r   )r   r   r   r   r   original_shaper[   
x_reshapedrZ   rX   rw   rE   r=   output_shaper3   r3   r4   pack_seq_triton7  s8   

r   rX   c                 C   sB  t d}t d}	t d}
|	| t d| }|
| t d| }d}t|D ]}|t || 7 }q)t || }||k }||k |@ }|| }| || | d d d f |  |d d d f  }||d d d f |  |d d d f  }|d d d f |k }t j||d d d f |@ d}t j|||d d d f |@ d d S r{   )r   r   r   r~   r   r   )
packed_ptrr   r   rX   rw   rZ   ry   rz   r   r   r   r   r   r   r   r   r   r   out_rowpacked_row_ptrr   r   packed_valsr3   r3   r4   _unpack_seq_triton_kernelt  s"   


0($r   packed_tensorc                 C   s   | j }t|dkr|dd \}}| ||d}|j d }n| j \}}}| }t|  }	tj|	|f| j| j	d}
|t
||t
||f}t| ||
| |||||ddd
 t|dkrm|	f|dd  }|
|}
|
S )a  
    Unpack a packed decode query tensor back to the original format.
    Efficient Triton implementation.

    Args:
        packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton
        lengths: [B] - sequence lengths for each batch
        block_t: block size for time dimension
        block_d: block size for feature dimension

    Returns:
        unpacked_tensor: [N, ...] where N = sum(lengths)
    rL   Nr|   rM   rN   rK   )ry   rz   r   r   )rR   r   r   r   r   r   rV   rj   rO   rP   r   r   r   )r   r   r   r   r   rX   rw   packed_reshapedrZ   r[   rE   r=   r   r3   r3   r4   unpack_seq_triton  s4   
r   )T)NT)NFT)r   r   )rV   vllm.distributed.parallel_stater   vllm.triton_utilsr   r   jit	constexprr5   r6   Tensorr   boolrT   rb   ro   rt   rv   r   r   r   r   r   r3   r3   r3   r4   <module>   s   W
K
'

	2
=-