o
    پia                  %   @   s6  d dl mZmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ ejdgddejdejd	ejd
ejdejdejdejdejdejdejdejdejdejfddZ		d?dejdejdejdejdejdedejdeded eej d!eejejf fd"d#ZG d$d% d%ejjZ						d@dejdejdejdejdejdedejded eej ded!eejejf fd&d'Zejdgdd(ejd)ejd*ejdejdejd	ejd
ejdejdejdejdejdejdejdejd+ejd,ejd-ejd.ejf$d/d0Z								dAdejdejdejdejdejded1ejd2ejded eej d3ed4ed5eej d6eej d7ee d8eej d!ejf"d9d:ZG d;d< d<ejjZ												dBdejdejdejdejdejded1ejd2ejd eej ded3ed4ed5eej d6eej d7ee d8eej d!ejf"d=d>ZdS )C    )OptionalTupleN)exp)input_guardT)do_not_specializeBHHVKVBKBVUSE_INITIAL_STATESTORE_FINAL_STATEIS_BETA_HEADWISEUSE_QK_L2NORM_IN_KERNEL	IS_VARLENIS_KDAc           8      C   s  t dt dt d}}}|| || }}|||  }|rBt || t jt || d t j}}|
} || }
n||
 ||
 |
 }}||
 } || t d| }!|| t d| }"| || | |  |! }#||| | |  |! }$||| | |  |" }%|r||| | |  |" }&n|||  | }&|s|||  | }'n||| | |  |! }(|||  | | | |  |" })|!|k }*|"|k }+|*d d d f |+d d d f @ },t j||gt jd}-|r||| |  |!d d d f |  |"d d d f  }.|-t j|.|,ddt j7 }-td|
D ]}/t j|#|*ddt j}0t j|$|*ddt j}1t j|%|+ddt j}2|r^|0t 	t 
|0|0 d  }0|1t 	t 
|1|1 d  }1|0|	 }0|sut |'t j}3|-t|39 }-nt |(t j}4|-t|4d d d f 9 }-|2t 
|-|1d d d f  d8 }2|rt j|&|+ddt j}5n	t |&t j}5|2|59 }2|-|1d d d f |2d d d f  7 }-t 
|-|0d d d f  d}6t j|)|6|)jj|+d |#|| 7 }#|$|| 7 }$|)|| 7 })|%|| 7 }%|s|'|7 }'n|(|| 7 }(|&||r|nd 7 }&q|rC||| |  |!d d d f |  |"d d d f  }7t j|7|-|7jj|,d d S d S )Nr         dtypemaskotherư>r   )tl
program_idloadtoint64arangezerosfloat32rangesqrtsumr   storer   
element_ty)8qkvgbetaoh0ht
cu_seqlensscaler   r   r	   r
   r   r   r   r   r   r   r   r   r   r   i_ki_vi_nhi_ni_hvi_hboseosallo_ko_vp_qp_kp_vp_betap_gp_gkp_omask_kmask_vmask_hb_hp_h0_b_qb_kb_vb_gb_gkb_betab_op_ht rU   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/fla/fused_recurrent.py+fused_recurrent_gated_delta_rule_fwd_kernel   s   "

  4 $
4rW   Fr+   r,   r-   r.   r/   r4   initial_stateoutput_final_stateuse_qk_l2norm_in_kernelr3   returnc
                 C   s  g |j |j d R \}
}}}}|j d }|	d u r|
nt|	d }t|tt|d}}t||t||}}|dksDJ dd}d}| j|g|j R  }|r`| j||||tjd}nd }|||| f}t	| d$i d| d	|d
|d|d|d|d|d|d|	d|d|d|
d|d|d|d|d|d|d|d ud|d ud|j
|j
kd|d|	d udd d!|d"| |d#}||fS )%Nr   r       NK > 1 is not supported yet   r   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   F	num_warps
num_stagesr   rU   )shapelentritonnext_power_of_2mincdiv	new_emptytorchr%   rW   ndimsqueeze)r+   r,   r-   r.   r/   r4   rX   rY   rZ   r3   r   r   r	   r   r   r
   Nr   r   NKNVra   r`   r0   final_stategridrU   rU   rV   $fused_recurrent_gated_delta_rule_fwd|   s    
	

rq   c                   @   sn   e Zd Zee		ddejdejdejdejdejded	ejd
ede	ej
 defddZeedd ZdS )FusedRecurrentFunctionNFr+   r,   r-   r.   r/   r4   rX   rY   r3   rZ   c                 C   s(   t |||||||||
|	d
\}}||fS )N)
r+   r,   r-   r.   r/   r4   rX   rY   rZ   r3   )rq   )ctxr+   r,   r-   r.   r/   r4   rX   rY   r3   rZ   r0   ro   rU   rU   rV   forward   s   
zFusedRecurrentFunction.forwardc                 C      t dNzBackward pass is not implemented yet and we do not have plans to implement it because we haven't figured out how to compute dg without materializing the full hidden states for all time steps.NotImplementedErrorrs   dodhtrU   rU   rV   backward      zFusedRecurrentFunction.backward)NF)__name__
__module____qualname__staticmethodr   ri   Tensorfloatboolr   
LongTensorrt   r|   rU   rU   rU   rV   rr      s:    	
rr   c
                 C   s   |dur7| j d dkrtd| j d  d|dur7|j d t|d kr7tdt|d  d|j d  d|du rC|j d	 d
 }n|dksKJ d|du rVt| d }t| |||||||||	
\}
}|
|fS )a
  
    Args:
        q (torch.Tensor):
            queries of shape `[B, T, H, K]`.
        k (torch.Tensor):
            keys of shape `[B, T, H, K]`.
        v (torch.Tensor):
            values of shape `[B, T, HV, V]`.
            GVA is applied if `HV > H`.
        g (torch.Tensor):
            g (decays) of shape `[B, T, HV]`.
        beta (torch.Tensor):
            betas of shape `[B, T, HV]`.
        scale (Optional[int]):
            Scale factor for the RetNet attention scores.
            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
        initial_state (Optional[torch.Tensor]):
            Initial state of shape `[N, HV, K, V]` for `N` input sequences.
            For equal-length input sequences, `N` equals the batch size `B`.
            Default: `None`.
        output_final_state (Optional[bool]):
            Whether to output the final state of shape `[N, HV, K, V]`. Default: `False`.
        cu_seqlens (torch.LongTensor):
            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
            consistent with the FlashAttention API.
    Returns:
        o (torch.Tensor):
            Outputs of shape `[B, T, HV, V]`.
        final_state (torch.Tensor):
            Final state of shape `[N, HV, K, V]` if `output_final_state=True` else `None`.
    Examples::
        >>> import torch
        >>> import torch.nn.functional as F
        >>> from einops import rearrange
        >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
        # inputs with equal lengths
        >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512
        >>> q = torch.randn(B, T, H, K, device='cuda')
        >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1)
        >>> v = torch.randn(B, T, HV, V, device='cuda')
        >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda'))
        >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid()
        >>> h0 = torch.randn(B, HV, K, V, device='cuda')
        >>> o, ht = fused_gated_recurrent_delta_rule(
            q, k, v, g, beta,
            initial_state=h0,
            output_final_state=True
        )
        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
        >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta))
        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
        >>> o_var, ht_var = fused_gated_recurrent_delta_rule(
            q, k, v, g, beta,
            initial_state=h0,
            output_final_state=True,
            cu_seqlens=cu_seqlens
        )
    Nr   r   /The batch size is expected to be 1 rather than Q when using `cu_seqlens`.Please flatten variable-length inputs before processing.]The number of initial states is expected to be equal to the number of input sequences, i.e.,  rather than .r\         scale must be positive.r   )rb   
ValueErrorrc   ri   	ones_likerr   apply)r+   r,   r-   r.   r/   r4   rX   rY   r3   rZ   r0   ro   rU   rU   rV    fused_recurrent_gated_delta_rule   s>   G
r    stride_retrieve_parent_token_seq"stride_retrieve_parent_token_tokenNP2_TDISABLE_STATE_UPDATEDISABLE_OUTPUT_CALCULATIONCACHE_INTERMEDIATE_STATESHAS_EAGLE_TREE_CUSTOM_ATTN_MASKc!           H      C   s
  t dt dt d}!}"}#|#| |#| }$}%|%||  }&|rBt ||$ t jt ||$ d t j}'}(|})|(|' }n|$| |$| | }'}(|| })|!| t d| }*|"| t d| }+| |'| |& |  |* },||'| |& |  |* }-||'| |% |  |+ }.|r||'| |% |  |+ }/n||'|  |% }/||'|  |% }0||!|) |' | |% |  |+ }1| rt d|}2|2|k }3||$|  |2|  }4t |4|3}5|*|k }6|+|k }7|6d d d f |7d d d f @ }8t j||gt jd}9|r5t ||$ }:|:dkr5||:| | |  |%| |  |*d d d f |  |+d d d f  };|9t j|;|8ddt j7 }9d}<|rAt ||$ }<d}=td|D ]w}>| r|=dkr|<dkrt 	t 
|2|=k|5d}?|?| | | }@|
|<| | | |  |@ |%| |  |*d d d f |  |+d d d f  }At j|A|8ddt j}9t j|,|6ddt j}Bt j|-|6ddt j}Ct j|.|7ddt j}Dt |0t j}E|r|Bt t 	|B|B d  }B|Ct t 	|C|C d  }C|B|	 }B|9t|E9 }9|Dt 	|9|Cd d d f  d8 }D|rt j|/|7ddt j}Fn	t |/t j}F|D|F9 }D|9|Cd d d f |Dd d d f  7 }9|sQt 	|9|Bd d d f  d}Gt j|1|G|1jj|7d |r|<dkr|=| | | }@|
|<| | | |  |@ |%| |  |*d d d f |  |+d d d f  }At j|A|9|Ajj|8d |=d7 }=|,|| 7 },|-|| 7 }-|1|| 7 }1|.|| 7 }.|0|7 }0|/||r|nd 7 }/qH|st ||$ }:|:dkr||:| | |  |%| |  |*d d d f |  |+d d d f  };t j|;|9|;jj|8d d S d S d S )	Nr   r   r   r   r   r\   r   r   )r   r   r    r!   r"   r#   r$   r%   r&   r(   wherer'   r   r)   r   r*   )Hr+   r,   r-   r.   r/   r0   	h0_source
h0_indicesr3   r4   intermediate_states_bufferintermediate_state_indicescache_stepsretrieve_parent_token_ptrr   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rF   token_indicesmask_retrieveretrieve_parent_token_baseparent_idx_tokensrG   rH   rI   rJ   idxrK   	cache_idxstep_idxrL   parent_step_idxstep_offset	cache_ptrrM   rN   rO   rP   rR   rS   rU   rU   rV   2fused_recurrent_gated_delta_rule_update_fwd_kernelW  s  "$

  


 $



r   initial_state_sourceinitial_state_indicesdisable_state_updatedisable_output_calculationr   r   r   retrieve_parent_tokenc           "      C   s  g |j |j d R \}}}}}|j d }|	d u r|nt|	d }t|tt|d}}t||t||}}|dksDJ dd}d}|rT| |dddd}n
| j|g|j R  }|||| f}|d uru|d|d}} nd }} t|}!t| d+i d| d	|d
|d|d|d|d|d|d|	d|d|d|d|d u rdnY|d|d|d| d|d|!d|d|d|d|d|d|d |d!|d ud"|	d ud#|d ud$|d ud%|j	|j	kd&|d'|
d(|d)|d*| |
d}|S d|d|d| d|d|!d|d|d|d|d|d|d |d!|d ud"|	d ud#|d ud$|d ud%|j	|j	kd&|d'|
d(|d)|d*| |
d}|S ),Nr\   r   r      r^   r_   r   r+   r,   r-   r.   r/   r0   r   r   r3   r4   r   r   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r`   ra   rU   )rb   rc   rd   re   rf   rg   rh   strider   rj   rk   )"r+   r,   r-   r.   r/   r4   r   r   rZ   r3   r   r   r   r   r   r   r   r   r	   r   r   r
   rl   r   r   rm   rn   ra   r`   r0   rp   r   r   r   rU   rU   rV   +fused_recurrent_gated_delta_rule_update_fwd
  s   

	
 !"#
% !"#
%r   c                $   @   s   e Zd Zee								ddejdejdejdejdejded	ejd
ejdeej	 de
de
de
deej deej dee deej f ddZeedd ZdS )FusedRecurrentUpdateFunctionNFr+   r,   r-   r.   r/   r4   r   r   r3   rZ   r   r   r   r   r   r   c                 C   sn   t di d|d|d|d|d|d|d|d|d	|
d
|	d|d|d|d|d|d|}|S )Nr+   r,   r-   r.   r/   r4   r   r   rZ   r3   r   r   r   r   r   r   rU   )r   )rs   r+   r,   r-   r.   r/   r4   r   r   r3   rZ   r   r   r   r   r   r   r0   rU   rU   rV   rt   b  sD   	
z$FusedRecurrentUpdateFunction.forwardc                 C   ru   rv   rw   ry   rU   rU   rV   r|     r}   z%FusedRecurrentUpdateFunction.backward)NFFFNNNN)r~   r   r   r   r   ri   r   r   r   r   r   intrt   r|   rU   rU   rU   rV   r   `  s^    	
(r   c                 C   s  |d urR| j d dkrtd| j d  d|d urR|j d t|d kr7tdt|d  d|j d  d|j d |j d krRtd|j d  d	|j d  d|d u r^|j d
 d }n|dksfJ d|d u rqt| d }t| |||||||||	|
|||||}|S )Nr   r   r   r   r   r   r   ziThe number of intermediate state indices is expected to be equal to the number of input sequences, i.e., z != r\   r   r   r   )rb   r   rc   ri   r   r   r   )r+   r,   r-   r.   r/   r4   r   r   r3   rZ   r   r   r   r   r   r   r0   rU   rU   rV   'fused_recurrent_gated_delta_rule_update  s\   
r   )FN)NNNFNF)FNFFNNNN)NNNNNFFFNNNN)typingr   r   ri   rd   triton.languagelanguager   "sglang.srt.layers.attention.fla.opr   %sglang.srt.layers.attention.fla.utilsr   jit	constexprrW   r   r   r   r   rq   autogradFunctionrr   r   r   r   r   r   r   rU   rU   rU   rV   <module>   s  u	

</	

u ! <	

V;	
