o
    ÓÙ¾iI-  ã                   @   s†  d dl Z d dlZd dlmZ d dlmZ ejejddiddejddiddejdd	iddejdd
iddejddiddgdgdej	dej
dej
fdd„ƒƒZej	dej
dej
dej
dej
dej
f
dd„ƒZej	dej
dej
fdd„ƒZ			d-de jde jde jdededefd d!„Z		d.de jd"e jd#e jd$e jd%e jd&e jd'e jd(e jdedefd)d*„Ze ¡ rÁd+d,lmZ eZdS dS )/é    N)Úcurrent_platformÚBLOCK_Né@   é   )Ú	num_warpsé€   é   é   i   i   é   Ú	inner_dim)ÚconfigsÚkeyÚscale_constantc                 C   sü   t  d¡}t  d¡}||
 t  d|
¡ }||k }|| }|| | }| | | }|| }|| }||	 }|| | }|||  | }|||  | }t j||dd}t j||dd}t j||dd}t j|
g||jd}|||  | }t j|||d d S )Nr   é   g        ©ÚmaskÚother©Údtype©r   ©ÚtlÚ
program_idÚarangeÚloadÚfullr   Ústore)Ú
output_ptrÚnormalized_ptrÚ	scale_ptrÚ	shift_ptrr   Úrowsr   Úseq_lenÚ
num_framesÚframe_seqlenr   Úpid_rowÚpid_colÚcol_offsetsr   Úrow_baseÚ	norm_ptrsÚout_ptrsÚb_idxÚt_idxÚframe_idx_in_batchÚscale_row_idxÚ
scale_ptrsÚ
shift_ptrsÚ
normalizedÚscaleÚshiftÚscale_const_tensorÚoutput© r6   úb/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/diffusion/triton/scale_shift.pyÚ_fused_scale_shift_4d_kernel   s&   

r8   ÚSCALE_IS_SCALARÚSHIFT_IS_SCALARÚBLOCK_LÚBLOCK_Cc           &      C   s¶  t  d¡}t  d¡}t  d¡}|| t  d|¡ }|| t  d|¡ }||k }||k }|d d …d f |d d d …f @ }|| |d d …d f |	  |d d d …f |
  }t j| | |dd}|rpt  |¡}t j||f||jd} n"|| |d d …d f |  |d d d …f |  }!t j||! |dd} |r¥t  |¡}"t j||f|"|"jd}#n"|| |d d …d f |  |d d d …f |  }$t j||$ |dd}#|||#  |  }%t j|| |%|d d S )Nr   r   r   r   r   r   r   )&Úx_ptrr    r   r   Úy_ptrÚBÚLÚCÚ
stride_x_bÚ
stride_x_lÚ
stride_x_cÚ
stride_s_bÚ
stride_s_lÚ
stride_s_cÚstride_sc_bÚstride_sc_lÚstride_sc_cr9   r:   r;   r<   Úpid_lÚpid_cÚpid_bÚ	l_offsetsÚ	c_offsetsÚmask_lÚmask_cr   Úx_offÚxÚ	shift_valr3   Ús_offÚ	scale_valr2   Úsc_offÚyr6   r6   r7   Úfuse_scale_shift_kernel_blc_opt>   sJ   


 ÿþÿ
ÿþÿ
ÿþÿrY   c#           @      C   sð  t  d¡}#t  d¡}$t  d¡}%|#|! t  d|!¡ }&|$|" t  d|"¡ }'|&|k }(|'|k })|(d d …d f |)d d d …f @ }*|%| |&d d …d f |  |'d d d …f |  }+t j| |+ |*dd},|%| |&|  }-t j||- |(dd t j¡d d …d f }.|%| |'d d d …f |  }/|%| |'d d d …f |  }0|%| |'d d d …f |  }1|%| |'d d d …f |  }2|%| |'d d d …f |  }3|%| |'d d d …f |  }4t j||/ |)d d d …f dd}5t j||0 |)d d d …f dd}6t j||1 |)d d d …f dd}7t j||2 |)d d d …f dd}8t j||3 |)d d d …f dd}9t j||4 |)d d d …f dd}:t  |.|8|5¡};t  |.|9|6¡}<t  |.|:|7¡}=|,d|<  |; }>t j||+ |>|*d |%| |&d d …d f |  |'d d d …f |   }?t j|	|? |=|*d d S )Nr   r   r   r   r   )r   r   r   r   ÚtoÚint1Úwherer   )@r=   Ú
shift0_ptrÚ
scale0_ptrÚ	gate0_ptrÚ
shift1_ptrÚ
scale1_ptrÚ	gate1_ptrÚ	index_ptrr>   Úgate_out_ptrr?   r@   rA   rB   rC   rD   Ústride_s0_bÚstride_s0_cÚstride_sc0_bÚstride_sc0_cÚstride_g0_bÚstride_g0_cÚstride_s1_bÚstride_s1_cÚstride_sc1_bÚstride_sc1_cÚstride_g1_bÚstride_g1_cÚ
stride_i_bÚ
stride_i_lÚstride_go_bÚstride_go_lÚstride_go_cr;   r<   rK   rL   rM   rN   rO   rP   rQ   r   rR   rS   Úidx_offÚidxÚs0_offÚsc0_offÚg0_offÚs1_offÚsc1_offÚg1_offÚshift0Úscale0Úgate0Úshift1Úscale1Úgate1r3   r2   ÚgaterX   Úgo_offr6   r6   r7   Ú-fuse_scale_shift_gate_select01_kernel_blc_opt‚   sR   
&

 ÿþÿ(      ÿþÿr†   ç      ð?rS   r2   r3   Úblock_lÚblock_cc                    s  | j r|j sJ ‚|  ¡ sJ ‚| j\}}‰ t | ¡}| ¡ dkrp|| ‰|  ˆˆ ¡}	| ˆˆ ¡}
‡ ‡fdd„}|jd }|| dksEJ dƒ‚|| }| d¡ dˆ ¡ 	¡ }| d¡ dˆ ¡ 	¡ }t
| |
|	|||ˆˆ |||ƒ
 |S | ¡ dks‚| ¡ dkrˆ| ¡ dkrˆ| d¡}n| ¡ dkrš|d d …d d d …f }n| ¡ d	kr£|}ntd
ƒ‚| ¡ dks¹| ¡ dkr¿| ¡ dkr¿| d¡}n| ¡ dkrÑ|d d …d d d …f }n| ¡ d	krÚ|}n|}| ¡ dkoç| ¡ dk}| ¡ dkoó| ¡ dk}|s| ||ˆ ¡}| ¡ \}}}nd } }}|s| ||ˆ ¡}| ¡ \}}}nd } }}|rC|rC| ¡  ¡ dkrC| ¡  ¡ dkrC| | ¡ |S t ||¡t ˆ |¡|f}t| | |rY|n||r_|n|||||ˆ |  d¡|  d¡|  d¡||||||||||ddd |S )Nr   c                    s   ˆt  ˆ | d ¡fS )Nr   )ÚtritonÚcdiv)ÚMETA©rA   r!   r6   r7   Ú<lambda>í   s    z)fuse_scale_shift_kernel.<locals>.<lambda>r   r   z:seq_len must be divisible by num_frames for 4D scale/shiftr   éÿÿÿÿé   z"scale must be 0D/1D(1)/2D/3D or 4D)r9   r:   r;   r<   r   Ú
num_stages)Úis_cudaÚis_contiguousÚshapeÚtorchÚ
empty_likeÚdimÚviewÚsqueezeÚreshapeÚ
contiguousr8   ÚnumelÚ
ValueErrorÚexpandÚstrideÚabsÚmaxÚcopy_rŠ   r‹   rY   )rS   r2   r3   r   rˆ   r‰   r?   r@   r5   Úx_2dÚ	output_2dÚgridr#   r$   Úscale_reshapedÚshift_reshapedÚ	scale_blcÚ	shift_blcÚneed_scale_scalarÚneed_shift_scalarÚ	scale_expÚs_sbÚs_slÚs_scÚ	shift_expÚsh_sbÚsh_slÚsh_scr6   r   r7   Úfuse_scale_shift_kernelÚ   s¨   

ÿöR$¾$$
ér´   r   r~   r€   r‚   r   rƒ   Úindexc
                 C   sÈ  |   ¡ sJ ‚| j\}
}}t | ¡}t | ¡}| ¡ dks:| ¡ dks:| ¡ dks:| ¡ dks:| ¡ dks:| ¡ dkr>tdƒ‚| ¡ dkrHtdƒ‚t ||¡t ||	¡|
f}t| g | ‘|‘|‘|‘|‘|‘|‘|‘|‘|‘|
‘|‘|‘|  	d¡‘|  	d¡‘|  	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘| 	d¡‘R ||	dddœŽ ||fS )Nr   z9scale0/shift0/gate0/scale1/shift1/gate1 must be 2D [B, C]zindex must be 2D [B, L]r   r   r   )r;   r<   r   r‘   )
r“   r”   r•   r–   r—   r   rŠ   r‹   r†   rŸ   )rS   r   r~   r€   r‚   r   rƒ   rµ   rˆ   r‰   r?   r@   rA   r5   Úgate_outr¥   r6   r6   r7   Ú%fuse_scale_shift_gate_select01_kernelM  s®   

ÿþýüûúùø	÷
öõôóòñðïîíìëêéèçæåäãâá à!ß"Û'r·   r   )Úfuse_scale_shift_native)r‡   r   r   )r   r   )r•   rŠ   Útriton.languageÚlanguager   Ú'sglang.multimodal_gen.runtime.platformsr   ÚautotuneÚConfigÚjitÚ	constexprr8   rY   r†   ÚTensorÚfloatÚintr´   r·   Úis_npuÚnpu_fallbackr¸   r6   r6   r6   r7   Ú<module>   s     ûø
ûõ+üîíìëC"Þ#Ý[úÿþýüû
ú|öÿþýüûúùø	÷

öHý