o
    پiH                  %   @   s  d dl mZmZ d dlZd dlmZ d dlmZ dejdejdejdej	fd	d
Z
d}ddZdd Zd}ddZdejdejdejdejdejdejddfddZ	d}dejdejdejdejdejdeej dejfddZ	d}dejdejdeej dejfd d!Z	"	"		d~d#ejd$ejd%ejd&ed'ed(ed)ed*ed+ed,eej d-ee ddfd.d/ZeZeZd#ejd$ejd%ejd0eddf
d1d2Zd#ejd$ejd%ejddfd3d4Zd5ejd6ejd7ejd8ejd9ejd:ejdejfd;d<Zd#ejd=ejdeejejf fd>d?Z	d}d@ejdAejdBejdCejdDejdEejdFeej dejfdGdHZ	d}d@ejdAejdIejdJejdBejdCejdFeej dejfdKdLZejfdMejdNejd:ejdejfdOdPZdQdR Z dSejd=ejdTejfdUdVZ!dSejd=ejdTejfdWdXZ"	d}dSejd=ejdYejdZejd[ed\eej de#ejejf fd]d^Z$	_	"	"	"dd5ejd`eej daejdbejdceej ddeej deeej dfeej dgejdhediedjedkedledmednedoedejf$dpdqZ%d5ejdaejdrejdsejdtejduedvedejfdwdxZ&dyejdzejdveddfd{d|Z'dS )    )OptionalTupleN)
ScalarType)_get_cache_bufqweightscalesqzerosreturnc                 C   s   t jjj| ||S N)torchops
sgl_kernelawq_dequantizedefault)r   r   r    r   C/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/gemm.pyr      s   r   c                 C      t jjj| |||||S r
   )r   r   r   int8_scaled_mmr   mat_amat_bscales_ascales_b	out_dtypebiasr   r   r   r         
r   c                 C   s   t jjj| ||||S r
   )r   r   r   fp8_blockwise_scaled_mmr   )r   r   r   r   r   r   r   r   r      s   
r   c                 C   r   r
   )r   r   r   fp8_scaled_mmr   r   r   r   r   r   #   r   r   workspace_bufferABDA_scaleB_scalec              	   C   s*   t j }t jjj|||||| | d S r
   )r   cudacurrent_blas_handler   r   bmm_fp8r   )r   r   r    r!   r"   r#   cublas_handler   r   r   _bmm_fp8_internal.   s   

r(   dtypeoutc                 C   sV   |d u rt j| jd | jd |jd f| j|d}tdd| j}t|| |||| |S )Nr         devicer)   bmm_fp8_workspacei   )r   emptyshaper.   r   r(   )r   r    r"   r#   r)   r*   r   r   r   r   r&   B   s   r&   r   r   outputc                 C   sD   |d u rt j| jd |jd f| j| jd}t jjj|| | |S )Nr   r+   r-   )	r   r0   r1   r.   r)   r   r   dsv3_fused_a_gemmr   )r   r   r2   r   r   r   r3   U   s   r3   Finputoutput_qoutput_s
group_sizeepsfp8_minfp8_maxscale_ue8m0fuse_silu_and_mulmasked_m	enable_v2c                 C   s   |
d u rddl m} |d}
|
r!tjjj| |||||||||	
S |r'J d|	d u s/J dtjjj| ||||||| d S )Nr   )get_bool_env_var$SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2z!only v2 support fuse_silu_and_mulzonly v2 support masked_m)sglang.srt.utilsr?   r   r   r   !sgl_per_token_group_quant_8bit_v2r   sgl_per_token_group_quant_8bit)r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r   r   r   rC   d   s*   

rC   	is_staticc                 C   s   t jjj| ||| d S r
   )r   r   r   sgl_per_tensor_quant_fp8r   )r4   r5   r6   rD   r   r   r   rE      s   
rE   c                 C   s   t jjj| || d S r
   )r   r   r   sgl_per_token_quant_fp8r   )r4   r5   r6   r   r   r   rF      s   rF   abblock_scale_ablock_scale_balphar   c           	      C   sb   | j dkr
|j dksJ | jd |jd }}tj||f|| jd}tjjj|| |||| |S )Nr,   r   r)   r.   )	ndimr1   r   r0   r.   r   r   cutlass_scaled_fp4_mmr   )	rG   rH   rI   rJ   rK   r   mnr*   r   r   r   rN      s   
rN   input_global_scalec                 C   sF  | j dksJ d| j  d| j dkrdnd}| || jd } | j\}}d}| j}|| dks8J d| d| jtjtjfv sJJ d| j dtj||d	 f|tj	d
}|d d d d }|| }	|	d d d d }
|
|	krtj
||
d f|tjd
}ntj||
d f|tjd
}tjjj|| || |tj}||fS )a%  
    Quantize input tensor to FP4 and return quantized tensor and scale.

    This function quantizes the last dimension of the given tensor `input`. For
    every 16 consecutive elements, a single dynamically computed scaling factor
    is shared. This scaling factor is quantized using the `input_global_scale`
    and is stored in a swizzled layout (see
    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).

    Args:
        input: The input tensor to be quantized to FP4
        input_global_scale: A scalar scaling factor for the entire tensor.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every
            two values are packed into a uint8 and float8_e4m3 scaling factors
            in a sizzled layout.
    r+   z%input.ndim needs to be >= 1, but got .   r   z+last dim has to be multiple of 16, but got z-input.dtype needs to be fp16 or bf16 but got r,   r-         )rM   reshaper1   r.   r)   r   float16bfloat16r0   uint8zerosint32r   r   scaled_fp4_quantr   viewfloat8_e4m3fn)r4   rQ   
other_dimsrO   rP   
block_sizer.   r2   	rounded_mscale_n	rounded_noutput_scaler   r   r   r]      s:   

r]   in_featskernelwscalesascalesw_szsa_ssums	out_featsc              	   C   L   |d u rt j| jd |jd f| jt jd}t jjj| |||||| |S Nr   r-   )	r   r0   r1   r.   rX   r   r   qserve_w4a8_per_chn_gemmr   )rf   rg   rh   ri   rj   rk   rl   r   r   r   ro         	
ro   r[   	scales_i8c              	   C   rm   rn   )	r   r0   r1   r.   rX   r   r   qserve_w4a8_per_group_gemmr   )rf   rg   r[   rq   rh   ri   rl   r   r   r   rr     rp   rr   hidden_statesrouter_weightsc                 C   s6   t j| jd |jd | j|d}t jj|| | |S rn   )r   r0   r1   r.   r   r   dsv3_router_gemm)rs   rt   r   r2   r   r   r   ru     s   ru   c                 C   s,   t j|| j| jd}t jjj| || |S )Nr-   )r   r0   r.   r)   r   r   shuffle_rowsr   )input_tensordst2src_mapoutput_tensor_shapeoutput_tensorr   r   r   rv   .  s   rv   rw   maskc                 C   s$  | j }| j\}}}d}|| dksJ d| d|| }|d d d }	|	d }
|d d d }tj|||d	 |tjd
}tj|||
|tjd
}tjjjj	|
|| |d	 |
|| |
| 
|| |||dd |dd	d}|
tj
||d |	d ddd}|ddddd	d}||fS )a  
    Quantize input tensor to FP4 and return quantized tensor and scale, for
    grouped gemm inputs (e.g., grouped_gemm_nt_masked for flashinfer).
    Args:
        input: The input tensor to be quantized to FP4, with shape (l, m, k)
            l is number of groups, m is number of tokens per group, k is number of features.
        input_global_scale: A scalar scaling factor for the entire tensor, with
            shape (l,).
    Outputs:
        output: The quantized tensor in FP4, with shape (m, k // 2, l) but the physical
            layout is (l, m, k // 2). `// 2` is because two fp4 values are packed into
            an uint8.
        output_scales: The blockscale tensor in FP8-E4M3, with shape (32, 4, rm, 4, rk, l)
            but the physical layout is (l, rm, rk, 32, 4, 4).
    Note:
        For the shape of output_scales, `32 * 4 * rm` is a padded m to nearest multiple of 128.
        `4 * rk` is a padded `k // 16` to nearest multiple of 4. These layout constants are
        required by the NVIDIA Blackwell MMA operations.
    rT   r   "k must be multiple of 16, but got rR      rV      rU   r,   r-   Fuse_silu_and_mulr+          r.   r1   r   r0   rZ   r\   r   r   %silu_and_mul_scaled_fp4_experts_quantr   r^   permuter_   )rw   rQ   r{   r.   lrO   ksf_vec_sizescale_kpadded_kpadded_k_int32padded_mr2   output_scalesr   r   r   scaled_fp4_grouped_quant8  s4   

r   c                 C   s,  | j }| j\}}}|d }d}|| dksJ d| d|| }	|	d d d }
|
d }|d d	 d	 }tj|||d |tjd
}tj||||tjd
}tjjjj	|
|| |d |
|| || 
|| |||dd |ddd}|
tj
||d	 |
d ddd}|dddddd}||fS )aQ  
    Quantize input tensor to FP4 and return quantized tensor and scale, for
    grouped gemm inputs (e.g., grouped_gemm_nt_masked for flashinfer).
    Args:
        input: The input tensor to be quantized to FP4, with shape (l, m, k * 2)
            l is number of groups, m is number of tokens per group, k is number of features.
        input_global_scale: A scalar scaling factor for the entire tensor, with
            shape (l,).
        mask: The mask tensor, with shape (l,)
    Outputs:
        output: The quantized tensor in FP4, with shape (m, k // 2, l) but the physical
            layout is (l, m, k // 2). `// 2` is because two fp4 values are packed into
            an uint8.
        output_scales: The blockscale tensor in FP8-E4M3, with shape (32, 4, rm, 4, rk, l)
            but the physical layout is (l, rm, rk, 32, 4, 4).
    Note:
        For the shape of output_scales, `32 * 4 * rm` is a padded m to nearest multiple of 128.
        `4 * rk` is a padded `k // 16` to nearest multiple of 4. These layout constants are
        required by the NVIDIA Blackwell MMA operations.
    r,   rT   r   r|   rR   r}   rV   r~   rU   r-   Tr   r+   r   r   r   )rw   rQ   r{   r.   r   rO   k_by_2r   r   r   r   r   r   r2   r   r   r   r   %silu_and_mul_scaled_fp4_grouped_quants  s6   

r   expert_offsetsblockscale_offsetstopk
expert_mapc                 C   s&  | j dksJ d| j  d|dur#| j\}}|| |f}t| ||} | j\}	}ddl}
t|
jdd}|	|| ksFJ d| d	|	 d
|d }|d d }tj|	|d | j	tj
d}||krotj|| |tj| j	d}ntj|| |tj| j	d}tjjj||| ||| |tj}||fS )a  
    Quantize input tensor to FP4 and return quantized tensor and scale, for
    packed MoE Inputs.
    Args:
        input: The input tensor to be quantized to FP4
        expert_map: The expert map tensor
        input_global_scale: A scalar scaling factor for the entire tensor.
        expert_offsets: The expert offsets tensor
        blockscale_offsets: The blockscale offsets tensor
    Outputs:
        output: The quantized tensor in FP4
        output_scales: The blockscale tensor in FP8-E4M3
    r,   z%input.ndim needs to be == 2, but got rR   Nr   MODELOPT_MAX_TOKENS_PER_EXPERTi   z2m_numtopk must be less than MAX_TOKENS_PER_EXPERT(z,) for cutlass_moe_fp4, observed m_numtopk = z7. Use MODELOPT_MAX_TOKENS_PER_EXPERT to set this value.rT   r}   rV   r-   rL   )rM   r1   rv   osintenvirongetr   r0   r.   rZ   r[   r\   r   r   scaled_fp4_experts_quantr   r^   r_   )rw   rQ   r   r   r   r   rO   r   ry   	m_numtopkr   MAX_TOKENS_PER_EXPERTscales_kr   r2   r   r   r   r   r     sZ   


r   Tc
b_q_weightb_scalesglobal_scaleb_zerosg_idxperm	workspaceb_q_typesize_msize_nsize_k	is_k_fulluse_atomic_adduse_fp32_reduceis_zp_floatc                 C   s0   t jj| |||||||||	j|
||||||S r
   )r   r   r   gptq_marlin_gemmid)rG   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r     s&   r   b_gptq_qzerosb_gptq_scalesb_g_idxuse_shufflebitc              	   C   s   t jj| ||||||S r
   )r   r   r   	gptq_gemm)rG   r   r   r   r   r   r   r   r   r   r   %  s   	r   q_weightq_permc                 C   s   t j jj| || d S r
   )r   r   r   gptq_shuffle)r   r   r   r   r   r   r   3  s   r   r
   )FFNN)TFFF)(typingr   r   r   sgl_kernel.scalar_typer   sgl_kernel.utilsr   Tensor
ByteTensorr   r   r   r   r(   r)   r&   r3   r   floatboolrC   sgl_per_token_group_quant_fp8sgl_per_token_group_quant_int8rE   rF   rN   r]   ro   rr   rY   ru   rv   r   r   tupler   r   r   r   r   r   r   r   <module>   s   






	

(



B




;
C
[	

(
"