o
    پir1                     @   s   d dl Z d dlZd dlmZ g dZejdejdejfddZejdejdejd	ejd
ejfddZ			d de j
de j
de j
dedede j
fddZ			d!de j
de j
de j
de j
dedededede j
fddZde j
de j
de j
de j
fddZdS )"    N)    @      BLOCK_SIZE_XBLOCK_SIZE_Yc	           (      C   s  t jdd}	t jdd}
|
| t d| }|	| t d| }||d d d f  |d d d f  }||k }||k }|d d d f |d d d f @ }|
| t d| }|	| d t d|d  }d| |d d d f  |d d d f  }||k }||d k }|d d d f |d d d f @ }t | | |d}t ||}t ||}t ||}t ddd d d d f t ddd d d f  d}|d }t |d d d f || df}t |||d f}||? d@ }|
| | t dd }|	| t d| }||d d d f  |d d d f  }||| k }||k }|d d d f |d d d f @ }t || |d} t | | } t | | } t | | } t | ||d f} | |? d@ } |
| | t dd }!|	| d t d|d  }"|d |!d d d f  |"d d d f  }#|!|| k }$|"|d k }%|$d d d f |%d d d f @ }&t ||# |&d}'t |'||d f}'||  |' }||jj	}t 
|| || d S )	Nr   axis                       )tl
program_idarangeload
interleavereshapebroadcast_tototype
element_tystore)(qweight_ptr
scales_ptr	zeros_ptr
group_size
result_ptrnum_colsnum_rowsr   r   pid_xpid_y	offsets_y	offsets_xoffsetsmasks_ymasks_xmasksresult_offsets_yresult_offsets_xresult_offsetsresult_masks_yresult_masks_xresult_masksiweightsreverse_awq_order_tensorshiftszero_offsets_yzero_offsets_xzero_offsetszero_masks_yzero_masks_x
zero_maskszerosscale_offsets_yscale_offsets_xscale_offsetsscale_masks_yscale_masks_xscale_masksscales rA   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/awq_triton.pyawq_dequantize_kernel   sb   $ & 2 $ ( rC   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KSPLIT_Kc           9      C   s  t jdd}t d}t ||
}|| }|| }|jj}t j|	|
f|d}t ddd d d d f t ddd d d f  d}|d }t |d d d f ||
d  df}t |||
f}||	 t d|	 }||k }||
d  t d|
d  }||d k }||
d  t d|
d  }||d k }||
 t d|
 }||k }|| t d| }||d d d f  |d d d f  }|d |d d d f  |d d d f  } | | }!||  }"t	dt ||| D ]}#||k }$|d d d f |$d d d f @ }%t j
|!|%dd	}&|$d d d f |d d d f @ }'t j
|"|'dd	}(t |(|(}(t |(|(}(t |(|(}(|| |# ||  | t dd })|d |)d d d f  |d d d f  }*|)|| k }+|+d d d f |d d d f @ },||* }-t j
|-|,dd	}.t |.|.}.t |.|.}.t |.|.}.t |.||
f}.||)d d d f  |d d d f  }/|)|| k }0|0d d d f |d d d f @ }1||/ }2t j
|2|1dd	}3t |3||
f}3|(|? d
@ }(|.|? d
@ }.|(|. |3 }(|(|jj}(t j|&|(||d}||| 7 }|!|| 7 }!|"|| |d  7 }"q||jj}4||	 t d|	 }5||
 t d|
 }6||| |  ||5d d d f   |6d d d f  }7|5d d d f |k |6d d d f |k @ }8t j|7|4|8d d S )Nr   r   r
   )dtyper   r   r   r   )maskotherr   )	out_dtype)rI   )r   r   cdivr   r   r9   r   r   r   ranger   r   r   dotr   )9a_ptrb_ptrc_ptrr   r   MNKr   rD   rE   rF   rG   pidpid_z	num_pid_npid_mpid_naccumulator_dtypeaccumulatorr1   r2   
offsets_ammasks_am
offsets_bnmasks_bn
offsets_znmasks_zn
offsets_snmasks_sn	offsets_k	offsets_a	offsets_ba_ptrsb_ptrskmasks_kmasks_aamasks_bboffsets_szk	offsets_zmasks_zkmasks_z
zeros_ptrsr9   	offsets_smasks_skmasks_sscales_ptrsr@   coffs_cmoffs_cnc_ptrsc_maskrA   rA   rB   awq_gemm_kerneln   s   
2$$(  
( $ 4(r}   r   qweightr@   r9   block_size_xblock_size_yreturnc           
         s  | j d }|j d }| j d |j d  }|dkr|dksJ |j d || kr.|j d |ks0J |j d || krB|j d |d ksDJ ||ksJJ |tv sT||ksTJ tj| j d | j d d | j|jd}| j d | j d   fdd}	t|	 | |||| ||d	 |S )Nr   r
   r   )devicerH   c                    s    t  | d t | d fS )Nr   r   tritonrL   METAXYrA   rB   <lambda>	  s   z'awq_dequantize_triton.<locals>.<lambda>)r   r   )shape AWQ_TRITON_SUPPORTED_GROUP_SIZEStorchemptyr   rH   rC   )
r~   r@   r9   r   r   rT   rR   r   resultgridrA   r   rB   awq_dequantize_triton   s:   

$(

r   inputqzerossplit_k_itersblock_size_mblock_size_nblock_size_kc                    s^  | j \ }|j d d |j d |j d  }	dkr"|dkr" dks$J |j d |kr4|j d d ks6J |j d ||	 krH|j d d ksJJ |j d ||	 krZ|j d ks\J d @ dkrhdksjJ dkspJ |	|ksvJ |	tv s|	|ksJ  fdd}
tj f|j| jd}t|
 | |||| ||	|||d |d}|S )	Nr
   r   r   r   c                    s$   t  | d t | d  fS )NrD   rE   r   r   rR   rS   r   rA   rB   r   8  s   z!awq_gemm_triton.<locals>.<lambda>)rH   r   )rD   rE   rF   rG   )r   r   r   r9   rH   r   r}   sum)r   r~   r@   r   r   r   r   r   rT   r   r   r   rA   r   rB   awq_gemm_triton!  s<   

$($
r   c                 C   s   | }|}g }g }g d}t ddD ]"}|| d }	||dd|	? d@  ||dd|	? d@  qtj|dd|jd d|j}tj|dd|jd d|j}||jd d|jd |d |d }
|
|jd dS )	N)r   r   r
      r            r   r   r   r   r
   r   )dim)	rM   appendr   r   catr   r   rH   	unsqueeze)r~   r@   r9   qweight_tmp
qzeros_tmpqweight_listqzeros_listr2   i	shift_numresrA   rA   rB   awq_dequantize_decompositionV  s*   "r   )r   r   )r   r   r   )r   r   triton.languagelanguager   r   jit	constexprrC   r}   Tensorintr   r   r   rA   rA   rA   rB   <module>   s   	`
 
<	
5