o
    i                     @   s*  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dl
Z
ddlmZmZ daedg d	Zed
g dd dZdefddZdefddZe Zdd Ze Zdd Ze jdefddZdd ZeG dd dZdedefd d!ZdJd#e
jd$ed%ed&efd'd(Z dJd#e
jd$ed%ed&efd)d*Z!e
j"j#d+d,d-	"dJd#e
jd$ed%ed&ede
jf
d.d/Z$e
j"%d+dJd0d1Z&d2edefd3d4Z'd2edefd5d6Z(d2ed7ee defd8d9Z)de	e fd:d;Z*	<dKde	ed=f fd>d?Z+dLdAdBZ,dCeee  fdDdEZ-d@a.defdFdGZ/defdHdIZ0dS )M    N)	dataclass)Enum)DictListTuple   )ceil_divround_upFAuxStreamType)	Attention	MoeSharedMoeChunkingOverlap	EventType)Mainr   r   r   )startenablec                 C      | a d S Nis_torch_compiling_flagr    r   P/home/ubuntu/vllm_env/lib/python3.10/site-packages/flashinfer/fused_moe/utils.pyset_torch_compiling      r   returnc                   C      t S r   r   r   r   r   r   is_torch_compiling      r   c                   C   r   r   )_global_attrsr   r   r   r   get_global_attrs%   s   r    c                   C   s   t tdd S Nattrs)getattr_model_extra_attrsr   r   r   r   get_model_extra_attrs,   s   r%   r"   c                 c   s0    t tdd }| t_z	d V  W |t_d S |t_w r!   )r#   r$   r"   )r"   	old_attrsr   r   r   model_extra_attrs0   s   r'   c                    s    fdd}|S )Nc                    s    fdd}|S )Nc                    sF   t |   | g|R i |W  d    S 1 sw   Y  d S r   )r'   )selfargskwargs)func	get_attrsr   r   wrapper<   s   $z:with_model_extra_attrs.<locals>.decorator.<locals>.wrapperr   )r+   r-   r,   )r+   r   	decorator;   s   z)with_model_extra_attrs.<locals>.decoratorr   )r,   r/   r   r.   r   with_model_extra_attrs:   s   r0   c                   @   s:   e Zd ZU ejed< ejed< dZeed< edd Z	dS )Fp4QuantizedTensor
fp4_tensorscaling_factorTis_sf_swizzledc                 C   s   | j jS r   )r2   shape)r(   r   r   r   r5   K   r   zFp4QuantizedTensor.shapeN)
__name__
__module____qualname__torchTensor__annotations__r4   boolpropertyr5   r   r   r   r   r1   E   s   
 

r1   rowcolc                 C   s   t | d}t |d}||fS N      )r	   )r>   r?   
padded_row
padded_colr   r   r   compute_swizzled_sf_shapeP   s   

rE      sfrowscolsscaling_vector_sizec                 C   s&   t ||}| d||} tjj| S )a  Swizzle FP4 scaling factors using C++ torch op implementation
    Args:
        sf: [b, rows, cols_sf] or [rows, cols_sf]. The original unswizzled scaling factors.
        rows: rows of the original unquantized tensor
        cols_sf: ceil_div(cols, scaling_vector_size) where cols is the number of columns of the original unquantized tensor
        scaling_vector_size: the size of the scaling vector
    Returns:
        [b * round_up(rows, 128) * round_up(cols_sf, 4), ] 1D swizzled scaling factors, possibly with rows and cols padded.
    )r   viewr9   opstrtllmblock_scale_interleaverG   rH   rI   rJ   sf_colsr   r   r   
swizzle_sfV   s   

rR   c                 C   s.   t ||}| d||} tjj| d|S )a^  Swizzle FP4 scaling factors using C++ torch op implementation
    Args:
        sf: The (padded and) swizzled scaling factors.
        rows: rows of the original unquantized tensor
        cols: cols of the original unquantized tensor
        scaling_vector_size: the size of the scaling vector
    Returns:
        2D unswizzled scaling factors
    rK   )r   rL   r9   rM   rN   block_scale_interleave_reverserP   r   r   r   unswizzle_sfe   s   

rT   ztrtllm::reswizzle_sfr   )mutates_argsc                 C   s   t ||}t||\}}|| }|  ||  dksJ |  ||  }| |||}	t|	|||}
|| }|
|||}
|
ddd|d|f  }|||}t||||S )a  Reswizzle FP4 scaling factors using C++ torch op implementation.
       It unswizzles the scaling factors in each partition first, then concatenates them together, and finally swizzles them back.
    Args:
        sf: The (padded and) swizzled scaling factors.
        rows: rows of the original unquantized tensor
        cols: cols of the original unquantized tensor
        scaling_vector_size: the size of the scaling vector
    Returns:
        1D reswizzled scaling factors
    r   N)r   rE   numelrL   rT   
contiguousrR   )rG   rH   rI   rJ   rQ   padded_rowspadded_sf_colspadded_colsnum_partitionssf_reshapedsf_unswizzled
total_rowssf_concatenatedr   r   r   reswizzle_sft   s$   
r`   c           
      C   sN   t ||}t||\}}|  ||  }|| }t|dt|d }	| |	S r@   )r   rE   rV   r	   	new_empty)
rG   rH   rI   rJ   rQ   rX   rY   r[   r^   szr   r   r   _   s   

rc   xc                 C   sd   | dk rdS | d }||d? O }||d? O }||d? O }||d? O }||d? O }||d? O }|d S )N   r   rB      rF       r   )rd   nr   r   r   next_positive_power_of_2   s   ri   c                 C   s   t | }|| kr
|S |d S Nr   )ri   )rd   nextr   r   r   last_positive_power_of_2   s   rl   bucketsc                 C   s   t tt| |d |d S )Nr   rK   )minmaxri   )rd   rm   r   r   r   nearest_in_buckets   s   rp   c                 C   s:   t | } g }| }|dkr|| |d }|dkst|S )Nre   r   )ri   appendtuple)max_num_tokensnum_token_bucketsmr   r   r   !get_power_of_2_num_tokens_buckets   s   
rv   re   .c                 C   s:   t | } g }| }||kr|| |d }||kst|S rj   )rl   rq   rr   )rs   min_num_tokensrt   ru   r   r   r   &get_last_power_of_2_num_tokens_buckets   s   
rx   Tc                 C   s|   d}t t| d D ]}|| | 9 }q
dd | D }|d  d  < |r2t|dt| d | d n|| d |  }||fS )Nre   c                 S   s   g | ]}|qS r   r   ).0ir   r   r   
<listcomp>   s    z!get_fp4_shape.<locals>.<listcomp>rK   r   rA   rB   )rangelenr	   )input_shapesf_vec_sizeis_swizzled_layoutru   rz   output_shapescale_shaper   r   r   get_fp4_shape   s   r   input_shapesc                 C   s   t | d dd\}}|d S )z1Calculate the dimensions of the fp4 scale tensor.r   rF   )r   r   )r   )r   	out_shaper   r   r   r   fp4_scale_infer_shape   s   r   c                 C   r   r   _enable_piecewise_cuda_graphr   r   r   r   set_piecewise_cuda_graph_flag   r   r   c                   C   r   r   r   r   r   r   r   get_piecewise_cuda_graph_flag   r   r   )rF   )re   )T)1
contextlib	threadingdataclassesr   enumr   typingr   r   r   r9   utilsr   r	   r   r
   r   r<   r   r   localr   r    r$   r%   contextmanagerr'   r0   r1   intrE   r:   rR   rT   library	custom_opr`   register_fakerc   ri   rl   rp   rv   rx   r   r   r   r   r   r   r   r   r   <module>   sv    	

'	


