o
    -i                  !   @   s  U d dl mZ d dlmZ d dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* e+ Z,dZ-edB e.d< edrzd dl/m0Z1 e2e1dre1Z-W n	 e3y   Y nw ee4Z5e2ej6j7drej6j7j8j9Z:G dd dZ;G dd de;Z<G dd de;Z=G dd de;Z>G dd  d e;Z?G d!d" d"e;Z@G d#d$ d$e;ZAG d%d& d&e*ZBd'd(d)d*d'd+dd*d,ZCeDeEeDeEeFf f e.d-< d+d(d)d*d+d.dd*d,ZGeDeEeDeEeFf f e.d/< e-durdaHd0ZI				dXd1ejJd2ejJd3ejJd4eFd5eEd6eEd7eKd8eKd9eKd:eEd;eEd<ejJdB d=ejJdB d>ejJdB d?ejJdB d@df dAdBZL				dXd1ejJd2ejJd3ejJd4eFd5eEd6eEd7eKd8eKd9eKd:eEd;eEd<ejJdB d=ejJdB d>ejJdB d?ejJdB d@df dCdDZMe!dEeLg dFeMdG ej6jNjOj9ZOG dHdI dIZPG dJdK dKe;ZQG dLdM dMe;ZRG dNdO dOe;ZSG dPdQ dQe;ZTG dRdS dSe;ZUG dTdU dUe;ZVG dVdW dWe*ZWdS )Y    )	find_spec)
ModuleTypeN)auto_functionalized)PatternMatcherPass)enable_symm_mem_for_group)
VllmConfig)Range)get_tp_group tensor_model_parallel_all_reduce)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)init_logger)kFp8StaticTensorSym)current_platform)direct_register_custom_op   )enable_fake_mode)MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNorm)VllmInductorPassVllmPatternMatcherPassflashinfer_comm
flashinfertrtllm_allreduce_fusionscaled_fp4_quantc                   @   s(   e Zd ZdejdedB ddfddZdS )BasePatterndtypedeviceNreturnc                 C   s    || _ || _t | _t | _d S N)r   r   r	   tpr   tp_size)selfr   r    r$   _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/compilation/collective_fusion.py__init__2   s   zBasePattern.__init__)__name__
__module____qualname__torchr   strr&   r$   r$   r$   r%   r   1   s     r   c                   @   2   e Zd Zdeej fddZdeddfddZdS )GEMMReduceScatterPatternr   c                 C   s8   t jddg| j| jd}t jddg| j| jd}||gS )N      r   r   r*   emptyr   r   )r#   mul	mm_weightr$   r$   r%   
get_inputs:   s   z#GEMMReduceScatterPattern.get_inputspm_passNc                    \   dt jdt jdt jf fdd}dt jdt jdt jf fdd}t||  tj| d S )Nr3   r4   r   c                    s4   t jjj| |}t jjjj|d j jj	d}|S Nr   dim
world_size
group_name)
r*   opsatenmmdefaultvllmreduce_scatterr"   r!   unique_name)r3   r4   r?   rB   r#   r$   r%   pattern@   s   
z2GEMMReduceScatterPattern.register.<locals>.patternc                    s"   t jjj| |dd jjjd}|S )Navgr   )scatter_dimr<   )r*   r=   symm_memfused_matmul_reduce_scatterr!   device_groupr<   )r3   r4   gemm_rsrD   r$   r%   replacementJ   s   z6GEMMReduceScatterPattern.register.<locals>.replacementr*   Tensorpmregister_replacementr5   fwd_onlyr#   r6   rE   rL   r$   rD   r%   register?   s
     
z!GEMMReduceScatterPattern.register	r'   r(   r)   listr*   rN   r5   r   rS   r$   r$   r$   r%   r-   9   s    r-   c                   @   r,   )AllGatherGEMMPatternr   c                 C   s8   t jddg| j| jd}t jddg| j| jd}||gS )Nr/   r0   r1   )r#   xweightr$   r$   r%   r5   [   s   zAllGatherGEMMPattern.get_inputsr6   Nc                    r7   )NrW   rX   r   c                    s0   t jjjj| d j jjd}t jjj	||S r8   )
r*   r=   rA   
all_gatherr@   r"   r!   rC   r>   r?   )rW   rX   rY   rD   r$   r%   rE   b   s   
z.AllGatherGEMMPattern.register.<locals>.patternc                    s&   t jjj| |gd jjjd\}}|S )Nr   )
gather_dimr<   )r*   r=   rH   fused_all_gather_matmulr!   rJ   r<   )rW   rX   	ag_output
mm_outputsrD   r$   r%   rL   o   s   
z2AllGatherGEMMPattern.register.<locals>.replacementrM   rR   r$   rD   r%   rS   a   s    	zAllGatherGEMMPattern.registerrT   r$   r$   r$   r%   rV   Z   s    rV   c                   @   r,   )ScaledMMReduceScatterPatternr   c                 C   st   t jddg| jtd}t jddg| jtd dd}t jddg| jt jd}t jddg| jt jd}||||gS Nr.   r0   r   r   )r*   r2   r   	FP8_DTYPE
contiguous	transposefloat32)r#   inputr4   scale_ascale_br$   r$   r%   r5   ~   s   z'ScaledMMReduceScatterPattern.get_inputsr6   Nc              
      t   dt jdt jdt jdt jdt jf
 fdd}dt jdt jdt jdt jdt jf
 fdd	}t||  tj| d S )
Nrd   mat2re   rf   r   c              	      sB   t jjjj| |||d d  jd}t jjjj|d j j	j
d}|S )Nrh   re   rf   biasscale_result	out_dtyper   r9   )r*   r=   r>   
_scaled_mmr@   r   rA   rB   r"   r!   rC   )rd   rh   re   rf   	scaled_mmrB   rD   r$   r%   rE      s    

	z6ScaledMMReduceScatterPattern.register.<locals>.patternc                    sR   g | j d d |j d }d}tjj| |||d|| jjj|d d  jd}|S Nr   r   rF   F	shaper*   r=   rA   *patched_fused_scaled_matmul_reduce_scatterr!   rJ   r<   r   )rd   rh   re   rf   output_shaperG   rK   rD   r$   r%   rL      s$   z:ScaledMMReduceScatterPattern.register.<locals>.replacementrM   rR   r$   rD   r%   rS      s2   z%ScaledMMReduceScatterPattern.registerrT   r$   r$   r$   r%   r^   }   s    r^   c                   @   r,   )AllGatherScaledMMPatternr   c                 C   s   t jddg| jtd}t jddg| jtd dd}|jd | j }t j|dg| jt jd}t jddg| jt jd}||||gS N   r.   r0   r   r   )	r*   r2   r   r`   ra   rb   rr   r"   rc   )r#   rW   rX   s1re   rf   r$   r$   r%   r5      s   z#AllGatherScaledMMPattern.get_inputsr6   Nc              
      rg   )
NrW   rX   re   rf   r   c              	      s>   t jjjj| d j jjd}t jjj	j||||d d  j
dS )Nr   r9   ri   )r*   r=   rA   rY   r@   r"   r!   rC   r>   rm   r   )rW   rX   re   rf   rY   rD   r$   r%   rE      s   

z2AllGatherScaledMMPattern.register.<locals>.patternc                    s>   t jjj| |g||gdd gd g jgdg jjjd
\}}|S Nr   F)rZ   biasesresult_scales
out_dtypesuse_fast_accumr<   r*   r=   rH   fused_all_gather_scaled_matmulr   r!   rJ   r<   )rW   rX   re   rf   r\   r]   rD   r$   r%   rL      s   
z6AllGatherScaledMMPattern.register.<locals>.replacementrM   rR   r$   rD   r%   rS      s2   z!AllGatherScaledMMPattern.registerrT   r$   r$   r$   r%   ru      s    ru   c                   @   r,   )#CutlassScaledMMReduceScatterPatternr   c                 C   s   t jddg| jtd}t jddg| jtd dd}t jddg| jt jd}t jddg| jt jd}t jddg| j| jd}|||||gS r_   )r*   r2   r   r`   ra   rb   rc   r   )r#   rd   r4   re   rf   cutlass_mm_outputr$   r$   r%   r5      s   z.CutlassScaledMMReduceScatterPattern.get_inputsr6   Nc                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jd	t jdt jdt jdt jdt jf fd
d}t||  tj| d S )Nrd   rX   re   rf   r   r   c              	      sJ   t jjjt jjjj|| |||d d}t jjjj|d d j	 j
jd}|S )Noutaba_scalesb_scalesrj   r   r   r9   )r*   r=   higher_orderr   _Ccutlass_scaled_mmr@   rA   rB   r"   r!   rC   )rd   rX   re   rf   r   r   rB   rD   r$   r%   rE     s    


z=CutlassScaledMMReduceScatterPattern.register.<locals>.patternrh   c                    sR   g | j d d |j d }d}tjj| |||d|| jjj|d d  jd}|S ro   rq   )rd   rh   re   rf   r   rt   rG   rK   rD   r$   r%   rL   '  s$   zACutlassScaledMMReduceScatterPattern.register.<locals>.replacementrM   rR   r$   rD   r%   rS     s:   z,CutlassScaledMMReduceScatterPattern.registerrT   r$   r$   r$   r%   r      s    r   c                   @   r,   )AllGatherCutlassScaledMMPatternr   c                 C   s   t jddg| jtd}t jddg| jtd dd}|jd | j }t j|dg| jt jd}t jddg| jt jd}|jd }t j||g| j| j	d}|||||gS rv   )
r*   r2   r   r`   ra   rb   rr   r"   rc   r   )r#   rW   rX   rx   re   rf   s2outputr$   r$   r%   r5   I  s   
z*AllGatherCutlassScaledMMPattern.get_inputsr6   Nc                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jdt jdt jdt jdt jdt jf fd	d
}t||  tj| d S )NrW   rX   re   rf   r   r   c              	      sJ   t jjjj| d j jjd}t jjj	t jj
jj|||||d d}|d S )Nr   r9   r   r   )r*   r=   rA   rY   r@   r"   r!   rC   r   r   r   r   )rW   rX   re   rf   r   rY   r   rD   r$   r%   rE   \  s   

	z9AllGatherCutlassScaledMMPattern.register.<locals>.patternc                    s>   t jjj| |g||gdd gd g jgdg jjjd
\}}|S ry   r~   )rW   rX   re   rf   r   r\   r]   rD   r$   r%   rL   r  s   
z=AllGatherCutlassScaledMMPattern.register.<locals>.replacementrM   rR   r$   rD   r%   rS   [  s:   z(AllGatherCutlassScaledMMPattern.registerrT   r$   r$   r$   r%   r   H  s    r   c                       sV   e Zd Zededdf fddZdedefddZe	j
d	ejddfd
dZ  ZS )AsyncTPPassconfigr   Nc                    s   t  | tt jj tdd| _t| j	| j
| j t| j	| j
| j | j	tjkr[t| j	| j
| j t| j	| j
| j t| j	| j
| j t| j	| j
| j | || j d S )Nasync_tp_pass	pass_name)superr&   r   r	   rJ   r<   r   patternsr-   model_dtyper   rS   rV   r*   bfloat16r^   ru   r   r   dump_patterns)r#   r   	__class__r$   r%   r&     s*   zAsyncTPPass.__init__compile_rangec                 C   s4   | j jr| j jr
dS t }t| o|j| dkS )NTr   )compilation_configsplitting_opsuse_inductor_graph_partitionr   boolis_single_sizeend)r#   r   r"   r$   r$   r%   is_applicable_for_range  s   z#AsyncTPPass.is_applicable_for_rangegraphc                 C   s    | j || _td| j d S )NReplaced %s patterns)r   applymatched_countloggerdebugr#   r   r$   r$   r%   __call__  s   zAsyncTPPass.__call__)r'   r(   r)   r   r   r&   r   r   r   r   time_and_logfxGraphr   __classcell__r$   r$   r   r%   r     s     r   @      g      ?)r   r/   rw       )Z   d   FI_ALLREDUCE_FUSION_MAX_SIZE_MBr/   #_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB   allreduce_inresidual	rms_gammarms_eps
world_rankr;   launch_with_pdltrigger_completion_at_endfp32_accmax_token_numpattern_codenorm_out	quant_out	scale_outscale_factorr   c              	   C   sZ  | j \}}|  }|| | }|	| | }||ks)J d| d|	 d| d| t }|d ur5| nd }t|i |d }|d u pJ||t k}td usSJ d|d u r\| }|}n| }t	j
di d| d| j d d	|d
|d|d|d|d|d|d| j d dtd|d|d|d|d|
dd d|d|dt	jjd| d S )NzCurrent tensor size z is larger than max token num z * hidden size z * element size z0Flashinfer must be enabled when using flashinferr   	token_numr   residual_inresidual_outr   r   r   r   r;   
hidden_dimrp   workspace_ptrsr   use_oneshotr   r   r   allreduce_outr   r   layout_coder   r$   )rr   element_sizer   get_device_capabilityto_intr   getMiB_FI_WORKSPACE_TENSORr   r   QuantizationSFLayoutSWIZZLED_128x4)r   r   r   r   r   r;   r   r   r   r   r   r   r   r   r   
num_tokenshidden_sizer   current_tensor_sizemax_tensor_sizecurr_devicedevice_capabilitymax_one_shot_sizer   r   r$   r$   r%    call_trtllm_fused_allreduce_norm  s   




	

r   c                 C   s   d S r    r$   )r   r   r   r   r   r;   r   r   r   r   r   r   r   r   r   r$   r$   r%   %call_trtllm_fused_allreduce_norm_fake1  s   r   &flashinfer_trtllm_fused_allreduce_norm)r   r   r   r   r   )op_nameop_funcmutates_args	fake_implc                   @   sN   e Zd ZdZ		ddededededd	f
d
dZdeeeeB f fddZ	d	S )FlashInferFusedAllReduceParamsz5Parameters for FlashInfer fused allreduce operations.F   rankr;   use_fp32_lamportr   r   Nc                 C   s.   || _ || _|| _d| _d| _d| _|| _d S )NT)r   r;   r   r   r   r   r   )r#   r   r;   r   r   r$   r$   r%   r&   X  s   
z'FlashInferFusedAllReduceParams.__init__c                 C   s   | j | j| j| j| j| jdS )N)r   r;   r   r   r   r   )r   r;   r   r   r   r   rD   r$   r$   r%   !get_trtllm_fused_allreduce_kwargsg  s   z@FlashInferFusedAllReduceParams.get_trtllm_fused_allreduce_kwargs)Fr   )
r'   r(   r)   __doc__intr   r&   dictr+   r   r$   r$   r$   r%   r   U  s     
r   c                
       b   e Zd ZdZdedejdedB deddf
 fdd	Z	de
ej fd
dZdeddfddZ  ZS )AllReduceRMSNormPatternz
    This pattern replaces the allreduce + rms norm (without residual)
    with fused flashinfer implementation.
    Applies to allreduce + rmsnorm before attn in the first Transformer block.
    epsilonr   r   Nallreduce_paramsr   c                    (   t  || || _|| _t|| _d S r    r   r&   r   r   r   rmsnorm_matcherr#   r   r   r   r   r   r$   r%   r&   y     z AllReduceRMSNormPattern.__init__c                 C   s   | j  \}}|| j|gS r    r   inputstor   )r#   rd   rX   r$   r$   r%   r5     s   z"AllReduceRMSNormPattern.get_inputsr6   c                    sp   dt jdt jdtt jt jf f fdd}dt jdt jdtt jt jf f fdd}t||  tj| d S )Nrd   rX   r   c                    s   t | } ||}||fS r    r
   r   )rd   rX   allreduce_outputrmsrD   r$   r%   rE     s   z1AllReduceRMSNormPattern.register.<locals>.patternc                    sb   t | }t | }td usJ dttf| ||d d | jtjjd j	
 }|d |d fS )NFlashInfer must be enabledr   r   r   r   r   r   r   r      r   )r*   
zeros_like
empty_liker   r   r   r   AllReduceFusionPatternkARResidualRMSNormr   r   )rd   rX   r   
rms_result	allreducerD   r$   r%   rL     s$   


z5AllReduceRMSNormPattern.register.<locals>.replacementr*   rN   tuplerO   rP   r5   rQ   rR   r$   rD   r%   rS     s"   z AllReduceRMSNormPattern.registerr'   r(   r)   r   floatr*   r   r+   r   r&   rU   rN   r5   r   rS   r   r$   r$   r   r%   r   r      r   c                
       r   )AllReduceFusedAddRMSNormPatternz
    This pattern replaces the allreduce + rms norm (with residual)
    with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn.
    r   r   r   Nr   r   c                    r   r    r   r&   r   r   r   r   r   r   r$   r%   r&     r   z(AllReduceFusedAddRMSNormPattern.__init__c                 C   s"   | j  \}}}||| j|gS r    r   )r#   rd   r   rX   r$   r$   r%   r5     s   z*AllReduceFusedAddRMSNormPattern.get_inputsr6   c              
      s   dt jdt jdt jdtt jt jf f fdd}dt jdt jdt jdtt jt jf f fdd}t||  tj| d	d
 }t||||  tj| d S )Nr   rd   rX   r   c                    s"   t |} ||| \}} || fS r    r   )r   rd   rX   r   r   rD   r$   r%   rE     s   z9AllReduceFusedAddRMSNormPattern.register.<locals>.patternc                    sN   t d usJ dttf|| d d d | jt jjd j }|d |d fS )Nr   r   r   r   )r   r   r   r   r   r   r   r   )r   rd   rX   r  rD   r$   r%   rL     s    
z=AllReduceFusedAddRMSNormPattern.register.<locals>.replacementc                    s    fddS )Nc                    s    | ||d S )Nr   r$   )r   r   cfnr$   r%   <lambda>  s    zLAllReduceFusedAddRMSNormPattern.register.<locals>.<lambda>.<locals>.<lambda>r$   r
  r$   r
  r%   r    s    z:AllReduceFusedAddRMSNormPattern.register.<locals>.<lambda>r  )r#   r6   rE   rL   first_return_onlyr$   rD   r%   rS     s:   z(AllReduceFusedAddRMSNormPattern.registerr  r$   r$   r   r%   r    r  r  c                
       r   )*AllReduceFusedRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (without residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn
    in the first Transformer block.
    r   r   r   Nr   r   c                    :   t  || || _|| _tj| _t|| _t	t
| _d S r    )r   r&   r   r   r*   float8_e4m3fnquant_dtyper   r   r   r   quant_matcherr   r   r$   r%   r&     s   
z3AllReduceFusedRMSNormStaticQuantFP8Pattern.__init__c                 C   s.   | j  \}}| j \}}|| j||gS r    r   r   r  r   r   )r#   rd   rX   _scaler$   r$   r%   r5   	  s   z5AllReduceFusedRMSNormStaticQuantFP8Pattern.get_inputsr6   c              
      s|   dt jdt jdt jdtt jt jf f fdd}dt jdt jdt jdtt jt jf f fdd}t||  tj| d S )	Nrd   rX   r  r   c                    s,   t | } ||} ||\}}||fS r    r
   r   r  )rd   rX   r  
all_reducer   quantr  rD   r$   r%   rE     s   zDAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.patternc                    st   t | }t | }t j|  jd}td usJ dttf| |||d | jtjj	|d	 j
 }|d |d fS )Nr   r   	r   r   r   r   r   r   r   r   r   r/   r   )r*   r   r   r  r   r   r   r   r   kARResidualRMSNormFP8Quantr   r   )rd   rX   r  r   
result_rmsresult_quantr  rD   r$   r%   rL     s(   

zHAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.replacementr  rR   r$   rD   r%   rS     s*   
z3AllReduceFusedRMSNormStaticQuantFP8Pattern.registerr  r$   r$   r   r%   r    s    r  c                
       r   )-AllReduceFusedAddRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    r   r   r   Nr   r   c                    r  r    )r   r&   r   r   r*   r  r  r   r   r   r   r  r   r   r$   r%   r&   C  s   
z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.__init__c                 C   s2   | j  \}}}| j \}}||| j||gS r    r  )r#   rd   r   rX   r  r  r$   r$   r%   r5   R  s   z8AllReduceFusedAddRMSNormStaticQuantFP8Pattern.get_inputsr6   c                    s   dt jdt jdt jdt jdtt jt jf f
 fdd}dt jdt jdt jdt jdtt jt jf f
 fdd	}t||  tj| d S )
Nr   rd   rX   r  r   c           	         s2   t |} ||| \}} ||\}}||fS r    r  )	r   rd   rX   r  r   r   resr  r  rD   r$   r%   rE   Z  s   zGAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.patternc                    s`   t j| jd}td usJ dttf|| d |d | jtjj|d	 j	
 }|d |d fS )Nr  r   r  r/   r   )r*   r   r  r   r   r   r   r   r  r   r   )r   rd   rX   r  r  r  rD   r$   r%   rL   f  s$   zKAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.replacementr  rR   r$   rD   r%   rS   Y  s2   z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.registerr  r$   r$   r   r%   r  ;  s    r  c                
       r   ),AllReduceFusedRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (without residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn
    in the first Transformer block.
    r   r   r   Nr   r   c                    r   r    r   r   r   r$   r%   r&     r   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.__init__c                 C   s   t jg d| j| jd}t jd| jt jd}t jddg| jt jd}t jdg| j| jd}t jddg| jt jd}|||||gS )N)r   r.   r.   r0   r.   rw   r   r.      r/   r*   r2   r   r   uint8rc   int32)r#   rd   quant_resultinput_global_scalerX   output_scaler$   r$   r%   r5     s   z7AllReduceFusedRMSNormStaticQuantNVFP4Pattern.get_inputsr6   c                    s   dt jdt jdt jdt jdt jdtt jt jt jf f fdd}dt jdt jdt jdt jdt jdtt jt jt jf f fd	d
}t||  tj| d S )Nrd   r&  rX   r'  r(  r   c                    s:   t | } ||}tt||||dd}|d ||d fS NT)r   rd   r(  input_scaleis_sf_swizzled_layoutr   r   r
   r   r   STATIC_FP4_QUANT_OP)rd   r&  rX   r'  r(  r  r   quant_out_tuplerD   r$   r%   rE     s   
zFAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.patternc                    sj   t | }t | }td usJ dttf| ||||| jtjj|d	 j	
 }|d |d |d fS )Nr   r  r/   r      )r*   r   r   r   r   r   r   r   kARResidualRMSNormFP4Quantr   r   )rd   r&  rX   r'  r(  r   r  r  rD   r$   r%   rL     s&   

zJAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacementr  rR   r$   rD   r%   rS     s:   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.registerr  r$   r$   r   r%   r     s    r   c                
       r   )/AllReduceFusedAddRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    r   r   r   Nr   r   c                    r   r    r  r   r   r$   r%   r&     r   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.__init__c                 C   s   t jddg| j| jd}t jddg| j| jd}t jddg| j| jd}t jd| jt jd}t jddg| jt jd}t jddg| jt jd}||||||gS )Nr.   r0   r!  r   r"  r/   r#  )r#   rd   r   rX   r&  r'  r(  r$   r$   r%   r5     s   z:AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.get_inputsr6   c                    s   dt jdt jdt jdt jdt jdt jdtt jt jt jf f fdd	}dt jdt jdt jdt jdt jdt jdtt jt jt jf f fd
d}t||  tj| d S )Nr&  r   rd   r(  rX   r'  r   c           	         s@   t |} |||\}}tt| |||dd}|d ||d fS r)  r,  )	r&  r   rd   r(  rX   r'  r   r   r.  rD   r$   r%   rE     s   
zIAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.patternc                    sV   t d usJ dttf||d | || jt jj|d	 j }|d |d |d fS )Nr   r  r/   r   r/  )r   r   r   r   r   r0  r   r   )r&  r   rd   r(  rX   r'  r  rD   r$   r%   rL     s"   zMAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacementr  rR   r$   rD   r%   rS     sB   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.registerr  r$   r$   r   r%   r1    s    r1  c                       sj   e Zd Zdeddf fddZedddZdedefd	d
Z	e
jdejddfddZdddZ  ZS )AllReduceFusionPassr   r   Nc                    sd  t  | d| _t | _| jdkrtd d S tdd| _|j	d u r+td d S |j	
 | _t j| _t }| jtjk}td u rJtd d S |jj| j}|d u r_td| j d S |rcd	nd
}|| j|  | _t| j|jj| _tjd|d  d| j dd tj|| j| j| j| j|d\| _}|at || j|| jd| _!| "  | #|| j d S )NTr   z3AllReduce fusion pass is disabled for tp_size <= 1.all_reduce_fusion_passr   z;AllReduce fusion pass is disabled for missing model_config.zTFlashinfer is not installed or comm module not found, skipping allreduce fusion passzZFlashinfer allreduce fusion is not supported for world size %s or max size is not providedr/   r   zFlashinfer max size: r   zB MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: global)scope)tp_rankr"   r   r   groupr   )r   r;   r   r   )$r   r&   disabledr   r"   r   warning_oncer   r   model_configget_hidden_sizer   r	   rJ   r7  r   r   r*   rc   r   warningr   pass_configflashinfer_max_sizer   minscheduler_configmax_num_batched_tokens
debug_once1trtllm_create_ipc_workspace_for_all_reduce_fusionipc_handlesr   r   r   register_patternsr   )r#   r   r   r   max_sizer   workspace_tensorr   r$   r%   r&   ?  sz   




zAllReduceFusionPass.__init__c                 C   s   dD ]b}t || j| j| j| j t|| j| j| j| j tdrAt	|| j| j| j| j t
|| j| j| j| j t|| j| j| j| j t|| j| j| j| j tjjj  qd| _d S )N)gh㈵>gư>r   F)r  r   r   r   rS   r   r  r   has_device_capabilityr   r1  r   r  r*   	_inductorpattern_matcher_seen_patternsclearr8  )r#   r   r$   r$   r%   rE    s\   







z%AllReduceFusionPass.register_patternsr   c                 C   s$   | j r
td dS t|j| jkS )Nz"AllReduce fusion pass is disabled.F)r8  r   r9  r   r   r   )r#   r   r$   r$   r%   r     s   
z+AllReduceFusionPass.is_applicable_for_ranger   c                 C   s4   | j r
td d S | j|| _td| j d S )NzAllReduceFusionPass disabledr   )r8  r   r   r   r   r   r   r$   r$   r%   r     s
   
zAllReduceFusionPass.__call__c                 C   s0   t | ddrd S td urt| j| j d S d S )Nr8  T)getattrr   +trtllm_destroy_ipc_workspace_for_all_reducerD  r7  rD   r$   r$   r%   __del__  s   zAllReduceFusionPass.__del__)r   N)r'   r(   r)   r   r&   r   rE  r   r   r   r   r   r   r   r   rO  r   r$   r$   r   r%   r2  >  s    I.r2  )NNNN)Ximportlib.utilr   typesr   r*   torch._inductor.pattern_matcherrI  rJ  rO   torch.fxr   *torch._higher_order_ops.auto_functionalizer   r   #torch.distributed._symmetric_memoryr   vllm.configr   vllm.config.utilsr   vllm.distributedr	   r
   vllm.distributed.parallel_stater   r   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   vllm.utils.torch_utilsr   inductor_passr   matcher_utilsr   r   r   vllm_inductor_passr   r   	fp8_dtyper`   r   __annotations__flashinfer.commcomm_flashinfer_commhasattrImportErrorr'   r   r=   r   r   r@   r-  r   r-   rV   r^   ru   r   r   r   r   r   r   r  r   r   r   rN   r   r   r   rA   r   r   r   r  r  r  r   r1  r2  r$   r$   r$   r%   <module>   s0  
!#D>ID8
	

Y	


<EHKX`