o
    
۾izy                  !   @   sv  U d dl mZ d dlmZ d dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( e) Z*ee+Z,dZ-edB e.d< edrzd dl/m0Z1 e2e1dre1Z-W n	 e3y   Y nw ee+Z,e2ej4j5drej4j5j6j7Z8dddddddddZ9e:e;e:e;e<f f e.d< dddddddddZ=e:e;e:e;e<f f e.d< e-dursda>dZ?				dId ej@d!ej@d"ej@d#e<d$e;d%e;d&eAd'eAd(eAd)e;d*e;d+ej@dB d,ej@dB d-ej@dB d.ej@dB d/df d0d1ZB				dId ej@d!ej@d"ej@d#e<d$e;d%e;d&eAd'eAd(eAd)e;d*e;d+ej@dB d,ej@dB d-ej@dB d.ej@dB d/df d2d3ZCed4eBg d5eCd6 ej4jDjEj7ZEG d7d8 d8ZFG d9d: d:ZGG d;d< d<eGZHG d=d> d>eGZIG d?d@ d@eGZJG dAdB dBeGZKG dCdD dDeGZLG dEdF dFeGZMG dGdH dHe$ZNdS )J    )	find_spec)
ModuleTypeN)auto_functionalized)PatternMatcherPass)
VllmConfig)Range)get_tp_group tensor_model_parallel_all_reduce)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)init_logger)kFp8StaticTensorSym)current_platform)direct_register_custom_op   )enable_fake_mode)VllmInductorPassVllmPatternMatcherPass   )MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNormflashinfer_comm
flashinfertrtllm_allreduce_fusionscaled_fp4_quant@   g      ?)r             )Z   d   FI_ALLREDUCE_FUSION_MAX_SIZE_MBr   #_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB   allreduce_inresidual	rms_gammarms_eps
world_rank
world_sizelaunch_with_pdltrigger_completion_at_endfp32_accmax_token_numpattern_codenorm_out	quant_out	scale_outscale_factorreturnc              	   C   sZ  | j \}}|  }|| | }|	| | }||ks)J d| d|	 d| d| t }|d ur5| nd }t|i |d }|d u pJ||t k}td usSJ d|d u r\| }|}n| }t	j
di d| d| j d d	|d
|d|d|d|d|d|d| j d dtd|d|d|d|d|
dd d|d|dt	jjd| d S )NzCurrent tensor size z is larger than max token num z * hidden size z * element size z0Flashinfer must be enabled when using flashinferr%   	token_numr   residual_inresidual_outr0   r'   r(   r)   r*   
hidden_dimworkspace_ptrsr+   use_oneshotr,   r-   r/   allreduce_outr1   r2   layout_coder3    )shapeelement_sizer   get_device_capabilityto_intr#   getMiB_FI_WORKSPACE_TENSORr   r   QuantizationSFLayoutSWIZZLED_128x4)r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   
num_tokenshidden_sizer@   current_tensor_sizemax_tensor_sizecurr_devicedevice_capabilitymax_one_shot_sizer;   r7   r>   r>   g/home/ubuntu/.local/lib/python3.10/site-packages/vllm/compilation/passes/fusion/allreduce_rms_fusion.py call_trtllm_fused_allreduce_normU   s   




	

rP   c                 C   s   d S Nr>   )r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r>   r>   rO   %call_trtllm_fused_allreduce_norm_fake   s   rR   &flashinfer_trtllm_fused_allreduce_norm)r%   r&   r0   r1   r2   )op_nameop_funcmutates_args	fake_implc                   @   sN   e Zd ZdZ		ddededededd	f
d
dZdeeeeB f fddZ	d	S )FlashInferFusedAllReduceParamsz5Parameters for FlashInfer fused allreduce operations.F   rankr*   use_fp32_lamportr.   r4   Nc                 C   s.   || _ || _|| _d| _d| _d| _|| _d S )NT)rZ   r*   r[   r,   r+   r-   r.   )selfrZ   r*   r[   r.   r>   r>   rO   __init__   s   
z'FlashInferFusedAllReduceParams.__init__c                 C   s   | j | j| j| j| j| jdS )N)r)   r*   r+   r,   r-   r.   )rZ   r*   r+   r,   r-   r.   r\   r>   r>   rO   !get_trtllm_fused_allreduce_kwargs   s   z@FlashInferFusedAllReduceParams.get_trtllm_fused_allreduce_kwargs)FrY   )
__name__
__module____qualname____doc__intboolr]   dictstrr_   r>   r>   r>   rO   rX      s     
rX   c                   @   s(   e Zd ZdejdedB ddfddZdS )BasePatterndtypedeviceNr4   c                 C   s    || _ || _t | _t | _d S rQ   )ri   rj   r   tpr   tp_size)r\   ri   rj   r>   r>   rO   r]      s   zBasePattern.__init__)r`   ra   rb   torchri   rg   r]   r>   r>   r>   rO   rh      s     rh   c                
       b   e Zd ZdZdedejdedB deddf
 fdd	Z	de
ej fd
dZdeddfddZ  ZS )AllReduceRMSNormPatternz
    This pattern replaces the allreduce + rms norm (without residual)
    with fused flashinfer implementation.
    Applies to allreduce + rmsnorm before attn in the first Transformer block.
    epsilonri   rj   Nallreduce_paramsr4   c                    (   t  || || _|| _t|| _d S rQ   superr]   rp   rq   r   rmsnorm_matcherr\   rp   ri   rj   rq   	__class__r>   rO   r]         z AllReduceRMSNormPattern.__init__c                 C   s   | j  \}}|| j|gS rQ   ru   inputstori   )r\   inputweightr>   r>   rO   
get_inputs   s   z"AllReduceRMSNormPattern.get_inputspm_passc                    sp   dt jdt jdtt jt jf f fdd}dt jdt jdtt jt jf f fdd}t||  tj| d S )Nr}   r~   r4   c                    s   t | } ||}||fS rQ   r	   ru   )r}   r~   allreduce_outputrmsr^   r>   rO   pattern  s   z1AllReduceRMSNormPattern.register.<locals>.patternc                    sb   t | }t | }td usJ dttf| ||d d | jtjjd j	
 }|d |d fS )NFlashInfer must be enabledr%   r&   r0   r1   r2   r'   r(   r/      r   )rm   
zeros_like
empty_liker   r   rS   rp   AllReduceFusionPatternkARResidualRMSNormrq   r_   )r}   r~   r&   
rms_result	allreducer^   r>   rO   replacement  s$   


z5AllReduceRMSNormPattern.register.<locals>.replacementrm   Tensortuplepmregister_replacementr   fwd_onlyr\   r   r   r   r>   r^   rO   register  s"   z AllReduceRMSNormPattern.registerr`   ra   rb   rc   floatrm   ri   rg   rX   r]   listr   r   r   r   __classcell__r>   r>   rw   rO   ro          ro   c                
       rn   )AllReduceFusedAddRMSNormPatternz
    This pattern replaces the allreduce + rms norm (with residual)
    with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn and mlp + rmsnorm before attn.
    rp   ri   rj   Nrq   r4   c                    rr   rQ   rt   r]   rp   rq   r   ru   rv   rw   r>   rO   r]   /  ry   z(AllReduceFusedAddRMSNormPattern.__init__c                 C   s"   | j  \}}}||| j|gS rQ   rz   )r\   r}   r&   r~   r>   r>   rO   r   ;  s   z*AllReduceFusedAddRMSNormPattern.get_inputsr   c              
      s   dt jdt jdt jdtt jt jf f fdd}dt jdt jdt jdtt jt jf f fdd}t||  tj| d	d
 }t||||  tj| d S )Nr&   r}   r~   r4   c                    s"   t |} ||| \}} || fS rQ   r   )r&   r}   r~   r   r   r^   r>   rO   r   B  s   z9AllReduceFusedAddRMSNormPattern.register.<locals>.patternc                    sN   t d usJ dttf|| d d d | jt jjd j }|d |d fS )Nr   r   r   r   )r   r   rS   rp   r   r   rq   r_   )r&   r}   r~   r   r^   r>   rO   r   I  s    
z=AllReduceFusedAddRMSNormPattern.register.<locals>.replacementc                    s    fddS )Nc                    s    | ||d S )Nr   r>   )abcfnr>   rO   <lambda>b  s    zLAllReduceFusedAddRMSNormPattern.register.<locals>.<lambda>.<locals>.<lambda>r>   r   r>   r   rO   r   b  s    z:AllReduceFusedAddRMSNormPattern.register.<locals>.<lambda>r   )r\   r   r   r   first_return_onlyr>   r^   rO   r   A  s:   z(AllReduceFusedAddRMSNormPattern.registerr   r>   r>   rw   rO   r   (  r   r   c                
       rn   )*AllReduceFusedRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (without residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn
    in the first Transformer block.
    rp   ri   rj   Nrq   r4   c                    :   t  || || _|| _tj| _t|| _t	t
| _d S rQ   )rt   r]   rp   rq   rm   float8_e4m3fnquant_dtyper   ru   r   r   quant_matcherrv   rw   r>   rO   r]   u  s   
z3AllReduceFusedRMSNormStaticQuantFP8Pattern.__init__c                 C   s.   | j  \}}| j \}}|| j||gS rQ   ru   r{   r   r|   ri   )r\   r}   r~   _scaler>   r>   rO   r     s   z5AllReduceFusedRMSNormStaticQuantFP8Pattern.get_inputsr   c              
      s|   dt jdt jdt jdtt jt jf f fdd}dt jdt jdt jdtt jt jf f fdd}t||  tj| d S )	Nr}   r~   r   r4   c                    s,   t | } ||} ||\}}||fS rQ   r	   ru   r   )r}   r~   r   
all_reducer   quantr   r^   r>   rO   r     s   zDAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.patternc                    st   t | }t | }t j|  jd}td usJ dttf| |||d | jtjj	|d	 j
 }|d |d fS )Nri   r   	r%   r&   r0   r1   r2   r'   r(   r/   r3   r   r   )rm   r   r   r   r   r   rS   rp   r   kARResidualRMSNormFP8Quantrq   r_   )r}   r~   r   r&   
result_rmsresult_quantr   r^   r>   rO   r     s(   

zHAllReduceFusedRMSNormStaticQuantFP8Pattern.register.<locals>.replacementr   r   r>   r^   rO   r     s*   
z3AllReduceFusedRMSNormStaticQuantFP8Pattern.registerr   r>   r>   rw   rO   r   m  s    r   c                
       rn   )-AllReduceFusedAddRMSNormStaticQuantFP8Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static fp8 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    rp   ri   rj   Nrq   r4   c                    r   rQ   )rt   r]   rp   rq   rm   r   r   r   ru   r   r   r   rv   rw   r>   rO   r]     s   
z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.__init__c                 C   s2   | j  \}}}| j \}}||| j||gS rQ   r   )r\   r}   r&   r~   r   r   r>   r>   rO   r     s   z8AllReduceFusedAddRMSNormStaticQuantFP8Pattern.get_inputsr   c                    s   dt jdt jdt jdt jdtt jt jf f
 fdd}dt jdt jdt jdt jdtt jt jf f
 fdd	}t||  tj| d S )
Nr&   r}   r~   r   r4   c           	         s2   t |} ||| \}} ||\}}||fS rQ   r   )	r&   r}   r~   r   r   r   resr   r   r^   r>   rO   r     s   zGAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.patternc                    s`   t j| jd}td usJ dttf|| d |d | jtjj|d	 j	
 }|d |d fS )Nr   r   r   r   r   )rm   r   r   r   r   rS   rp   r   r   rq   r_   )r&   r}   r~   r   r   r   r^   r>   rO   r     s$   zKAllReduceFusedAddRMSNormStaticQuantFP8Pattern.register.<locals>.replacementr   r   r>   r^   rO   r     s2   z6AllReduceFusedAddRMSNormStaticQuantFP8Pattern.registerr   r>   r>   rw   rO   r     s    r   c                
       rn   ),AllReduceFusedRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (without residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to allreduce + rmsnorm + quant before attn
    in the first Transformer block.
    rp   ri   rj   Nrq   r4   c                    rr   rQ   rs   rv   rw   r>   rO   r]     ry   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.__init__c                 C   s   t jg d| j| jd}t jd| jt jd}t jddg| jt jd}t jdg| j| jd}t jddg| jt jd}|||||gS )N)r      r   rj   ri   r   r   r   r      r   rm   emptyrj   ri   uint8float32int32)r\   r}   quant_resultinput_global_scaler~   output_scaler>   r>   rO   r     s   z7AllReduceFusedRMSNormStaticQuantNVFP4Pattern.get_inputsr   c                    s   dt jdt jdt jdt jdt jdtt jt jt jf f fdd}dt jdt jdt jdt jdt jdtt jt jt jf f fd	d
}t||  tj| d S )Nr}   r   r~   r   r   r4   c                    s:   t | } ||}tt||||dd}|d ||d fS NT)outputr}   r   input_scaleis_sf_swizzled_layoutr   r   r	   ru   r   STATIC_FP4_QUANT_OP)r}   r   r~   r   r   r   r   quant_out_tupler^   r>   rO   r      s   
zFAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.patternc                    sj   t | }t | }td usJ dttf| ||||| jtjj|d	 j	
 }|d |d |d fS )Nr   r   r   r      )rm   r   r   r   r   rS   rp   r   kARResidualRMSNormFP4Quantrq   r_   )r}   r   r~   r   r   r&   r   r   r^   r>   rO   r   5  s&   

zJAllReduceFusedRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacementr   r   r>   r^   rO   r     s:   z5AllReduceFusedRMSNormStaticQuantNVFP4Pattern.registerr   r>   r>   rw   rO   r      s    r   c                
       rn   )/AllReduceFusedAddRMSNormStaticQuantNVFP4Patternz
    This pattern replaces the allreduce + rms norm (with residual)
    + static nvfp4 quant with fused flashinfer implementation.
    Applies to o_proj + rmsnorm after attn + quant and
    mlp + rmsnorm + quant before attn.
    rp   ri   rj   Nrq   r4   c                    rr   rQ   r   rv   rw   r>   rO   r]   `  ry   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.__init__c                 C   s   t jddg| j| jd}t jddg| j| jd}t jddg| j| jd}t jd| jt jd}t jddg| jt jd}t jddg| jt jd}||||||gS )Nr   r   r   r   r   r   r   )r\   r}   r&   r~   r   r   r   r>   r>   rO   r   l  s   z:AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.get_inputsr   c                    s   dt jdt jdt jdt jdt jdt jdtt jt jt jf f fdd	}dt jdt jdt jdt jdt jdt jdtt jt jt jf f fd
d}t||  tj| d S )Nr   r&   r}   r   r~   r   r4   c           	         s@   t |} |||\}}tt| |||dd}|d ||d fS r   r   )	r   r&   r}   r   r~   r   r   r   r   r^   r>   rO   r     s   
zIAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.patternc                    sV   t d usJ dttf||d | || jt jj|d	 j }|d |d |d fS )Nr   r   r   r   r   )r   r   rS   rp   r   r   rq   r_   )r   r&   r}   r   r~   r   r   r^   r>   rO   r     s"   zMAllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.register.<locals>.replacementr   r   r>   r^   rO   r     sB   z8AllReduceFusedAddRMSNormStaticQuantNVFP4Pattern.registerr   r>   r>   rw   rO   r   X  s    r   c                       sj   e Zd Zdeddf fddZedddZdedefd	d
Z	e
jdejddfddZdddZ  ZS )AllReduceFusionPassconfigr4   Nc                    sd  t  | d| _t | _| jdkrtd d S tdd| _|j	d u r+td d S |j	
 | _t j| _t }| jtjk}td u rJtd d S |jj| j}|d u r_td| j d S |rcd	nd
}|| j|  | _t| j|jj| _tjd|d  d| j dd tj|| j| j| j| j|d\| _}|at || j|| jd| _!| "  | #|| j d S )NTr   z3AllReduce fusion pass is disabled for tp_size <= 1.all_reduce_fusion_pass)	pass_namez;AllReduce fusion pass is disabled for missing model_config.zTFlashinfer is not installed or comm module not found, skipping allreduce fusion passzZFlashinfer allreduce fusion is not supported for world size %s or max size is not providedr   r   zFlashinfer max size: r$   zB MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: global)scope)tp_rankrl   r.   r8   groupr[   )rZ   r*   r[   r.   )$rt   r]   disabledr   rl   loggerwarning_oncer   patternsmodel_configget_hidden_sizer8   r   device_groupr   r
   model_dtyperm   r   r   warningcompilation_configpass_configflashinfer_max_sizer.   minscheduler_configmax_num_batched_tokens
debug_once1trtllm_create_ipc_workspace_for_all_reduce_fusionipc_handlesrE   rX   rq   register_patternsdump_patterns)r\   r   rZ   r[   max_sizer@   workspace_tensorrw   r>   rO   r]     sz   




zAllReduceFusionPass.__init__c                 C   s   dD ]b}t || j| j| j| j t|| j| j| j| j tdrAt	|| j| j| j| j t
|| j| j| j| j t|| j| j| j| j t|| j| j| j| j tjjj  qd| _d S )N)gh㈵>gư>r!   F)r   r   rj   rq   r   r   r   r   has_device_capabilityr   r   ro   r   rm   	_inductorpattern_matcher_seen_patternsclearr   )r\   rp   r>   r>   rO   r     s\   







z%AllReduceFusionPass.register_patternscompile_rangec                 C   s$   | j r
td dS t|j| jkS )Nz"AllReduce fusion pass is disabled.F)r   r   r   re   endr.   )r\   r   r>   r>   rO   is_applicable_for_range1  s   
z+AllReduceFusionPass.is_applicable_for_rangegraphc                 C   s4   | j r
td d S | j|| _td| j d S )NzAllReduceFusionPass disabledzReplaced %s patterns)r   r   debugr   applymatched_count)r\   r   r>   r>   rO   __call__7  s
   
zAllReduceFusionPass.__call__c                 C   s0   t | ddrd S td urt| j| j d S d S )Nr   T)getattrr   +trtllm_destroy_ipc_workspace_for_all_reducer   r   r^   r>   r>   rO   __del__@  s   zAllReduceFusionPass.__del__)r4   N)r`   ra   rb   r   r]   r   r   r   re   r   r   time_and_logfxGraphr   r   r   r>   r>   rw   rO   r     s    I.r   )NNNN)Oimportlib.utilr   typesr   rm   torch._inductor.pattern_matcherr   r   r   torch.fxr   *torch._higher_order_ops.auto_functionalizer   r   vllm.configr   vllm.config.utilsr   vllm.distributedr   r	   vllm.distributed.parallel_stater
   r   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   vllm.utils.torch_utilsr   inductor_passr   vllm_inductor_passr   r   matcher_utilsr   r   r   	fp8_dtype	FP8_DTYPEr`   r   r   __annotations__flashinfer.commcomm_flashinfer_commhasattrImportErrorops_Cr   defaultr   r"   rf   rd   r   r#   rE   rD   r   re   rP   rR   vllmrS   rX   rh   ro   r   r   r   r   r   r   r>   r>   r>   rO   <module>   s"  

	

Y	


<EHKX`