o
    i:                     @   s4  d dl Z d dlm  mZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZ d
dlmZmZ e ZeeZG dd dZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&dS )    N)PatternMatcherPass)enable_symm_mem_for_group)
VllmConfig)Range)get_tp_group)$get_tensor_model_parallel_world_size)init_logger)current_platform   )enable_fake_mode)VllmInductorPassVllmPatternMatcherPassc                   @   s(   e Zd ZdejdedB ddfddZdS )BasePatterndtypedeviceNreturnc                 C   s    || _ || _t | _t | _d S )N)r   r   r   tpr   tp_size)selfr   r    r   f/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/compilation/passes/fusion/collective_fusion.py__init__   s   zBasePattern.__init__)__name__
__module____qualname__torchr   strr   r   r   r   r   r      s     r   c                   @   2   e Zd Zdeej fddZdeddfddZdS )GEMMReduceScatterPatternr   c                 C   s8   t jddg| j| jd}t jddg| j| jd}||gS )N      r   r   r   emptyr   r   )r   mul	mm_weightr   r   r   
get_inputs$   s   z#GEMMReduceScatterPattern.get_inputspm_passNc                    \   dt jdt jdt jf fdd}dt jdt jdt jf fdd}t||  tj| d S )Nr$   r%   r   c                    s4   t jjj| |}t jjjj|d j jj	d}|S Nr   dim
world_size
group_name)
r   opsatenmmdefaultvllmreduce_scatterr   r   unique_name)r$   r%   r0   r3   r   r   r   pattern*   s   
z2GEMMReduceScatterPattern.register.<locals>.patternc                    s"   t jjj| |dd jjjd}|S )Navgr   )scatter_dimr-   )r   r.   symm_memfused_matmul_reduce_scatterr   device_groupr-   )r$   r%   gemm_rsr5   r   r   replacement4   s   z6GEMMReduceScatterPattern.register.<locals>.replacementr   Tensorpmregister_replacementr&   fwd_onlyr   r'   r6   r=   r   r5   r   register)   s
     
z!GEMMReduceScatterPattern.register	r   r   r   listr   r?   r&   r   rD   r   r   r   r   r   #   s    r   c                   @   r   )AllGatherGEMMPatternr   c                 C   s8   t jddg| j| jd}t jddg| j| jd}||gS )Nr    r!   r"   )r   xweightr   r   r   r&   E   s   zAllGatherGEMMPattern.get_inputsr'   Nc                    r(   )NrH   rI   r   c                    s0   t jjjj| d j jjd}t jjj	||S r)   )
r   r.   r2   
all_gatherr1   r   r   r4   r/   r0   )rH   rI   rJ   r5   r   r   r6   L   s   
z.AllGatherGEMMPattern.register.<locals>.patternc                    s&   t jjj| |gd jjjd\}}|S )Nr   )
gather_dimr-   )r   r.   r9   fused_all_gather_matmulr   r;   r-   )rH   rI   	ag_output
mm_outputsr5   r   r   r=   Y   s   
z2AllGatherGEMMPattern.register.<locals>.replacementr>   rC   r   r5   r   rD   K   s    	zAllGatherGEMMPattern.registerrE   r   r   r   r   rG   D   s    rG   c                   @   r   )ScaledMMReduceScatterPatternr   c                 C   st   t jddg| jtd}t jddg| jtd dd}t jddg| jt jd}t jddg| jt jd}||||gS Nr   r!   r      )r   r#   r   	FP8_DTYPE
contiguous	transposefloat32)r   inputr%   scale_ascale_br   r   r   r&   h   s   z'ScaledMMReduceScatterPattern.get_inputsr'   Nc              
      t   dt jdt jdt jdt jdt jf
 fdd}dt jdt jdt jdt jdt jf
 fdd	}t||  tj| d S )
NrV   mat2rW   rX   r   c              	      sB   t jjjj| |||d d  jd}t jjjj|d j j	j
d}|S )NrZ   rW   rX   biasscale_result	out_dtyper   r*   )r   r.   r/   
_scaled_mmr1   r   r2   r3   r   r   r4   )rV   rZ   rW   rX   	scaled_mmr3   r5   r   r   r6   t   s    

	z6ScaledMMReduceScatterPattern.register.<locals>.patternc                    sR   g | j d d |j d }d}tjj| |||d|| jjj|d d  jd}|S NrQ   r   r7   F	shaper   r.   r2   *patched_fused_scaled_matmul_reduce_scatterr   r;   r-   r   )rV   rZ   rW   rX   output_shaper8   r<   r5   r   r   r=      s$   z:ScaledMMReduceScatterPattern.register.<locals>.replacementr>   rC   r   r5   r   rD   s   s2   z%ScaledMMReduceScatterPattern.registerrE   r   r   r   r   rO   g   s    rO   c                   @   r   )AllGatherScaledMMPatternr   c                 C   s   t jddg| jtd}t jddg| jtd dd}|jd | j }t j|dg| jt jd}t jddg| jt jd}||||gS N   r   r!   r   rQ   )	r   r#   r   rR   rS   rT   rd   r   rU   )r   rH   rI   s1rW   rX   r   r   r   r&      s   z#AllGatherScaledMMPattern.get_inputsr'   Nc              
      rY   )
NrH   rI   rW   rX   r   c              	      s>   t jjjj| d j jjd}t jjj	j||||d d  j
dS )Nr   r*   r[   )r   r.   r2   rJ   r1   r   r   r4   r/   r_   r   )rH   rI   rW   rX   rJ   r5   r   r   r6      s   

z2AllGatherScaledMMPattern.register.<locals>.patternc                    s>   t jjj| |g||gdd gd g jgdg jjjd
\}}|S Nr   F)rK   biasesresult_scales
out_dtypesuse_fast_accumr-   r   r.   r9   fused_all_gather_scaled_matmulr   r   r;   r-   )rH   rI   rW   rX   rM   rN   r5   r   r   r=      s   
z6AllGatherScaledMMPattern.register.<locals>.replacementr>   rC   r   r5   r   rD      s2   z!AllGatherScaledMMPattern.registerrE   r   r   r   r   rg      s    rg   c                   @   r   )#CutlassScaledMMReduceScatterPatternr   c                 C   s   t jddg| jtd}t jddg| jtd dd}t jddg| jt jd}t jddg| jt jd}t jddg| j| jd}|||||gS rP   )r   r#   r   rR   rS   rT   rU   r   )r   rV   r%   rW   rX   cutlass_mm_outputr   r   r   r&      s   z.CutlassScaledMMReduceScatterPattern.get_inputsr'   Nc                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jd	t jdt jdt jdt jdt jf fd
d}t||  tj| d S )NrV   rI   rW   rX   rs   r   c              	      sJ   t jjjt jjjj|| |||d d}t jjjj|d d j	 j
jd}|S )Noutaba_scalesb_scalesr\   rQ   r   r*   )r   r.   higher_orderauto_functionalized_Ccutlass_scaled_mmr1   r2   r3   r   r   r4   )rV   rI   rW   rX   rs   r}   r3   r5   r   r   r6      s    


z=CutlassScaledMMReduceScatterPattern.register.<locals>.patternrZ   c                    sR   g | j d d |j d }d}tjj| |||d|| jjj|d d  jd}|S ra   rc   )rV   rZ   rW   rX   rs   rf   r8   r<   r5   r   r   r=     s$   zACutlassScaledMMReduceScatterPattern.register.<locals>.replacementr>   rC   r   r5   r   rD      s:   z,CutlassScaledMMReduceScatterPattern.registerrE   r   r   r   r   rr      s    rr   c                   @   r   )AllGatherCutlassScaledMMPatternr   c                 C   s   t jddg| jtd}t jddg| jtd dd}|jd | j }t j|dg| jt jd}t jddg| jt jd}|jd }t j||g| j| j	d}|||||gS rh   )
r   r#   r   rR   rS   rT   rd   r   rU   r   )r   rH   rI   rj   rW   rX   s2outputr   r   r   r&   3  s   
z*AllGatherCutlassScaledMMPattern.get_inputsr'   Nc                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jdt jdt jdt jdt jdt jf fd	d
}t||  tj| d S )NrH   rI   rW   rX   r   r   c              	      sJ   t jjjj| d j jjd}t jjj	t jj
jj|||||d d}|d S )Nr   r*   rt   rQ   )r   r.   r2   rJ   r1   r   r   r4   rz   r{   r|   r}   )rH   rI   rW   rX   r   rJ   r}   r5   r   r   r6   F  s   

	z9AllGatherCutlassScaledMMPattern.register.<locals>.patternc                    s>   t jjj| |g||gdd gd g jgdg jjjd
\}}|S rk   rp   )rH   rI   rW   rX   r   rM   rN   r5   r   r   r=   \  s   
z=AllGatherCutlassScaledMMPattern.register.<locals>.replacementr>   rC   r   r5   r   rD   E  s:   z(AllGatherCutlassScaledMMPattern.registerrE   r   r   r   r   r~   2  s    r~   c                       sV   e Zd Zededdf fddZdedefddZe	j
d	ejddfd
dZ  ZS )AsyncTPPassconfigr   Nc                    s   t  | tt jj tdd| _t| j	| j
| j t| j	| j
| j | j	tjkr[t| j	| j
| j t| j	| j
| j t| j	| j
| j t| j	| j
| j | || j d S )Nasync_tp_pass)	pass_name)superr   r   r   r;   r-   r   patternsr   model_dtyper   rD   rG   r   bfloat16rO   rg   rr   r~   dump_patterns)r   r   	__class__r   r   r   w  s*   zAsyncTPPass.__init__compile_rangec                 C   s4   | j jr| j jr
dS t }t| o|j| dkS )NTr   )compilation_configsplitting_opsuse_inductor_graph_partitionr   boolis_single_sizeend)r   r   r   r   r   r   is_applicable_for_range  s   z#AsyncTPPass.is_applicable_for_rangegraphc                 C   s    | j || _td| j d S )NzReplaced %s patterns)r   applymatched_countloggerdebug)r   r   r   r   r   __call__  s   zAsyncTPPass.__call__)r   r   r   r   r   r   r   r   r   r   time_and_logfxGraphr   __classcell__r   r   r   r   r   v  s     r   )'r   torch._inductor.pattern_matcher	_inductorpattern_matcherr@   torch.fxr   r   #torch.distributed._symmetric_memoryr   vllm.configr   vllm.config.utilsr   vllm.distributedr   vllm.distributed.parallel_stater   vllm.loggerr   vllm.platformsr	   inductor_passr   vllm_inductor_passr   r   	fp8_dtyperR   r   r   r   r   rG   rO   rg   rr   r~   r   r   r   r   r   <module>   s.   !#D>ID