o
    
۾isC                     @   s  d dl Z d dlm  mZ d dl mZ d dlmZ d dlmZ d dl	Z
d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d
dlmZ d
dlmZmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ee%Z&e' Z(G dd dZ)G dd de)Z*G dd de)Z+G dd de)Z,G dd de)Z-G dd deZ.G dd deZ/G dd  d eZ0G d!d" d"Z1G d#d$ d$eZ2dS )%    N)fx)PatternMatcherPass)
OpOverload)rocm_aiter_ops)
VllmConfig)init_logger)
GroupShapeQuantKey	ScaleDesc)current_platform   )enable_fake_mode)VllmInductorPassVllmPatternMatcherPass   )ActivationQuantPattern)MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNormMatcherSiluAndMul)FusedRMSQuantKeyc                   @   s&   e Zd Z	ddededefddZdS )	AiterRMSNormQuantPatternTepsilonkeymatch_aiter_quantc                 C   sD   || _ |jj| _|jst|ddnt|dd| _t|j|d| _	d S NT)match_rocm_aiter)
r   quantdtypequant_dtype	fused_addr   r   rmsnorm_matcherr   quant_matcher)selfr   r   r    r$   d/home/ubuntu/.local/lib/python3.10/site-packages/vllm/compilation/passes/fusion/rocm_aiter_fusion.py__init__'   s   

z!AiterRMSNormQuantPattern.__init__N)T)__name__
__module____qualname__floatr   boolr&   r$   r$   r$   r%   r   &   s    r   c                       `   e Zd ZdZe Zdejdfde	de
jdedededd	f fd
dZdedd	fddZ  ZS )AiterRMSNormDynamicQuantPatternz-AITER RMSNorm + Dynamic Quantization pattern.Tr   r   r   group_shape	symmetricreturnNc                    s8   t tjd|}tdt|||dd}t ||| d S NFr   scaler/   r    r   r
   torchfloat32r   r	   superr&   r#   r   r   r   r.   r/   r3   r   	__class__r$   r%   r&   =      z(AiterRMSNormDynamicQuantPattern.__init__pm_passc                    r   dt jdt jdtt jt jf f fdd}dt jdt jdtt jt jf f fdd}t|| j tj| d S )Ninputweightr0   c                    "     | |} |\}}||fS Nr!   r"   r?   r@   
result_rmsresultr3   r#   r$   r%   patternN      z9AiterRMSNormDynamicQuantPattern.register.<locals>.patternc                    s&    j | | j jd}|d |d fS )N)xr@   r   r   r   r   FUSED_OPr   r   )r?   r@   rF   rG   r$   r%   replacementV   s   z=AiterRMSNormDynamicQuantPattern.register.<locals>.replacementr6   Tensortuplepmregister_replacementr!   inputsfwd_onlyr#   r=   rH   rM   r$   rG   r%   registerM   s*   z(AiterRMSNormDynamicQuantPattern.register)r'   r(   r)   __doc__r   "get_rmsnorm_fused_dynamic_quant_oprL   r   	PER_TOKENr*   r6   r   r+   r&   r   rV   __classcell__r$   r$   r:   r%   r-   8   (    r-   c                       r,   )'AiterFusedAddRMSNormDynamicQuantPatternz7AITER RMSNorm Fused Add + Dynamic Quantization pattern.Tr   r   r   r.   r/   r0   Nc                    s8   t tjd|}tdt|||dd}t ||| d S NFTr2   r4   r5   r9   r:   r$   r%   r&   q   r<   z0AiterFusedAddRMSNormDynamicQuantPattern.__init__r=   c                       dt jdt jdt jdtt jt jt jf f fdd}dt jdt jdt jdtt jt jt jf f fdd}t|| j tj| d S )	Nr?   r@   residualr0   c                    *     | ||\}} |\}}|||fS rB   rC   r?   r@   r_   rE   residual_outrF   r3   rG   r$   r%   rH         
zAAiterFusedAddRMSNormDynamicQuantPattern.register.<locals>.patternc                    s.    j | || j jd}|d |d |d fS )N)rJ   r_   r@   r   r   r   r   r   rK   )r?   r@   r_   rF   rG   r$   r%   rM      s   zEAiterFusedAddRMSNormDynamicQuantPattern.register.<locals>.replacementrN   rU   r$   rG   r%   rV      s2   
z0AiterFusedAddRMSNormDynamicQuantPattern.register)r'   r(   r)   rW   r   &get_rmsnorm_fused_add_dynamic_quant_oprL   r   rY   r*   r6   r   r+   r&   r   rV   rZ   r$   r$   r:   r%   r\   l   r[   r\   c                       \   e Zd ZdZe Z		ddedej	de
dededd	f fd
dZdedd	fddZ  ZS )AiterRMSFp8GroupQuantPatternzw
    This pattern fuses aiter rms_norm & group fp8 quant custom
    ops into an aiter rms_norm_group_fp8_quant op.
    Tr   r   r.   r   r/   r0   Nc                    s8   t tjd|}tdt|||dd}t ||| d S r1   r5   r#   r   r   r.   r   r/   r3   r   r:   r$   r%   r&      r<   z%AiterRMSFp8GroupQuantPattern.__init__r=   c                    r>   )Nr?   r@   r0   c                    rA   rB   rC   rD   rG   r$   r%   rH      rI   z6AiterRMSFp8GroupQuantPattern.register.<locals>.patternc                    s$    j | | jdd}|d |d fS )N   )rJ   r@   variance_epsilon
group_sizer   r   rL   r   )r?   r@   atrG   r$   r%   rM      s   z:AiterRMSFp8GroupQuantPattern.register.<locals>.replacementrN   rU   r$   rG   r%   rV      s"   z%AiterRMSFp8GroupQuantPattern.registerTT)r'   r(   r)   rW   r    get_rmsnorm_group_fused_quant_oprL   r*   r6   r   r   r+   r&   r   rV   rZ   r$   r$   r:   r%   rf      &    rf   c                       re   )$AiterFusedAddRMSFp8GroupQuantPatternz
    This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops
    into a aiter rms_norm_with_add_group_fp8_quant op.
    Tr   r   r.   r   r/   r0   Nc                    s8   t tjd|}tdt|||dd}t ||| d S r]   r5   rg   r:   r$   r%   r&      r<   z-AiterFusedAddRMSFp8GroupQuantPattern.__init__r=   c                    r^   )	Nr?   r@   r_   r0   c                    r`   rB   rC   ra   rG   r$   r%   rH      rc   z>AiterFusedAddRMSFp8GroupQuantPattern.register.<locals>.patternc                    s,    j | || jdd}|d |d |d fS )Nrh   )rJ   r_   r@   ri   rj   r   r   r   rk   )r?   r@   r_   rl   rG   r$   r%   rM      s   	zBAiterFusedAddRMSFp8GroupQuantPattern.register.<locals>.replacementrN   rU   r$   rG   r%   rV      s*   
z-AiterFusedAddRMSFp8GroupQuantPattern.registerrm   )r'   r(   r)   rW   r   $get_rmsnorm_group_add_fused_quant_oprL   r*   r6   r   r   r+   r&   r   rV   rZ   r$   r$   r:   r%   rp      ro   rp   c                       sV   e Zd ZdZededdf fddZejde	j
ddfdd	Zdefd
dZ  ZS )RocmAiterRMSNormQuantFusionPassz
    This pass fuses aiter rms_norm & vllm/aiter quant custom ops
    into a fused rms_norm_quant op.
    It also supports fused_add_rms_norm.
    configr0   Nc                    s   t  | tdd| _dD ]7}t|ttdd| j t|ttdd| j dD ]}t	|t|d| j t
|t|d| j q,q| || j d S )N%rocm_aiter_rms_norm_quant_fusion_pass	pass_namegh㈵>gư>r   rh   )TF)r   )r8   r&   r   patternsrf   	FP8_DTYPEr   rV   rp   r-   r\   dump_patterns)r#   rs   r   r   r:   r$   r%   r&     s0   


z(RocmAiterRMSNormQuantFusionPass.__init__graphc                 C       | j || _td| j d S NzReplaced %s patternsrx   applymatched_countloggerdebugr#   r{   r$   r$   r%   __call__8     z(RocmAiterRMSNormQuantFusionPass.__call__c                 C   s   t tttg}| j| g|R  S rB   )r-   r\   rf   rp   hash_sourcer#   fusion_patternsr$   r$   r%   uuid=  s   z$RocmAiterRMSNormQuantFusionPass.uuid)r'   r(   r)   rW   r   r   r&   r   time_and_logr   Graphr   strr   rZ   r$   r$   r:   r%   rr     s    #rr   c                   @   sP   e Zd ZdZe ZdeddfddZde	e
j fddZd	eddfd
dZdS ) AiterSiluMulFp8GroupQuantPatternz
    This pattern fuses aiter silu_and_mul & group fp8 quant custom
    ops into an aiter silu_and_mul_group_fp8_quant op.
    quant_opr0   Nc                 C   s   t  | _|| _d S rB   )r   silu_and_mul_matcherr   )r#   r   r$   r$   r%   r&   O  s   
z)AiterSiluMulFp8GroupQuantPattern.__init__c                 C   s   | j  d gS )Nr   )r   rS   rG   r$   r$   r%   
get_inputsS  s   z+AiterSiluMulFp8GroupQuantPattern.get_inputsr=   c                    sd   dt jdtt jt jf f fdd}dt jdtt jt jf f fdd}t||  tj| d S )Nr?   r0   c                    s&     | } |d}|d |d fS )Nrh   r   r   )r   r   )r?   at1at2rG   r$   r%   rH   Y  s   
z:AiterSiluMulFp8GroupQuantPattern.register.<locals>.patternc                    s    j | dd}|d |d fS )Nrh   )rJ   rj   r   r   )FUSED_SILU_MUL_QUANT_OP)r?   rl   rG   r$   r%   rM   `  s   z>AiterSiluMulFp8GroupQuantPattern.register.<locals>.replacementr6   rO   rP   rQ   rR   r   rT   rU   r$   rG   r%   rV   X  s   z)AiterSiluMulFp8GroupQuantPattern.register)r'   r(   r)   rW   r   $get_act_mul_fused_fp8_group_quant_opr   r   r&   listr6   rO   r   r   rV   r$   r$   r$   r%   r   G  s    r   c                       st   e Zd ZdZe Zejj	j
jZeegZededdf fddZejdejjddfdd	Zdefd
dZ  ZS )'RocmAiterSiluMulFp8GroupQuantFusionPassah  
    This pass fuses a pre-defined set of custom ops into fused ops.
    It uses the torch pattern matcher to find the patterns and replace them.

    Because patterns can only be registered once, the pass is a singleton.
    This will be addressed in a future version of PyTorch:
    https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
    rs   r0   Nc                    sF   t  | tdd| _| jD ]
}t|| j q| || j d S )N/rocm_aiter_silu_mul_fp8_group_quant_fusion_passru   )r8   r&   r   rx   	QUANT_OPSr   rV   rz   )r#   rs   r   r:   r$   r%   r&   z  s   
z0RocmAiterSiluMulFp8GroupQuantFusionPass.__init__r{   c                 C   r|   r}   r~   r   r$   r$   r%   r     r   z0RocmAiterSiluMulFp8GroupQuantFusionPass.__call__c                 C   s   t tg}tj| g|R  S rB   )r   r   r   r   r   r$   r$   r%   r     s   z,RocmAiterSiluMulFp8GroupQuantFusionPass.uuid)r'   r(   r)   rW   r   get_group_quant_opAITER_GROUP_FP8_QUANT_OPr6   opsvllm triton_per_token_group_quant_fp8defaultTRITON_GROUP_FP8_QUANT_OPr   r   r   r&   r   r   r   r   r   r   r   rZ   r$   r$   r:   r%   r   k  s    	r   c                   @   sT   e Zd ZdZe ZdededefddZ	de
ej fdd	Zd
eddfddZdS )AddAiterRMSNormPadPatternz|
    This pattern replaces an aiter_rmsnorm_with_add & a pad op
    with a custom triton_add_rmsnorm_pad op from AITER.
    r   hidden_sizex_pad_to_multiplec                 C   s$   || _ || _|| _t|dd| _d S r   )r   r   r   r   r!   )r#   r   r   r   r$   r$   r%   r&     s   z"AddAiterRMSNormPadPattern.__init__r0   c                 C   sL   | j  \}}}tjddg|j|jd}tjdg|j|jd}|||||gS )N      )r   device)r!   rS   r6   emptyr   r   )r#   r?   r@   r_   router_weightrouter_biasr$   r$   r%   r     s   z$AddAiterRMSNormPadPattern.get_inputsr=   Nc                    s   dt jdt jdt jdt jdt jdtt jt jt jf f fdd}dt jdt jdt jdt jdt jdtt jt jt jf f fd	d
}t||  tj| d S )Nr?   r@   r_   r   r   r0   c           
         sZ    j  j j   } | ||\}}tjj|||}tjjj	|d|fddd}	|	||fS )Nr   constantg        )modevalue)
r   r   r!   r6   r   r   rocm_unquantized_gemmnn
functionalpad)
r?   r@   r_   r   r   pad_sizerE   rb   router_logitsrF   rG   r$   r%   rH     s   

z3AddAiterRMSNormPadPattern.register.<locals>.patternc           	         sV    j | | j| jd}|d }tjj|d d d  jf ||}|d }|||fS )N)rJ   r@   ri   r_   r   r   r   )AITER_TRITON_ADD_RMSNORM_PAD_OPr   r   r6   r   r   r   r   )	r?   r@   r_   r   r   rl   result_paddedr   rb   rG   r$   r%   rM     s   
z7AddAiterRMSNormPadPattern.register.<locals>.replacementr   rU   r$   rG   r%   rV     s:   z"AddAiterRMSNormPadPattern.register)r'   r(   r)   rW   r   get_triton_add_rmsnorm_pad_opr   r*   intr&   r   r6   rO   r   r   rV   r$   r$   r$   r%   r     s    
r   c                       sP   e Zd ZdZdef fddZejdej	j
ddfdd	Zdefd
dZ  ZS )&RocmAiterTritonAddRMSNormPadFusionPassz~
    This pass replaces an AITER CK RMSNorm + residual add and a pad op
    with an triton_add_rmsnorm_pad op from AITER.
    rs   c                    sV   t  | tdd| _d}dD ]}dD ]}t|||| j qq| || j d S )N-rocm_aiter_triton_add_rmsnorm_pad_fusion_passru   i@  rw   )rh      )r8   r&   r   rx   r   rV   rz   )r#   rs   r   r   r   r:   r$   r%   r&     s   z/RocmAiterTritonAddRMSNormPadFusionPass.__init__r{   r0   Nc                 C   r|   r}   r~   r   r$   r$   r%   r     r   z/RocmAiterTritonAddRMSNormPadFusionPass.__call__c                 C   s   t | tS rB   )r   r   r   rG   r$   r$   r%   r     s   z+RocmAiterTritonAddRMSNormPadFusionPass.uuid)r'   r(   r)   rW   r   r&   r   r   r6   r   r   r   r   r   rZ   r$   r$   r:   r%   r     s    r   )3r6   torch._inductor.pattern_matcher	_inductorpattern_matcherrQ   r   r   
torch._opsr   7vllm.model_executor.layers.quantization.utils.fp8_utilsr   vllm._aiter_opsr   vllm.configr   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r	   r
   vllm.platformsr   inductor_passr   vllm_inductor_passr   r   act_quant_fusionr   matcher_utilsr   r   r   r   rms_quant_fusionr   r'   r   	fp8_dtypery   r   r-   r\   rf   rp   rr   r   r   r   r   r$   r$   r$   r%   <module>   s8   4638:$)G