o
    
۾iN                     @   s.  U d dl mZmZ d dlZd dlm  mZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZmZmZmZmZmZ d d
lmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ee)Z*e+ Z,ej-Z.dededej/fddZ0dededej/fddZ1dededej/fddZ2dededej/fddZ3ej4j5j6j7Z8ej4j5j9j7Z:eej4j5j;j7eej4j5j<j7eej4j5j=j7iZ>e?eef e@d< eA reBej4j5drej4j5jCj7e>e< eA rej4j5jDj7e>e< ej4j5jDj7e>e< G dd deZEeEedej4j5jFj7eEed ej4j5jGj7eEedej4j5jHj7eEed ej4j5jHj7eEedej4j5jIj7eEed ej4j5jIj7eEedej4j5jIj7eEed ej4j5jIj7iZJe?eEef e@d!< G d"d# d#ZKG d$d% d%eKZLG d&d' d'eKZMG d(d) d)eKZNG d*d+ d+eKZOG d,d- d-eKZPG d.d/ d/eKZQG d0d1 d1e$ZRdS )2    )Any
NamedTupleN)fx)auto_functionalized)PatternMatcherPass)
OpOverload)
VllmConfigget_current_vllm_config)init_logger)

GroupShapeQuantKey	ScaleDesckFp8Dynamic64SymkFp8Dynamic128SymkFp8DynamicTensorSymkFp8DynamicTokenSymkFp8StaticTensorSymkNvfp4DynamickStaticTensorScale)current_platform   )enable_fake_mode)VllmInductorPassVllmPatternMatcherPass   )MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNormargskwargsreturnc                  O      t j| i |t jddS Ncuda)dtypedevice)torchemptybfloat16r   r    r*   c/home/ubuntu/.local/lib/python3.10/site-packages/vllm/compilation/passes/fusion/rms_quant_fusion.py
empty_bf16)      r,   c                  O   r!   r"   )r&   r'   float32r)   r*   r*   r+   
empty_fp32-   r-   r/   c                  O   r!   r"   )r&   r'   int32r)   r*   r*   r+   	empty_i321   r-   r1   c                  O   r!   r"   )r&   r'   int64r)   r*   r*   r+   	empty_i645   r-   r3   	QUANT_OPSscaled_fp4_quantc                   @   s0   e Zd ZU dZeed< eed< defddZdS )FusedRMSQuantKeyz
    Named tuple for identifying the type of RMSNorm + quant fusion.
    quant: type of quantization
    fused_add: does the op also perform the residual add
    quant	fused_addr    c                 C   s$   d| j  d| jrd dS d dS )NzFusedQuantKey(z, with outz
 residual))r7   r8   selfr*   r*   r+   __str__R   s   
zFusedRMSQuantKey.__str__N)	__name__
__module____qualname____doc__r   __annotations__boolstrr=   r*   r*   r*   r+   r6   H   s
   
 r6   FT	FUSED_OPSc                   @   s0   e Zd Z		d
dededededdf
dd	ZdS )RMSNormQuantPatternFepsilonkeyhas_col_major_scalesis_e8m0r    Nc                 C   sx   || _ |jj| _t }|jr|jjnd | _|tv s J d| t| | _|j	s,t
|nt|| _t|j||d| _d S )Nz'unsupported fused rmsnorm+quant op for rI   rJ   )rG   r7   r$   quant_dtyper	   model_configmodel_dtyperE   FUSED_OPr8   r   r   rmsnorm_matcherr   quant_matcher)r<   rG   rH   rI   rJ   configr*   r*   r+   __init__v   s   


zRMSNormQuantPattern.__init__)FF)r>   r?   r@   floatr6   rC   rS   r*   r*   r*   r+   rF   u   s    rF   c                	       F   e Zd Z	ddedejdeddf fddZd	eddfd
dZ	  Z
S )RMSNormStaticQuantPatternTrG   rL   	symmetricr    Nc                    (   t dt|t|dd}t || d S NFr$   scalerW   r8   r7   r6   r   r   superrS   )r<   rG   rL   rW   	fused_key	__class__r*   r+   rS         z"RMSNormStaticQuantPattern.__init__pm_passc                    s   dt jdt jdt jdt jf fdd}dt jdt jdt jdt jf fdd}g  j  j d	 }||  t|||tj| d S )
Ninputweightr[   r    c                    s     | |} ||d S )Nr   rP   rQ   )rd   re   r[   
result_rmsr;   r*   r+   pattern   s   z3RMSNormStaticQuantPattern.register.<locals>.patternc                    sD   | j  jd} tj| j| j jd}t j|| || j	d}|d S )Nr$   )r%   r$   )resultrd   re   r[   rG   r   )
torN   r&   r'   shaper%   rL   r   rO   rG   )rd   re   r[   rj   atr;   r*   r+   replacement   s   
z7RMSNormStaticQuantPattern.register.<locals>.replacementr   )r&   TensorrP   inputsrQ   pmregister_replacementfwd_onlyr<   rc   rh   rn   rp   r*   r;   r+   register   s2   z"RMSNormStaticQuantPattern.registerTr>   r?   r@   rT   r&   r$   rC   rS   r   ru   __classcell__r*   r*   r`   r+   rV          rV   c                	       rU   )!FusedAddRMSNormStaticQuantPatternTrG   rL   rW   r    Nc                    rX   )NTrZ   r\   r]   )r<   rG   rL   rW   rH   r`   r*   r+   rS      rb   z*FusedAddRMSNormStaticQuantPattern.__init__rc   c                    s   dt jdt jdt jdt jdtt jt jf f
 fdd}dt jdt jdt jdt jdtt jt jf f
 fdd	}g  j  j d
 }t|||tj| d S )Nrd   re   residualr[   r    c                    s*     | ||\}} ||\}}||fS Nrf   )rd   re   r{   r[   rg   rj   _r;   r*   r+   rh      s   z;FusedAddRMSNormStaticQuantPattern.register.<locals>.patternc              	      sH   | j  jd} tj|  jd}t j|| ||| jd}|d |d fS )Nri   )rj   rd   r{   re   r[   rG   r   r   )rk   rN   r&   
empty_likerL   r   rO   rG   )rd   re   r{   r[   rj   rm   r;   r*   r+   rn      s   z?FusedAddRMSNormStaticQuantPattern.register.<locals>.replacementr   )	r&   ro   tuplerP   rp   rQ   rq   rr   rs   rt   r*   r;   r+   ru      sD   z*FusedAddRMSNormStaticQuantPattern.registerrv   rw   r*   r*   r`   r+   rz      ry   rz   c                       V   e Zd Z			ddedejdedededed	d
f fddZde	d	d
fddZ
  ZS ) FusedAddRMSNormGroupQuantPatternTFrG   rL   group_shaperW   rI   rJ   r    Nc           	         sN   t tjd|}tdt|||dd}|| _|| _|| _t j	||||d d S )NFTrZ   r\   rK   )
r   r&   r.   r6   r   r   rI   rJ   r^   rS   	r<   rG   rL   r   rW   rI   rJ   r[   rH   r`   r*   r+   rS     s   	
z)FusedAddRMSNormGroupQuantPattern.__init__rc   c                       dt jdt jdt jdtt jt jt jf f fdd}dt jdt jdt jdtt jt jt jf f fdd}t|| j tj| d S )	Nrd   re   r{   r    c                    *     | ||\}} |\}}|||fS r|   rf   rd   re   r{   rg   rj   r[   r;   r*   r+   rh     s   
z:FusedAddRMSNormGroupQuantPattern.register.<locals>.patternc                    sl   | j  jd} tj|  jd} j|  j}t j	|| || j
d | jd  jd
}|d |d |d fS )Nri   r   	rj   rd   re   r[   rG   scale_ubr{   
group_sizeis_scale_transposed   r   rk   rN   r&   r~   rL   rQ   
make_scalerI   r   rO   rG   r   rd   re   r{   rj   r[   rm   r;   r*   r+   rn      s    z>FusedAddRMSNormGroupQuantPattern.register.<locals>.replacementr&   ro   r   rq   rr   rP   rp   rs   r<   rc   rh   rn   r*   r;   r+   ru     s2   z)FusedAddRMSNormGroupQuantPattern.registerTFFr>   r?   r@   rT   r&   r$   r   rC   rS   r   ru   rx   r*   r*   r`   r+   r     s(    r   c                       r   )RMSNormGroupQuantPatternTFrG   rL   r   rW   rI   rJ   r    Nc           	         sB   t tjd|}tdt|||dd}|| _t j||||d d S )NFrZ   r\   rK   )r   r&   r.   r6   r   r   r^   rS   r   r`   r*   r+   rS   C  s   	
z!RMSNormGroupQuantPattern.__init__rc   c                    r   dt jdt jdtt jt jf f fdd}dt jdt jdtt jt jf f fdd}t|| j tj| d S )Nrd   re   r    c                    s"     | |} |\}}||fS r|   rf   )rd   re   rg   rj   r[   r;   r*   r+   rh   W  s   z2RMSNormGroupQuantPattern.register.<locals>.patternc                    sl   | j  jd} tj|  jd} jj|  jjd}t j	|| || j
d d  jd  jjd
}|d |d fS )Nri   )
transposedr   r   r   r   rd   re   rj   r[   rm   r;   r*   r+   rn   ^  s$   z6RMSNormGroupQuantPattern.register.<locals>.replacementr   r   r*   r;   r+   ru   V  s*   z!RMSNormGroupQuantPattern.registerr   r   r*   r*   r`   r+   r   B  s(    r   c                       N   e Zd Zejdfdedejdededdf
 fdd	Z	d
e
ddfddZ  ZS )RMSNormDynamicQuantPatternTrG   rL   r   rW   r    Nc                    s6   t tjd|}tdt|||dd}t || d S rY   r   r&   r.   r6   r   r^   rS   r<   rG   rL   r   rW   r[   rH   r`   r*   r+   rS        z#RMSNormDynamicQuantPattern.__init__rc   c                    r   )Nrd   re   r    c                    s     | |} |S r|   rf   )rd   re   rg   r;   r*   r+   rh     s   
z4RMSNormDynamicQuantPattern.register.<locals>.patternc              
      sV   | j  jd} tj|  jd} j| }t j|| || j	d d d}|d |d fS )Nri   rj   rd   re   r[   rG   r   r{   r   r   
rk   rN   r&   r~   rL   rQ   r   r   rO   rG   r   r;   r*   r+   rn     s   z8RMSNormDynamicQuantPattern.register.<locals>.replacementr   r   r*   r;   r+   ru     s*   z#RMSNormDynamicQuantPattern.registerr>   r?   r@   r   	PER_TOKENrT   r&   r$   rC   rS   r   ru   rx   r*   r*   r`   r+   r         r   c                       r   )"FusedAddRMSNormDynamicQuantPatternTrG   rL   r   rW   r    Nc                    s6   t tjd|}tdt|||dd}t || d S )NFTrZ   r\   r   r   r`   r*   r+   rS     r   z+FusedAddRMSNormDynamicQuantPattern.__init__rc   c                    r   )	Nrd   re   r{   r    c                    r   r|   rf   r   r;   r*   r+   rh     s   
z<FusedAddRMSNormDynamicQuantPattern.register.<locals>.patternc              
      s\   | j  jd} tj|  jd} j| }t j|| || j	d |d}|d |d |d fS )Nri   r   r   r   r   r   r   r;   r*   r+   rn     s   z@FusedAddRMSNormDynamicQuantPattern.register.<locals>.replacementr   r   r*   r;   r+   ru     s2   z+FusedAddRMSNormDynamicQuantPattern.registerr   r*   r*   r`   r+   r     r   r   c                       sV   e Zd ZdZededdf fddZejde	j
ddfdd	Zdefd
dZ  ZS )RMSNormQuantFusionPassz~
    This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op.
    It also supports fused_add_rms_norm.
    rR   r    Nc                    s   t  | tdd| _dD ][}t|t| j t|t| j t|t| j t	|t| j t
 ritddtddfD ]&}dD ]!}dD ]}t|t|||d| j t|t|||d| j qJqFqBq| || j d S )	Nrmsnorm_quant_fusion_pass)	pass_name)gh㈵>gư>r      @   )TF)r   rI   rJ   )r^   rS   r   patternsrz   	FP8_DTYPEru   rV   r   r   r   is_cudar   r   r   dump_patterns)r<   rR   rG   r   rI   rJ   r`   r*   r+   rS     sJ   


zRMSNormQuantFusionPass.__init__graphc                 C   s    | j || _td| j d S )NzReplaced %s patterns)r   applymatched_countloggerdebug)r<   r   r*   r*   r+   __call__+  s   zRMSNormQuantFusionPass.__call__c              
   C   s   |  | tttttttS r|   )hash_sourcer   rF   rV   r   rz   r   r   r;   r*   r*   r+   uuid0  s   zRMSNormQuantFusionPass.uuid)r>   r?   r@   rA   r   r   rS   r   time_and_logr   Graphr   rD   r   rx   r*   r*   r`   r+   r     s    3r   )Stypingr   r   r&   torch._inductor.pattern_matcher	_inductorpattern_matcherrq   r   *torch._higher_order_ops.auto_functionalizer   r   
torch._opsr   vllm.configr   r	   vllm.loggerr
   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   r   r   r   r   r   r   r   vllm.platformsr   inductor_passr   vllm_inductor_passr   r   matcher_utilsr   r   r   r>   r   	fp8_dtyper   uint8	FP4_DTYPEro   r,   r/   r1   r3   ops_Crms_normdefaultRMS_OPfused_add_rms_norm
RMS_ADD_OPstatic_scaled_fp8_quantdynamic_scaled_fp8_quant"dynamic_per_token_scaled_fp8_quantr4   dictrB   r   hasattrr5   per_token_group_fp8_quantr6   rms_norm_static_fp8_quant#fused_add_rms_norm_static_fp8_quant rms_norm_dynamic_per_token_quantrms_norm_per_block_quantrE   rF   rV   rz   r   r   r   r   r   r*   r*   r*   r+   <module>   s   0







4?@@78