o
    -iN                     @   s.  U d dl mZmZ d dlZd dlm  mZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZmZmZmZmZmZ d d
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ee)Z*e+ Z,ej-Z.dededej/fddZ0dededej/fddZ1dededej/fddZ2dededej/fddZ3ej4j5j6j7Z8ej4j5j9j7Z:eej4j5j;j7eej4j5j<j7eej4j5j=j7iZ>e?eef e@d< eA reBej4j5drej4j5jCj7e>e< eA rej4j5jDj7e>e< ej4j5jDj7e>e< G dd deZEeEedej4j5jFj7eEedej4j5jGj7eEedej4j5jHj7eEedej4j5jHj7eEedej4j5jIj7eEedej4j5jIj7eEedej4j5jIj7eEedej4j5jIj7iZJe?eEef e@d < G d!d" d"ZKG d#d$ d$eKZLG d%d& d&eKZMG d'd( d(eKZNG d)d* d*eKZOG d+d, d,eKZPG d-d. d.eKZQG d/d0 d0e(ZRdS )1    )Any
NamedTupleN)fx)auto_functionalized)PatternMatcherPass)
OpOverload)
VllmConfigget_current_vllm_config)init_logger)

GroupShapeQuantKey	ScaleDesckFp8Dynamic64SymkFp8Dynamic128SymkFp8DynamicTensorSymkFp8DynamicTokenSymkFp8StaticTensorSymkNvfp4DynamickStaticTensorScale)current_platform   )enable_fake_mode)MatcherFusedAddRMSNormMatcherQuantFP8MatcherRMSNorm)VllmInductorPassVllmPatternMatcherPassargskwargsreturnc                  O      t j| i |t jddS Ncuda)dtypedevice)torchemptybfloat16r   r    r)   T/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/compilation/fusion.py
empty_bf16)      r+   c                  O   r    r!   )r%   r&   float32r(   r)   r)   r*   
empty_fp32-   r,   r.   c                  O   r    r!   )r%   r&   int32r(   r)   r)   r*   	empty_i321   r,   r0   c                  O   r    r!   )r%   r&   int64r(   r)   r)   r*   	empty_i645   r,   r2   	QUANT_OPSscaled_fp4_quantc                   @   s0   e Zd ZU dZeed< eed< defddZdS )FusedRMSQuantKeyz
    Named tuple for identifying the type of RMSNorm + quant fusion.
    quant: type of quantization
    fused_add: does the op also perform the residual add
    quant	fused_addr   c                 C   s$   d| j  d| jrd dS d dS )NzFusedQuantKey(z, with outz
 residual))r6   r7   selfr)   r)   r*   __str__R   s   
zFusedRMSQuantKey.__str__N)	__name__
__module____qualname____doc__r   __annotations__boolstrr<   r)   r)   r)   r*   r5   H   s
   
 r5   FT	FUSED_OPSc                   @   s0   e Zd Z		d
dededededdf
dd	ZdS )RMSNormQuantPatternFepsilonkeyhas_col_major_scalesis_e8m0r   Nc                 C   sx   || _ |jj| _t }|jr|jjnd | _|tv s J d| t| | _|j	s,t
|nt|| _t|j||d| _d S )Nz'unsupported fused rmsnorm+quant op for rH   rI   )rF   r6   r#   quant_dtyper	   model_configmodel_dtyperD   FUSED_OPr7   r   r   rmsnorm_matcherr   quant_matcher)r;   rF   rG   rH   rI   configr)   r)   r*   __init__v   s   


zRMSNormQuantPattern.__init__)FF)r=   r>   r?   floatr5   rB   rR   r)   r)   r)   r*   rE   u   s    rE   c                	       F   e Zd Z	ddedejdeddf fddZd	eddfd
dZ	  Z
S )RMSNormStaticQuantPatternTrF   rK   	symmetricr   Nc                    (   t dt|t|dd}t || d S NFr#   scalerV   r7   r6   r5   r   r   superrR   )r;   rF   rK   rV   	fused_key	__class__r)   r*   rR         z"RMSNormStaticQuantPattern.__init__pm_passc                    s   dt jdt jdt jdt jf fdd}dt jdt jdt jdt jf fdd}g  j  j d	 }||  t|||tj| d S )
NinputweightrZ   r   c                    s     | |} ||d S )Nr   rO   rP   )rc   rd   rZ   
result_rmsr:   r)   r*   pattern   s   z3RMSNormStaticQuantPattern.register.<locals>.patternc                    sD   | j  jd} tj| j| j jd}t j|| || j	d}|d S )Nr#   )r$   r#   )resultrc   rd   rZ   rF   r   )
torM   r%   r&   shaper$   rK   r   rN   rF   )rc   rd   rZ   ri   atr:   r)   r*   replacement   s   
z7RMSNormStaticQuantPattern.register.<locals>.replacementr   )r%   TensorrO   inputsrP   pmregister_replacementfwd_onlyr;   rb   rg   rm   ro   r)   r:   r*   register   s2   z"RMSNormStaticQuantPattern.registerTr=   r>   r?   rS   r%   r#   rB   rR   r   rt   __classcell__r)   r)   r_   r*   rU          rU   c                	       rT   )!FusedAddRMSNormStaticQuantPatternTrF   rK   rV   r   Nc                    rW   )NTrY   r[   r\   )r;   rF   rK   rV   rG   r_   r)   r*   rR      ra   z*FusedAddRMSNormStaticQuantPattern.__init__rb   c                    s   dt jdt jdt jdt jdtt jt jf f
 fdd}dt jdt jdt jdt jdtt jt jf f
 fdd	}g  j  j d
 }t|||tj| d S )Nrc   rd   residualrZ   r   c                    s*     | ||\}} ||\}}||fS Nre   )rc   rd   rz   rZ   rf   ri   _r:   r)   r*   rg      s   z;FusedAddRMSNormStaticQuantPattern.register.<locals>.patternc              	      sH   | j  jd} tj|  jd}t j|| ||| jd}|d |d fS )Nrh   )ri   rc   rz   rd   rZ   rF   r      )rj   rM   r%   
empty_likerK   r   rN   rF   )rc   rd   rz   rZ   ri   rl   r:   r)   r*   rm      s   z?FusedAddRMSNormStaticQuantPattern.register.<locals>.replacementr   )	r%   rn   tuplerO   ro   rP   rp   rq   rr   rs   r)   r:   r*   rt      sD   z*FusedAddRMSNormStaticQuantPattern.registerru   rv   r)   r)   r_   r*   ry      rx   ry   c                       V   e Zd Z			ddedejdedededed	d
f fddZde	d	d
fddZ
  ZS ) FusedAddRMSNormGroupQuantPatternTFrF   rK   group_shaperV   rH   rI   r   Nc           	         sN   t tjd|}tdt|||dd}|| _|| _|| _t j	||||d d S )NFTrY   r[   rJ   )
r   r%   r-   r5   r   r   rH   rI   r]   rR   	r;   rF   rK   r   rV   rH   rI   rZ   rG   r_   r)   r*   rR     s   	
z)FusedAddRMSNormGroupQuantPattern.__init__rb   c                       dt jdt jdt jdtt jt jt jf f fdd}dt jdt jdt jdtt jt jt jf f fdd}t|| j tj| d S )	Nrc   rd   rz   r   c                    *     | ||\}} |\}}|||fS r{   re   rc   rd   rz   rf   ri   rZ   r:   r)   r*   rg     s   
z:FusedAddRMSNormGroupQuantPattern.register.<locals>.patternc                    sl   | j  jd} tj|  jd} j|  j}t j	|| || j
d | jd  jd
}|d |d |d fS )Nrh   r   	ri   rc   rd   rZ   rF   scale_ubrz   
group_sizeis_scale_transposed   r}   rj   rM   r%   r~   rK   rP   
make_scalerH   r   rN   rF   r   rc   rd   rz   ri   rZ   rl   r:   r)   r*   rm      s    z>FusedAddRMSNormGroupQuantPattern.register.<locals>.replacementr%   rn   r   rp   rq   rO   ro   rr   r;   rb   rg   rm   r)   r:   r*   rt     s2   z)FusedAddRMSNormGroupQuantPattern.registerTFFr=   r>   r?   rS   r%   r#   r   rB   rR   r   rt   rw   r)   r)   r_   r*   r     s(    r   c                       r   )RMSNormGroupQuantPatternTFrF   rK   r   rV   rH   rI   r   Nc           	         sB   t tjd|}tdt|||dd}|| _t j||||d d S )NFrY   r[   rJ   )r   r%   r-   r5   r   r   r]   rR   r   r_   r)   r*   rR   C  s   	
z!RMSNormGroupQuantPattern.__init__rb   c                    r   dt jdt jdtt jt jf f fdd}dt jdt jdtt jt jf f fdd}t|| j tj| d S )Nrc   rd   r   c                    s"     | |} |\}}||fS r{   re   )rc   rd   rf   ri   rZ   r:   r)   r*   rg   W  s   z2RMSNormGroupQuantPattern.register.<locals>.patternc                    sl   | j  jd} tj|  jd} jj|  jjd}t j	|| || j
d d  jd  jjd
}|d |d fS )Nrh   )
transposedr   r   r}   r   rc   rd   ri   rZ   rl   r:   r)   r*   rm   ^  s$   z6RMSNormGroupQuantPattern.register.<locals>.replacementr   r   r)   r:   r*   rt   V  s*   z!RMSNormGroupQuantPattern.registerr   r   r)   r)   r_   r*   r   B  s(    r   c                       N   e Zd Zejdfdedejdededdf
 fdd	Z	d
e
ddfddZ  ZS )RMSNormDynamicQuantPatternTrF   rK   r   rV   r   Nc                    s6   t tjd|}tdt|||dd}t || d S rX   r   r%   r-   r5   r   r]   rR   r;   rF   rK   r   rV   rZ   rG   r_   r)   r*   rR        z#RMSNormDynamicQuantPattern.__init__rb   c                    r   )Nrc   rd   r   c                    s     | |} |S r{   re   )rc   rd   rf   r:   r)   r*   rg     s   
z4RMSNormDynamicQuantPattern.register.<locals>.patternc              
      sV   | j  jd} tj|  jd} j| }t j|| || j	d d d}|d |d fS )Nrh   ri   rc   rd   rZ   rF   r   rz   r   r}   
rj   rM   r%   r~   rK   rP   r   r   rN   rF   r   r:   r)   r*   rm     s   z8RMSNormDynamicQuantPattern.register.<locals>.replacementr   r   r)   r:   r*   rt     s*   z#RMSNormDynamicQuantPattern.registerr=   r>   r?   r   	PER_TOKENrS   r%   r#   rB   rR   r   rt   rw   r)   r)   r_   r*   r         r   c                       r   )"FusedAddRMSNormDynamicQuantPatternTrF   rK   r   rV   r   Nc                    s6   t tjd|}tdt|||dd}t || d S )NFTrY   r[   r   r   r_   r)   r*   rR     r   z+FusedAddRMSNormDynamicQuantPattern.__init__rb   c                    r   )	Nrc   rd   rz   r   c                    r   r{   re   r   r:   r)   r*   rg     s   
z<FusedAddRMSNormDynamicQuantPattern.register.<locals>.patternc              
      s\   | j  jd} tj|  jd} j| }t j|| || j	d |d}|d |d |d fS )Nrh   r   r   r   r}   r   r   r:   r)   r*   rm     s   z@FusedAddRMSNormDynamicQuantPattern.register.<locals>.replacementr   r   r)   r:   r*   rt     s2   z+FusedAddRMSNormDynamicQuantPattern.registerr   r)   r)   r_   r*   r     r   r   c                       sV   e Zd ZdZededdf fddZejde	j
ddfdd	Zdefd
dZ  ZS )RMSNormQuantFusionPassz~
    This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op.
    It also supports fused_add_rms_norm.
    rQ   r   Nc                    s   t  | tdd| _dD ][}t|t| j t|t| j t|t| j t	|t| j t
 ritddtddfD ]&}dD ]!}dD ]}t|t|||d| j t|t|||d| j qJqFqBq| || j d S )	Nrmsnorm_quant_fusion_pass)	pass_name)gh㈵>gư>r      @   )TF)r   rH   rI   )r]   rR   r   patternsry   	FP8_DTYPErt   rU   r   r   r   is_cudar   r   r   dump_patterns)r;   rQ   rF   r   rH   rI   r_   r)   r*   rR     sJ   


zRMSNormQuantFusionPass.__init__graphc                 C   s    | j || _td| j d S )NzReplaced %s patterns)r   applymatched_countloggerdebug)r;   r   r)   r)   r*   __call__+  s   zRMSNormQuantFusionPass.__call__c              
   C   s   |  | tttttttS r{   )hash_sourcer   rE   rU   r   ry   r   r   r:   r)   r)   r*   uuid0  s   zRMSNormQuantFusionPass.uuid)r=   r>   r?   r@   r   r   rR   r   time_and_logr   Graphr   rC   r   rw   r)   r)   r_   r*   r     s    3r   )Stypingr   r   r%   torch._inductor.pattern_matcher	_inductorpattern_matcherrp   r   *torch._higher_order_ops.auto_functionalizer   r   
torch._opsr   vllm.configr   r	   vllm.loggerr
   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   r   r   r   r   r   r   r   vllm.platformsr   inductor_passr   matcher_utilsr   r   r   vllm_inductor_passr   r   r=   r   	fp8_dtyper   uint8	FP4_DTYPErn   r+   r.   r0   r2   ops_Crms_normdefaultRMS_OPfused_add_rms_norm
RMS_ADD_OPstatic_scaled_fp8_quantdynamic_scaled_fp8_quant"dynamic_per_token_scaled_fp8_quantr3   dictrA   r   hasattrr4   per_token_group_fp8_quantr5   rms_norm_static_fp8_quant#fused_add_rms_norm_static_fp8_quant rms_norm_dynamic_per_token_quantrms_norm_per_block_quantrD   rE   rU   ry   r   r   r   r   r   r)   r)   r)   r*   <module>   s   0







4?@@78