o
    -i1                     @   s  d dl mZmZ d dlmZ d dlmZmZ d dlZd dl	m
  mZ d dlmZ d dlmZ d dl	mZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ee.Z/edZ0e1 Z2ej3Z4ej5j6j7j8Z9ej5j:j;j8Z<G dd deZ=G dd de=Z>G dd de=Z?G dd de-Z@dS )    )ABCabstractmethod)Callable)Any	ParamSpecN)fx)auto_functionalized)PatternMatcherPass)	Attention)
VllmConfigget_layers_from_vllm_config)init_logger)QuantKeykNvfp4DynamickStaticTensorScale)current_platform)round_up   )	QUANT_OPS
empty_bf16
empty_fp32	empty_i32)is_func)enable_fake_mode)MatcherQuantFP8)VllmInductorPassVllmPatternMatcherPassPc                	   @   s   e Zd ZdZdededejddfddZd	e	d
e	dej
fddZd	e	d
e	dej
fddZedeeejf deejgdf deeejf fddZedejjddfddZedejjddfddZdeddfddZededdfddZdS )AttentionQuantPatternzQ
    The base class for Attn+Quant fusions.
    Should not be used directly.
    layer	quant_keydtypereturnNc                 C   s\   || _ |j| _|j| _|j| _|| _|j| _|| _| jtv s&J d| j t| j | _d S )Nz unsupported quantization scheme )	r   
layer_name	num_heads	head_sizer    r!   quant_dtyper   QUANT_OP)selfr   r    r!    r)   Y/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/compilation/fusion_attn.py__init__.   s   
zAttentionQuantPattern.__init__argskwargsc                 O       | j dd|}tj|i |S Ncudar!   device)r!   torchemptyr(   r,   r-   r)   r)   r*   r4   A      zAttentionQuantPattern.emptyc                 O   r.   r/   )r&   r3   r4   r5   r)   r)   r*   empty_quantE   r6   z!AttentionQuantPattern.empty_quanttrace_fnprocess_fx_fnsc                    s&   dt jdt jdtjf fdd}|S )Nr,   r-   r"   c                     s$   | i |} D ]}|| q	|S Nr)   )r,   r-   gm
process_fxr9   r8   r)   r*   wrappedN   s   
z4AttentionQuantPattern.wrap_trace_fn.<locals>.wrapped)r   r,   r-   r   GraphModule)r8   r9   r>   r)   r=   r*   wrap_trace_fnI   s   "z#AttentionQuantPattern.wrap_trace_fnr;   c                 C   s   ddl m} ||  d S )Nr   )view_to_reshape)#torch._inductor.fx_passes.post_gradrA   )r;   rA   r)   r)   r*   fx_view_to_reshapeW   s   z(AttentionQuantPattern.fx_view_to_reshapec                 C   sd   | j jD ]+}t|tjjjjsq|jd }t	dd t
|D r!q||jd  | j | qd S )Nr   c                 s   s    | ]	\}}||kV  qd S r:   r)   ).0idimr)   r)   r*   	<genexpr>d   s    z=AttentionQuantPattern.remove_noop_permutes.<locals>.<genexpr>r   )graphnodesr   r3   opsatenpermutedefaultr,   any	enumeratereplace_all_uses_with
erase_node)r;   nodedimsr)   r)   r*   remove_noop_permutes]   s   
z*AttentionQuantPattern.remove_noop_permutespm_passc                 C   s"   | j j| jr| | d S d S r:   )r   implfused_output_quant_supportedr    	_registerr(   rU   r)   r)   r*   register_if_supportedk   s   z+AttentionQuantPattern.register_if_supportedc                 C   s   t r:   )NotImplementedErrorrY   r)   r)   r*   rX   o   s   zAttentionQuantPattern._register)__name__
__module____qualname____doc__r
   r   r3   r!   r+   r   Tensorr4   r7   staticmethodr   r   r   r?   r@   rC   rT   r	   rZ   r   rX   r)   r)   r)   r*   r   (   s8    
r   c                	       sJ   e Zd ZdZ	ddedejdeddf fdd	Zd
e	ddfddZ
  ZS )AttentionFp8StaticQuantPatterna?  
    Fusion for Attention+Fp8StaticQuant.

    Only triggers when the attention implementation returns True in
    `fused_output_quant_supported()`. If the pattern is found, the
    Fp8StaticQuant op will be removed from the graph, and its scale
    will be passed into Attention op as the `output_scale` argument.
    Tr   r!   	symmetricr"   Nc                    s,   t tt|d}t ||| t|| _d S )N)r!   scalerc   )r   	FP8_DTYPEr   superr+   r   quant_matcher)r(   r   r!   rc   r    	__class__r)   r*   r+   ~   s
   z'AttentionFp8StaticQuantPattern.__init__rU   c                    s   dt jdt jdt jdt jdt jdt jf fdd}dt jdt jdt jdt jdt jdt jf fd	d
} d j j d j j d j j d j jtddg}t|||t	tj
tjtj| d S )Nqkvoutput_attnrd   r"   c              
      sJ   t t| ||| jd d d}t|d | jd  j j g} ||d S )Nquerykeyvalueoutputr#   output_scaleoutput_block_scaler   r   )r   ATTN_OPr#   
RESHAPE_OPshaper$   r%   rg   )rj   rk   rl   rm   rd   at1attn_out_viewr(   r)   r*   pattern   s   
z9AttentionFp8StaticQuantPattern._register.<locals>.patternc              
      s`   t jjjj| jd  j jgd j| j	d}t
t| ||| j|d d}t|d d j j gS )Nr           r1   rn   r   )r3   rJ   rK   fullrM   rw   r$   r%   r&   r2   r   ru   r#   rv   )rj   rk   rl   rm   rd   rx   rz   r)   r*   replacement   s"   

z=AttentionFp8StaticQuantPattern._register.<locals>.replacement   r   )r3   r`   r4   r$   r%   r   pmregister_replacementr   r@   fwd_onlyrC   rT   r(   rU   r{   r   inputsr)   rz   r*   rX      sV   z(AttentionFp8StaticQuantPattern._register)T)r\   r]   r^   r_   r
   r3   r!   boolr+   r	   rX   __classcell__r)   r)   rh   r*   rb   t   s    rb   c                       sB   e Zd ZdZdedejddf fddZdeddfd	d
Z	  Z
S )AttentionNvfp4QuantPatterna7  
    Fusion for Attention+Nvfp4Quant.

    Only triggers when the attention implementation returns True in
    `fused_output_quant_supported()`. If the pattern is found, the
    Nvfp4Quant op will be removed from the graph, and its scale
    will be passed into Attention op as the `output_scale` argument.
    r   r!   r"   Nc                    s   t  |t| d S r:   )rf   r+   r   )r(   r   r!   rh   r)   r*   r+      s   z#AttentionNvfp4QuantPattern.__init__rU   c                    s*  dt jdt jdt jdt jdt jdt jdt jdtt jt jf f fd	d
}dt jdt jdt jdt jdt jdt jdt jdtt jt jf f fdd}td j jtd j jtd j jtd j j d j j d tdt j j d dt	ddg}t
|||tt
jtjtj| d S )Nrj   rk   rl   rm   output_quantrs   input_scaler"   c              
      sr   t t| ||| jd d d}t|d | jd  j j g}t  j||||dd}	tj	j
j|	d t}
|	d |
fS )Nrn   r   r   T)rr   inputrs   r   is_sf_swizzled_layout   )r   ru   r#   rv   rw   r$   r%   r'   r3   rJ   rK   viewr!   re   )rj   rk   rl   rm   r   rs   r   rx   ry   at2output_scale_viewrz   r)   r*   r{      s.   	
z5AttentionNvfp4QuantPattern._register.<locals>.patternc           
   
      s   t jjjj| jd  j jd gd j| j	d}t jjj
|t}tt| ||| j||d}t|d d j j d g}	|	|d fS )Nr   r   r|   r1   rn   r   r}   )r3   rJ   rK   r~   rM   rw   r$   r%   r&   r2   r   r!   re   r   ru   r#   rv   )
rj   rk   rl   rm   r   rs   r   r   r   rr   rz   r)   r*   r      s&   


z9AttentionNvfp4QuantPattern._register.<locals>.replacementr   r            r   )r3   r`   tupler   r$   r%   r7   r   r   r   r   r   r   r@   r   rC   rT   r   r)   rz   r*   rX      sn   ! z$AttentionNvfp4QuantPattern._register)r\   r]   r^   r_   r
   r3   r!   r+   r	   rX   r   r)   r)   rh   r*   r      s    	r   c                       sZ   e Zd ZdZededdf fddZejde	j
jjddfdd	Zdefd
dZ  ZS )AttnFusionPassa  
    This pass fuses post-attention quantization onto attention if supported.

    It uses the pattern matcher and matches each layer manually, as strings
    cannot be wildcarded. This also lets us check support on attention layers
    upon registration instead of during pattern matching.

    Currently, only static fp8 quant is supported, but patterns could easily be
    added for other quant schemes and dtypes. The bigger hurdle for wider
    support are attention kernels, which need to support fusing output quant.
    configr"   Nc                    s   t  | tdd| _t|t}| D ])\}}t||jj	}|
| j t r>ttjjdr>t||jj	}|
| j qt|dkrJtd | || j d S )Nattn_fusion_pass)	pass_namescaled_fp4_quantr   zAttention + quant fusion is enabled, but no attention layers were found in CompilationConfig.static_forward_context so no fusion patterns were registered.)rf   r+   r	   patternsr   r
   itemsrb   model_configr!   rZ   r   is_cudahasattrr3   rJ   _Cr   lenloggerwarningdump_patterns)r(   r   attn_layersr#   r   pattern_fp8pattern_nvfp4rh   r)   r*   r+   E  s&   
zAttnFusionPass.__init__rH   c                 C   s    | j || _td| j d S )Nz#Fused quant onto %s attention nodes)r   applymatched_countr   debug)r(   rH   r)   r)   r*   __call__a  s   zAttnFusionPass.__call__c                 C   s   t | tttS r:   )r   hash_sourcer   rb   r   rz   r)   r)   r*   uuidf  s   zAttnFusionPass.uuid)r\   r]   r^   r_   r   r   r+   r   time_and_logr3   r   rH   Graphr   strr   r   r)   r)   rh   r*   r   8  s    r   )Aabcr   r   collections.abcr   typingr   r   r3   torch._inductor.pattern_matcher	_inductorpattern_matcherr   r   *torch._higher_order_ops.auto_functionalizer   r	   vllm.attention.layerr
   vllm.configr   r   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   vllm.platformsr   vllm.utils.math_utilsr   fusionr   r   r   r   fx_utilsr   inductor_passr   matcher_utilsr   vllm_inductor_passr   r   r\   r   r   	fp8_dtypere   uint8	FP4_DTYPErJ   vllmunified_attention_with_outputrM   ru   rK   reshaperv   r   rb   r   r   r)   r)   r)   r*   <module>   s:   L]g