o
    -i;                     @   s  U d dl mZmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dl m!Z! ej"j#j$j%Z&ej"j#j'j%Z(ej"j#j)j%Z*ej"j+j,j%Z-eej"j#j.j%eej"j#j/j%eej"j#j0j%iZ1e2ee	f e3d< e!4 re5ej"j#drej"j#j6j%e1e< e!4 rej"j#j7j%e1e< ej"j#j7j%e1e< ej"j#j8j%Z9G dd deZ:G dd de:Z;G dd de:Z<G dd de:Z=G dd de:Z>G dd de:Z?dS )    )ABCabstractmethod)AnyN)auto_functionalized)
OpOverload)rocm_aiter_ops)get_current_vllm_config)
SiluAndMul)RMSNorm)QuantFP8)	
GroupShapeQuantKey_normalize_quant_group_shapekFp8Dynamic64SymkFp8Dynamic128SymkFp8DynamicTensorSymkFp8DynamicTokenSymkFp8StaticTensorSymkNvfp4Dynamic)RotaryEmbedding)current_platform	QUANT_OPSscaled_fp4_quantc                   @   s   e Zd ZdeddfddZedededefdd	Zedededefd
dZdededefddZ	dedede
jfddZdedede
jfddZdedede
jfddZdee
j fddZdS )MatcherCustomOpenabledreturnNc                 C   sP   t  }|jr
|jjnd | _|jr|jjnd | _|| _|r"| j| _	d S | j| _	d S N)
r   model_configdtypemodel_dtypedevice_configdevicer   forward_customforward_nativeforward)selfr   config r'   [/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/compilation/matcher_utils.py__init__3   s
   zMatcherCustomOp.__init__argskwargsc                 O      d S r   r'   r%   r*   r+   r'   r'   r(   r"   ;      zMatcherCustomOp.forward_customc                 O   r,   r   r'   r-   r'   r'   r(   r#   ?   r.   zMatcherCustomOp.forward_nativec                 O   s   | j |i |S r   )r$   r-   r'   r'   r(   __call__C   s   zMatcherCustomOp.__call__c                 O   s   t j|| j| jd|S Nr   r!   )torchemptyr   r!   r-   r'   r'   r(   r3   F      zMatcherCustomOp.emptyc                 O      t j|t j| jd|S r0   )r2   r3   int64r!   r-   r'   r'   r(   empty_int64I   r4   zMatcherCustomOp.empty_int64c                 O   r5   r0   )r2   r3   float32r!   r-   r'   r'   r(   	empty_f32L   r4   zMatcherCustomOp.empty_f32c                 C   s   t )z!Utility for inputs to the pattern)NotImplementedError)r%   r'   r'   r(   inputsO   r.   zMatcherCustomOp.inputs)__name__
__module____qualname__boolr)   r   r   r"   r#   r/   r2   Tensorr3   r7   r9   listr;   r'   r'   r'   r(   r   2   s    r   c                       s   e Zd Z		ddededededededB d	df fd
dZd	eej fddZ	dejdejdejdB dejd	e
ejejdB f f
ddZdejdejdejdB dejd	e
ejejdB f f
ddZ  ZS )MatcherRotaryEmbeddingFNis_neox	head_size	num_headsnum_kv_headsuse_flashinferr   r   c                    sn   |d u rt  }t | || _|| _|| _|| _| j| j | _| j| j | _	|| _
|r2t| _d S t| _d S r   )r   r   superr)   rC   rD   rE   rF   q_sizekv_size
rotary_dimFLASHINFER_ROTARY_OP	rotary_op	ROTARY_OP)r%   rC   rD   rE   rF   rG   r   	__class__r'   r(   r)   U   s   	

zMatcherRotaryEmbedding.__init__c                 C   s@   |  d}| d| j}| d| j}| d| j}||||gS )N   i   )r7   r3   rI   rJ   rK   )r%   	positionsquerykeycos_sin_cacher'   r'   r(   r;   n   s
   
zMatcherRotaryEmbedding.inputsrR   rS   rT   rU   c              	   C   sD   t | j|||| j|| jd}|d }t|dkr|d nd }||fS )N)rR   rS   rT   rD   rU   rC         )r   rM   rD   rC   len)r%   rR   rS   rT   rU   result	query_outkey_outr'   r'   r(   r"   u   s   	z%MatcherRotaryEmbedding.forward_customc              	   C   s    t |||| j| j|| j}|S r   )r   forward_staticrD   rK   rC   )r%   rR   rS   rT   rU   rY   r'   r'   r(   r#      s   z%MatcherRotaryEmbedding.forward_native)FN)r<   r=   r>   r?   intr)   rA   r2   r@   r;   tupler"   r#   __classcell__r'   r'   rO   r(   rB   T   sR    
rB   c                	       s   e Zd Z		ddededB deddf fddZdeej fd	d
Z	dejdejdejfddZ
dejdejdejfddZdejdejdejfddZ  ZS )MatcherRMSNormNFepsilonr   match_rocm_aiterr   c                    sD   |d u rt  }t | || _t| _|| _|r t	 | _d S d S r   )
r
   r   rH   r)   ra   RMS_OP_rmsnorm_oprb   r   get_rmsnorm_opr%   ra   r   rb   rO   r'   r(   r)      s   zMatcherRMSNorm.__init__c                 C   s0   | j r	| ddn| dd}| d}||gS NrQ      r   r3   r9   r%   inputweightr'   r'   r(   r;      s   
zMatcherRMSNorm.inputsrk   rl   c                 C   s   | j ||| jdS )N)xrl   variance_epsilonrd   ra   rj   r'   r'   r(   forward_rocm_aiter   s
   z!MatcherRMSNorm.forward_rocm_aiterc                 C   s:   | j r	| ||S t|}t| j|||| jd\}}|S )N)rY   rk   rl   ra   )rb   rp   r2   
empty_liker   rd   ra   )r%   rk   rl   rY   _r'   r'   r(   r"      s   

zMatcherRMSNorm.forward_customc                 C   s   t || j|d| j|S Nr
   r\   ra   sizer   rj   r'   r'   r(   r#      s   zMatcherRMSNorm.forward_nativeNF)r<   r=   r>   floatr?   r)   rA   r2   r@   r;   rp   r"   r#   r_   r'   r'   rO   r(   r`      sD    

r`   c                
       s   e Zd Z		ddededB deddf fddZdeej fd	d
Z	dejdejdejde
ejejf fddZdejdejdejde
ejejf fddZdejdejdejde
ejejf fddZ  ZS )MatcherFusedAddRMSNormNFra   r   rb   r   c                    sD   |d u rt  }t | || _|| _t| _|r t	 | _d S d S r   )
r
   r   rH   r)   ra   rb   
RMS_ADD_OPrd   r   get_rmsnorm_fused_add_oprf   rO   r'   r(   r)      s   zMatcherFusedAddRMSNorm.__init__c                 C   s>   | j r	| ddn| dd}| d}| dd}|||gS rg   ri   r%   rk   rl   residualr'   r'   r(   r;      s   

zMatcherFusedAddRMSNorm.inputsrk   rl   r}   c                 C   s   | j |||| jdS )N)rm   r}   rl   rn   ro   r|   r'   r'   r(   rp      s   
z)MatcherFusedAddRMSNorm.forward_rocm_aiterc                 C   s8   | j r
| |||S t| j|||| jd\}}}||fS )N)rk   r}   rl   ra   )rb   rp   r   rd   ra   )r%   rk   rl   r}   rr   rY   r'   r'   r(   r"      s   z%MatcherFusedAddRMSNorm.forward_customc                 C   s"   t || j|d| j||}|S rs   ru   )r%   rk   rl   r}   rY   r'   r'   r(   r#     s   z%MatcherFusedAddRMSNorm.forward_nativerw   )r<   r=   r>   rx   r?   r)   rA   r2   r@   r;   r^   rp   r"   r#   r_   r'   r'   rO   r(   ry      sP    


ry   c                       s   e Zd Z				ddededB dedededdf fd	d
Z	ddejdejdB deejejf fddZ		ddejdejdB deejejf fddZ
	ddejdejdB deejejf fddZddejdedejfddZdeej fddZ  ZS )MatcherQuantFP8NF	quant_keyr   has_col_major_scalesis_e8m0rb   r   c                    s  |d u rt  }t | || _|| _|| _|| _|rO|jj	
 r&J d|jj	 r2t | _n?|jj	jdks=J dt rGt | _n*tjjjj| _n"|tv sZJ d| t| | _|jt ksjJ d|jd u sqJ t |jj|jj	||dd| _d S )Nz?ROCm aiter fusion pass does not support per tensor quantization   zTROCm aiter fusion pass currently supports quantization operation with group_size 128z unsupported quantization scheme zOnly QuantFP8 supported byF)column_major_scales	use_ue8m0compile_native)r   r   rH   r)   r   r   r   rb   scalegroup_shapeis_per_tensoris_per_tokenr   get_per_token_quant_opQUANT_OPcolr   is_fp8_fnuzget_group_quant_opr2   opsvllm triton_per_token_group_quant_fp8defaultr   r   	fp8_dtypescale2static	quant_fp8)r%   r   r   r   r   rb   rO   r'   r(   r)      sF   


zMatcherQuantFP8.__init__rk   r   c                 C   s6   | j jj}|tjkr| j|| j j|dS | ||jS )N)rm   quant_dtyper   )r   r   r   r   	PER_TOKENr   r   r   )r%   rk   r   quant_key_group_shaper'   r'   r(   rp   V  s   

z"MatcherQuantFP8.forward_rocm_aiterc                 C   s  | j r	| ||S tj|j|j| jjd}| jjj	
 rR|d u s"J | j|| jd}t| jj}|j}|j}t| j|||| jjj	d d||| jd	\}}}||fS | jjjrl|d us]J t| j|||d\}}||fS |d u srJ | |}t| j|||d d\}}}||fS )Nr!   r   )
transposedrV   g|=)rk   output_qoutput_s
group_sizeepsfp8_minfp8_maxscale_ue8m0)rY   rk   r   )rY   rk   r   scale_ub)rb   rp   r2   r3   shaper!   r   r   r   r   is_per_group
make_scaler   finfominmaxr   r   r   r   )r%   rk   r   rY   r   r   r   rr   r'   r'   r(   r"   e  sF   



zMatcherQuantFP8.forward_customc                 C   s   |  ||S r   )r   )r%   rk   r   r'   r'   r(   r#     s   zMatcherQuantFP8.forward_nativer   c                 C   st   t || jjj}|jd |d  |jd |d  f}|r0tt|}tj||j	tj
dddS tj||j	tj
dS )Nr   rV   r   rt   )r   r   r   r   r   r^   reversedr2   r3   r!   r8   permute)r%   rk   r   normalized_group_shapescale_shaper'   r'   r(   r     s   

zMatcherQuantFP8.make_scalec                 C   s,   |  dd}| jjjr|| ddgS |gS )NrQ   rh   rV   )r3   r   r   r   r9   r%   rk   r'   r'   r(   r;     s   
zMatcherQuantFP8.inputs)NFFFr   )F)r<   r=   r>   r   r?   r)   r2   r@   r^   rp   r"   r#   r   rA   r;   r_   r'   r'   rO   r(   r~     sX    9

2
r~   c                       sl   e Zd ZddedB ddf fddZdeej fddZdejdejfd	d
Z	dejdejfddZ
  ZS )MatcherSiluAndMulNr   r   c                    s    |d u rt  }t | d S r   )r	   r   rH   r)   )r%   r   rO   r'   r(   r)     s   zMatcherSiluAndMul.__init__c                 C   s   |  dd}|gS )NrQ      )r3   r   r'   r'   r(   r;     s   zMatcherSiluAndMul.inputsrm   c                 C   sL   |j d d }|j d d |f }tj||j|jd}tt||d}|d S )Nrt   rW   r1   )rY   rk   rV   )r   r2   r3   r   r!   r   SILU_MUL_OP)r%   rm   doutput_shapeoutrY   r'   r'   r(   r"     s
   z MatcherSiluAndMul.forward_customc                 C   s
   t |S r   )r	   r#   )r%   rm   r'   r'   r(   r#     s   
z MatcherSiluAndMul.forward_nativer   )r<   r=   r>   r?   r)   rA   r2   r@   r;   r"   r#   r_   r'   r'   rO   r(   r     s    

r   )@abcr   r   typingr   r2   torch._higher_order_opsr   
torch._opsr   vllm._aiter_opsr   vllm.configr   %vllm.model_executor.layers.activationr	   $vllm.model_executor.layers.layernormr
   7vllm.model_executor.layers.quantization.input_quant_fp8r   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   r   r   r   r   r   r   +vllm.model_executor.layers.rotary_embeddingr   vllm.platformsr   r   _Crms_normr   rc   fused_add_rms_normrz   rotary_embeddingrN   r   flashinfer_rotary_embeddingrL   static_scaled_fp8_quantdynamic_scaled_fp8_quant"dynamic_per_token_scaled_fp8_quantr   dict__annotations__is_cudahasattrr   per_token_group_fp8_quantsilu_and_mulr   r   rB   r`   ry   r~   r   r'   r'   r'   r(   <module>   sD   ,"J?B 