o
    
۾i                     @   s  U d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ eeZd	ed
ef dedeeef deeef fddZejdd ZejeddejdejdejdejdejdejdejdejdejfddZ	dZdejdejd ejdB fd!d"Zejdejdejdejdejdejdejfd#d$Zejd%ejfd&d'Zd[d)ejd*edejfd+d,Zejd%ejfd-d.Z 	/	d\d)ejd*ed0e!d1ej"dB dejf
d2d3Z#d4d5 Z$dd6d7d8Z%dd6d9d:Z&d;d< Z'd=d> Z(dZd?d@Z)d\d1ej"dB fdAdBZ*ejd%ejfdCdDZ+	Ed]d)ejdFejdGe,dejfdHdIZ-	Ed]d)ejdFejdGe,dejfdJdKZ.dZdLdMZ/d/a0da1da2da3da4da5da6dNdO Z7de!fdPdQZ8e8 Z9e!e:dR< de!fdSdTZ;dUedB fdVdWZ<dUedB fdXdYZ=dS )^    N)Callable)Any)init_logger)current_platform)tltriton)is_torch_equal_or_newer)AttentionBackendEnumgrid.kernelargsreturnc              
   C   s   i }|d |d |d }}}|j  d| d| d| d|d< d	|v r<|j  d| d| d| d
|d	 dd
|d< d|v rG|d  }n|d rMdnd}d| | | |d|d  < ||| ||  ||   |d< |S )NMNKz [M=z, N=z, K=]nametiles_per_updatez, tiles_per_update=02c_ptr
FP8_OUTPUT      g       @flops   bytes)r   element_size)r
   r   r   retmnkbytes_per_elem r"   ^/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/batch_invariant.py_matmul_launch_metadata   s   " r$   c           
      C   s>   | | }|| }t || |}|| |  }| | | }	||	fS N)min)
tile_idnum_pid_in_group	num_pid_mGROUP_SIZE_MNUM_SMSgroup_idfirst_pid_mgroup_size_mpid_mpid_nr"   r"   r#   _compute_pid&   s   r1   )launch_metadataBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_Kr*   r+   A_LARGEB_LARGEC_LARGEHAS_BIASc           3      C   s:  t jdd}t ||}t ||}t ||}|| }|| }t d|}|| }t j|||ddD ]f}t|||||\}} || }!| | }"|!t d| }#|"t d| }$|r`|#t j}#|rh|$t j}$t |#|k |#d}#t |$|k |$d}$t 	t 
|#||}#t 	t 
|$||}$t j||ft jd}%t|D ]z}&|s|r|&| t d|t j }'n
|&| t d| }'| |#d d d f | |'d d d f |   }(||'d d d f |	 |$d d d f |
   })t j|(|d d d f ||&|  k dd}*t j|)|d d d f ||&|  k dd}+t |*|+|%}%q||7 }t|||||\}} || t d| },| | t d| }-|rH|,t j},|-t j}-|||,d d d f   ||-d d d f   }.|,d d d f |k |-d d d f |k @ }/|r||- }0t j|0|-|k ddt j}1|%|17 }%|%|jj}2t j|.|2|/d q3d S )	Nr   )axisT)flattendtype        maskotherr@   )r   
program_idcdivarangeranger1   toint64wheremax_contiguousmultiple_ofzerosfloat32loaddotr=   
element_tystore)3a_ptrb_ptrr   bias_ptrr   r   r   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnr3   r4   r5   r*   r+   r6   r7   r8   r9   	start_pidr)   	num_pid_nk_tiles	num_tiles	tile_id_coffs_k_for_maskr(   r'   r/   r0   start_mstart_noffs_amoffs_bnaccumulatorkioffs_ka_ptrsb_ptrsaboffs_cmoffs_cnc_ptrsc_mask	bias_ptrsbiascr"   r"   r#   matmul_kernel_persistent0   sx   
&&
,(rs   rj   rk   rq   c                    sZ  | j d |j d ksJ d| j|jksJ d|d u s&| dks&J dtjdj| j \ }|j \}| j}tj f| j|d} fdd	}tj	d
d
dddddtj
d
ddddddtjd
d
dddddi}t| | ||| || d| d|d|d|d|df|  dk| dk| dk|d ud||  |S )Nr   r   zIncompatible dimensionszIncompatible dtypeszCCurrently assuming bias is 1D, let Horace know if you run into thiscudadevicer=   c                    s(   t t | d t| d  fS )Nr3   r4   )r&   r   rD   )METAr   r   r+   r"   r#   r
      s   zmatmul_persistent.<locals>.grid   @   r      )r3   r4   r5   r*   
num_stages	num_warps               )r+   r6   r7   r8   r9   )shaper=   dimtorchrt   get_device_propertiesmulti_processor_countemptyrv   bfloat16float16rM   rs   stridenumel)rj   rk   rq   r   r=   rr   r
   configsr"   rx   r#   matmul_persistent   sx   





r   c           4      C   s  t d}t d}||krdS t ||}t ||}|| }|| }||ks,||kr.dS || t d| }|| t d| }||k }||k }|sP|sP|r\|t j}|t j}t ||d}t ||d}t t |||}t t |||}| ||  } |||
  }!|||  }"t j	||ft j
d}#t ||}$t d|}%t|$D ]}&|s|r|&| t d|t j }'n
|&| t d| }'| |dddf | |'dddf |	   }(|!|'dddf | |dddf |   })|%||&|  k }*|dddf |*dddf @ }+|*dddf |dddf @ },t j|(|+dd}-t j|)|,dd}.t |-|.|#}#q|}/|}0|rH|/t j}/|0t j}0|"||/dddf   ||0dddf   }1|dddf |dddf @ }2|#|jj}3t j|1|3|2d dS )zBatched GEMM: (B, M, K) x (B, K, N) -> (B, M, N)

    Each program computes one (batch_idx, tile_m, tile_n) tile, accumulating
    along K in a fixed order to preserve batch invariance.
    r   r   Nr<   r>   r?   rB   )r   rC   rD   rE   rG   rH   rI   rJ   rK   rL   rM   rF   rN   rO   r=   rP   rQ   )4rR   rS   r   Br   r   r   	stride_abrU   rV   	stride_bbrW   rX   	stride_cbrY   rZ   r3   r4   r5   r6   r7   r8   pid_bpidr)   r\   r/   r0   offs_moffs_nmask_mmask_na_batch_ptrb_batch_ptrc_batch_ptrre   r]   offs_k_maskrf   rg   rh   ri   k_valida_maskb_maskrj   rk   c_mc_nrn   ro   rr   r"   r"   r#   
bmm_kernel   sx   

&&  , r   
BLOCK_SIZEc              	   C   s\  t dt j}| ||  }|||  }td }	td||D ]$}
|
t d| }||k }t j|| |td d}t t 	||	}	q d}td||D ]+}
|
t d| }||k }t j|| |dd}t 
||	 }|t t ||d7 }qMt |}td||D ]'}
|
t d| }||k }t j|| |d}||	 | }t j|| ||d qdS )zz
    Compute log_softmax along the last dimension of a 2D tensor.
    Each block handles one row of the input tensor.
    r   infr?   r>   rB   N)r   rC   rG   rH   floatrF   rE   rN   maxmaximumexpsumrI   logrQ   )	input_ptr
output_ptrinput_row_strideoutput_row_striden_colsr   row_idxrow_start_ptroutput_row_start_ptrmax_val
col_offsetcol_idxr@   valssum_expexp_valslog_sum_expoutputr"   r"   r#   _log_softmax_kernel_  s0   

r   inputr   c           	      C   s   |dkr|| j d krtd| j}| d| jd }| }|j\}}t|}d}|f}t| |||d|d||d ||S )a*  
    Compute log_softmax using Triton kernel.

    Args:
        input: Input tensor
        dim: Dimension along which to compute log_softmax
             (only -1 or last dim supported)
    >> Stashed changes
    Returns:
        Tensor with log_softmax applied along the specified dimension
    r   r   zFThis implementation only supports log_softmax along the last dimension   r   r   )	ndim
ValueErrorr   reshape
contiguousr   
empty_liker   r   )	r   r   original_shapeinput_2dn_rowsr   r   r   r
   r"   r"   r#   log_softmax  s(   


	r   c                 C   s   t d}||	 }||	 }||ks||	krdS d}td||
D ]+}|t d|
 }||k }|| ||  ||  }t j| | |dd}|t |7 }q|| }|| ||  }t || | dS )z
    Kernel for computing mean along a single dimension.
    Input is viewed as (M, N, K) where N is the dimension being reduced.
    r   Nr>   r?   )r   rC   rF   rE   rN   r   rQ   )r   r   input_stride0input_stride1input_stride2output_stride0output_stride1r   r   r   r   r   m_idxk_idxaccn_start	n_offsetsr@   	input_idxr   mean_val
output_idxr"   r"   r#   mean_kernel  s    
r   Fkeepdimr=   c                 C   s  | j  |  kr| j k sn J d| d| j  d|dk r#|| j  }|du r;| jtjtjtjtjfv r8tj}n| j}| j|krE| |} t	| j
}d}t|D ]}||| 9 }qP|| }d}t|d t|D ]}||| 9 }qh| |||}	|r| }
d|
|< n|d| ||d d  }
tj|
|| jd}|r||d|dn|||}|| f}d}t| |	||	d|	d|	d	|d|j dkr|dnd|||| |S )
a  
    Triton implementation of torch.mean with single dimension reduction.

    Args:
        input: Input tensor
        dim: Single dimension along which to compute mean
        keepdim: Whether to keep the reduced dimension
        dtype: Output dtype. If None, uses input dtype
               (or float32 for integer inputs)

    Returns:
        Tensor with mean values along specified dimension
    zInvalid dimension z for tensor with z dimensionsr   Nr   )r=   rv   r   r   )r   r=   r   int8int16int32rH   rM   rG   listr   rF   lenr   copyr   rv   squeezer   r   )r   r   r   r=   r   r   ir   r   input_3doutput_shaper   	output_2dr
   r   r"   r"   r#   mean_dim  sT   




$
r   c                 C   s
   t | |S r%   r   )rj   rk   r"   r"   r#   mm_batch_invariantV  s   
r   outc                C   s  | j dkr|j dkrt| |}|d ur|| |S |S | j dkr-|j dkr-t| ||dS | j dkr\|j dkr\| j\}}}| d|}t||}|||d}|d urZ|| |S |S | j dkrz|j dkrz| d|jd dd}	t|	||dS | j dkr|j dkr| j\}}
}}|j\}}}}| ||
 ||}|||
 ||}t||}|||
||}|d ur|| |S |S td| j d|j )	Nr   r{   r   r   r      zkmatmul_batch_invariant currently only supports 2D x 2D, 3D x 3D, 3D x 2D, 2D x 3D, and 4D x 4D, got shapes  and )	r   r   copy_bmm_batch_invariantr   r   	unsqueezeexpandr   )rj   rk   r   resultbatchseqhiddena_2d	result_2d
a_expandedheadsseq_adim_a_dim_bseq_ba_3db_3d	result_3dr"   r"   r#   matmul_batch_invariantZ  sL   





r   c                C   s"  | j dkr
|j dkstd| j d|j | jd |jd kr1td| jd  d|jd  d| jd |jd krHtd	| j d|j d| j|jkr[td
| j d|j d| j\}}}|j\}}}| j}|d u r{tj|||f| j|d}	n|j|||fksJ d|j|kr|j| jksJ d|}	tjddddddtjddddddtj	ddddddi}
|
| }|t
||d t
||d  f}t| | ||	||||| d| d| d|d|d|d|	d|	d|	df|  dk| dk|	 dkd| |	S )Nr{   z3bmm_batch_invariant expects 3D tensors, got shapes r   r   z0Batch dimensions of tensors must match, but got .r   r   z.Incompatible inner dimensions for matmul: got zIncompatible dtypes: got ru   zout tensor has incorrect shapezout tensor mismatchry   rz   r   )r3   r4   r5   r|   r}   r~   r   r3   r4   r   )r6   r7   r8   )r   r   r   r=   r   r   rv   r   r   rM   r   rD   r   r   r   )rj   rk   r   r   r   r   r   r   r=   rr   r   cfgr
   r"   r"   r#   r     s   


r   c                 C   s   t ||| dS )N)rq   r   )rq   rj   rk   r"   r"   r#   addmm_batch_invariant  s   r   c                 C   s   |rJ dt | |dS )Nznot implemented)r   )r   )r   r   _half_to_floatr"   r"   r#   _log_softmax_batch_invariant  s   r  c                 C   s:   t j| |dd}| | } t | }t j||dd}|| S )NTr   r   )r   amaxr   r   )r   r   r=   	input_maxexp_x	sum_exp_xr"   r"   r#   softmax_batch_invariant  s
   
r  c                    s   |d u s|t jksJ d|  t j}t|dkr(dd tt jD }t fdd|D dd}|D ]	}t||dd}q7|sM|D ]}||}qE|S )	Nzunsupported dtype: r   c                 S   s   g | ]}|qS r"   r"   ).0r   r"   r"   r#   
<listcomp>  s    z(mean_batch_invariant.<locals>.<listcomp>c                    s   g | ]}| j  qS r"   )r   )r  dr   r"   r#   r	  	  s    T)reverser  )	r   rM   rG   r   rF   r   sortedr   r   )r   r   r   r=   r   sorted_dimsr
  r"   r  r#   mean_batch_invariant  s    r  c              	   C   s^  t dt j}| ||  }	|||  }
t jdgt jd}td||D ].}|t d| }||k }t j|	| |dd}|t j}|| }|t 	t 
||d7 }q$|| }t || }d| }td||D ]D}|t d| }||k }t j|	| |dd}t j|| |dd}|t j}|t j}|| | }||j}t j|
| ||d qhdS )	z
    Compute RMS normalization along the last dimension of a 2D tensor.
    RMS Norm: y = x / sqrt(mean(x^2) + eps) * weight
    Each block handles one row of the input tensor.
    r   r   r<   r>   r?   g      ?rB   N)r   rC   rG   rH   rL   rM   rF   rE   rN   r   rI   sqrtr=   rQ   )r   
weight_ptrr   r   r   r   epsr   r   r   r   sum_sqr   r   r@   r   vals_f32sq_valsmean_sqrmsinv_rmsweight
weight_f32
output_f32r   r"   r"   r#   _rms_norm_kernel  s2   r  ư>r  r  c           
   
   C   s   |  dks
J d| jd |jd ks%J d| jd  d|jd  d| j}| d| jd }| }| }|j\}}t|}d}|f}	t|	 ||||d|d|||d	 ||S )
a  
    Compute RMS normalization using Triton kernel.

    RMS Norm normalizes the input by the root mean square and scales by weight:
    output = input / sqrt(mean(input^2) + eps) * weight

    Args:
        input: Input tensor of shape (..., hidden_size)
        weight: Weight tensor of shape (hidden_size,)
        eps: Small constant for numerical stability

    Returns:
        Tensor with RMS normalization applied along the last dimension
    r   zWeight must be 1-dimensionalr   r   zInput last dimension (z) must match weight dimension ()r   r   )r   r   r   r   r   r   r  r   )
r   r  r  r   r   r   r   r   r   r
   r"   r"   r#   rms_normJ  s2   



r  c                 C   s   t | ||dS )a  
    Batch-invariant wrapper for RMS normalization.

    This function provides a deterministic, batch-invariant implementation
    of RMS normalization for use with the batch_invariant mode.

    Args:
        input: Input tensor of shape (..., hidden_size)
        weight: Weight tensor of shape (hidden_size,)
        eps: Small constant for numerical stability

    Returns:
        RMS normalized tensor
    )r  )r  )r   r  r  r"   r"   r#   rms_norm_batch_invarianty  s   r   c                 C   s"   t | | }|d ur|| }|S r%   )r   t)r   r  rq   r   r"   r"   r#   linear_batch_invariant  s   r"  c                  C   sH  t rd S da tjddatdstdstdr9tdt	d td	t
d td
td tdtd ntjdd atjdd adtjd< dtjd< tdtd tdtd tdtd tdtd tdtd tjatt_tjjjjatjjjjatdrdnd} | tjjj_| tjjj_tjjj dd d S )NTatenIMPLd   P   Y   zaten::mmCUDAzaten::addmmzaten::matmulzaten::linearCUBLAS_WORKSPACE_CONFIGCUBLASLT_WORKSPACE_SIZEz:16:81zaten::_log_softmaxzaten::softmaxzaten::_softmaxzaten::mean.dimz	aten::bmmz
2.10.0.dev)FFFcublaslt)backend)!_batch_invariant_MODEr   libraryLibrary_batch_invariant_LIBr   is_device_capability_familyis_device_capabilityimplr   r   r   r"  osenvironget_original_cublas_workspace_cfg!_original_cublaslt_workspace_sizer  r  r  r   bmm_original_torch_bmmbackendsrt   matmul&allow_bf16_reduced_precision_reduction"_original_bf16_reduction_precision&allow_fp16_reduced_precision_reduction"_original_fp16_reduction_precisionr   preferred_blas_library)reduced_precision_valr"   r"   r#   enable_batch_invariant_mode  sP   





rD  c                  C   s0   t dd} zt| dkW S  ty   Y dS w )NVLLM_BATCH_INVARIANT0r   F)r5  getenvintr   )valr"   r"   r#   _read_vllm_batch_invariant  s   rJ  rE  c                   C   s   t S r%   )rE  r"   r"   r"   r#   vllm_is_batch_invariant  s   rK  attention_backendc                 C   s   t jt jg}|t jt jg }| |vr-dd |D }| r| jnd }d| d| d}t|| |vr:d}tj|dd d	t	j
d
< dt	j
d< dt	j
d< d	t	j
d< d	t	j
d< dt	j
d< dt	j
d< dt	j
d< dt	j
d< dt	j
d< dt	j
d< dt	j
d< d	t	j
d< d S )Nc                 S   s   g | ]}|j qS r"   )r   )r  rk   r"   r"   r#   r	    s    z0override_envs_for_invariance.<locals>.<listcomp>z;VLLM batch_invariant mode requires an attention backend in z, but got 'z{'. Please use --attention-backend or attention_config to set one of the supported backends before enabling batch_invariant.zuYou are using a non-decode-invariant form of batch invariance. This will not be invariant between prefill and decode.local)scoperF  VLLM_ALLREDUCE_USE_SYMM_MEMz:4096:8r)  GROUPNCCL_LAUNCH_MODENCCL_COLLNET_ENABLENCCL_NVLS_ENABLEr+  NCCL_P2P_NET_DISABLENCCL_MIN_NCHANNELSNCCL_MAX_NCHANNELSSimple
NCCL_PROTOzallreduce:tree	NCCL_ALGONCCL_NTHREADSNCCL_SOCKET_NTHREADSVLLM_USE_AOT_COMPILE)r	   
FLASH_ATTNTRITON_ATTNFLASH_ATTN_MLA
TRITON_MLAr   RuntimeErrorloggerwarning_oncer5  r6  )rL  decode_invariant_backendssupported_backendssupported_namesbackend_nameerrorwarningr"   r"   r#   override_envs_for_invariance  sD   











rj  c                 C   s@   t  rt|  t  dtjjj_dtjjj	_dtjjj
_d S d S )Nieee)rK  rj  rD  r   r<  rt   r=  fp32_precisioncudnnconvrnn)rL  r"   r"   r#   init_batch_invariance!  s   rp  r%   )r   )FN)r  )>r5  collections.abcr   typingr   r   vllm.loggerr   vllm.platformsr   vllm.triton_utilsr   r   vllm.utils.torch_utilsr   #vllm.v1.attention.backends.registryr	   __name__rb  dictstrr$   jitr1   	constexprrs   Tensorr   r   r   rH  r   r   boolr=   r   r   r   r   r   r  r  r  r  r   r  r   r"  r.  r1  r;  rA  r?  r8  r9  rD  rJ  rE  __annotations__rK  rj  rp  r"   r"   r"   r#   <module>   s  






	]
M >,4
Z:X

3
0

>
6