o
    پic(                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZ d dlmZ eeZeddkZed	dkZed
ddd Zdd Zdd ZdHddZeeddZ de
dej!f de
dej!f fddZ"de
dej!f de
dej!f fddZ#e#Z$dd  Z%d!d" Z&d#d$ Z'edddId%edefd&d'Z(eddde)fd(d)Z*eddded* fd+d,Z+e* d-kre* nd.Z,e-ee,Z.e+ Z/e/d/kZ0e/d0kZ1e/d1kZ2e1od2ej34d v Z5e2od3ej64d v pej67 d  d4kZ8e2oej9:d5ddkZ;e2o'ej67d d  d6kZ<e=ej>d7Z?d8d9 Z@G d:d; d;eZAedddJd=e)d%edeBfd>d?ZCed@krte,dAkrWd.ne,Z,ejDejEjFe,dBZGejDejEjHe,dBZIdCefdDdEZJdS e,d.ks}J dFe.jEjFZGe.jEjHZIdCefdGdEZJdS )K    N)Enum)	lru_cache)AnyCallableDictLiteralOptionalTuple)version)torch_releaseFLA_COMPILER_MODE1
FLA_CI_ENV   )maxsizec                  C   s   t jdkr
td ttj} td}| |k r"td|  d tt jj	 dt jj
 }td}||k rBtd| d	 d
S )z
    Checks the current operating system, Triton version, and Python version,
    issuing warnings if they don't meet recommendations.
    This function's body only runs once due to lru_cache.
    win32zDetected Windows operating system. Triton does not have an official Windows release, thus FLA will not be adapted for Windows, and any potential errors will not be fixed. Please consider using a Linux environment for compatibility.z3.2.0zCurrent Triton version z is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton..z3.11zCurrent Python version zv is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.N)sysplatformloggerwarningr
   parsetriton__version__version_infomajorminor)triton_versionrequired_triton_version
py_versionrequired_py_version r!   Y/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/fla/utils.pycheck_environments   s"   




r#   c                 C   s    |   |       S N)detachflattenabsmaxitem)xyr!   r!   r"   get_abs_err?   s    r,   c                 C   sL   |   |        }|        }||d  S )Ng:0yE>)r%   r&   squaremeansqrtr)   )r*   r+   errbaser!   r!   r"   get_err_ratioC   s   $r2   Fư>c           
      C   s   t ||}|  d|ddt||d}t| t||}||kr$d S |s0trA|dk s0|dkrA||kr?dd l}	|	| d S d S ||k sIJ |d S )Nz diff: z.6fz ratio: g{Gz?g333333?r   )r,   r2   r   infor   warningswarn)
prefixreftriratior   err_atolabs_atolmsg
error_rater5   r!   r!   r"   assert_closeI   s   


r?   GDN_RECOMPUTE_SUPPRESS_LEVEL0fn.returnc                    s4   g  dt dtdtdtf fdd}|S )aR  
    A decorator that caches the most recent results of a function with tensor inputs.
    This decorator will store the output of the decorated function for the most recent set of input tensors.
    The cache is limited to a fixed size (default is 4). When the cache is full, the oldest entry will be removed.
    Args:
        fn (Callable[..., torch.Tensor]):
            The function to be decorated. It should take tensor inputs and return tensor outputs.
    Returns:
        Callable[..., torch.Tensor]:
            A wrapped version of the input function with single-entry caching.
       argskwargsrC   c                     s   t D ]J\}}|\} }t| t|krNt|t krNtdd t| |D rNt fdd| D rNd | |d d   | ||fg |  S q| i |}tkrbdd  | ||f |S )Nc                 s   s    | ]	\}}||u V  qd S r$   r!   ).0abr!   r!   r"   	<genexpr>r   s    z0tensor_cache.<locals>.wrapper.<locals>.<genexpr>c                 3   s(    | ]\}}| v o| | u V  qd S r$   r!   rG   kvlast_kwargsr!   r"   rJ   r       
r   )	enumeratelenallzipitemsappend)rE   rF   ientry	last_argslast_resultresultcache_entries
cache_sizerB   rN   r"   wrapperl   s&   
 $

ztensor_cache.<locals>.wrapper)	functoolswrapsr   rB   r_   r!   r\   r"   tensor_cache\   s
    rc   c                    s   t   fdd}|S )zn
    A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
    c                     s   dd | D }dd |  D }d }| D ]}t|tjr |} nq|d u r6| D ]}t|tjr5|} nq)|d urAt|jj}nt	 }|  |i |W  d    S 1 sYw   Y  d S )Nc                 s   (    | ]}t |tjs|n| V  qd S r$   
isinstancetorchTensor
contiguousrG   rW   r!   r!   r"   rJ      rP   z/input_guard.<locals>.wrapper.<locals>.<genexpr>c                 S   *   i | ]\}}|t |tjs|n| qS r!   re   rK   r!   r!   r"   
<dictcomp>       z0input_guard.<locals>.wrapper.<locals>.<dictcomp>)
rU   rf   rg   rh   valuescustom_device_ctxdeviceindex
contextlibnullcontext)rE   rF   contiguous_argscontiguous_kwargstensorargvaluectxrB   r!   r"   r_      s0   $zinput_guard.<locals>.wrapperr`   ra   rb   r!   rz   r"   input_guard   s   r|   c                    s    fdd}|S )zf
    Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
    c                    s   t   fdd}|S )Nc                    sD   ddl m} |  | gdd |D R i dd | D S )Nr   )require_versionc                 s   rd   r$   re   rj   r!   r!   r"   rJ      s
    
zFrequire_version.<locals>.decorator.<locals>.wrapper.<locals>.<genexpr>c                 S   rk   r!   re   rK   r!   r!   r"   rl      rm   zGrequire_version.<locals>.decorator.<locals>.wrapper.<locals>.<dictcomp>)transformers.utils.versionsr}   rU   )ry   rE   rF   r}   )rB   hintr
   r!   r"   r_      s   
z3require_version.<locals>.decorator.<locals>.wrapperr{   rb   r   r
   rz   r"   	decorator   s   z"require_version.<locals>.decoratorr!   )r
   r   r   r!   r   r"   r}      s   r}   c                    s    fdd}|S )Nc                     s   t jjj g| R i |S r$   )rg   utils
checkpoint)rE   rF   rz   r!   r"   r_      s   zcheckpoint.<locals>.wrapperr!   rb   r!   rz   r"   r      s   r   c                  C   s   dd l } | jddd d S )Nr   z>Triton is not supported on current platform, roll back to CPU.r   )
stacklevel)r5   r6   )r5   r!   r!   r"   _cpu_device_warning   s   
r   
tensor_idxc                 C   s4   zt jjjj| d W S  ty   t  Y dS w )Nmultiprocessor_count)r   runtimedriveractiver   get_device_propertiesBaseExceptionr   )r   r!   r!   r"   get_multiprocessor_count   s   r   c                   C   s.   z	t jjj jW S  ty   t  Y dS w )Ncpu)r   r   r   r   get_current_targetbackendr   r   r!   r!   r!   r"   get_available_device   s   r   )nvidiaamdintelmusac                  C   s.   t  } | dkr	dS | dkrdS | dkrdS | S )Ncudar   hipr   xpur   )r   )rp   r!   r!   r"   _check_platform   s   r   r   r   r   r   r   zIntel(R) Arc(TM) AzNVIDIA H	   FLA_USE_CUDA_GRAPH   gatherc                   C   s8   zdd t t D W S  ty   t  dg Y S w )Nc                 S   s"   g | ]}t jjjj|d  qS )max_shared_mem)r   r   r   r   r   r   rj   r!   r!   r"   
<listcomp>  s    z*get_all_max_shared_mem.<locals>.<listcomp>r   )rangedevice_torch_libdevice_countr   r   r!   r!   r!   r"   get_all_max_shared_mem  s   

r   c                   @   s2   e Zd ZdZdZdZdZedede	fddZ
d	S )
Backendi  i  i  i  archrC   c                 C   s,   z| |   jW S  ty   | jj Y S w r$   )upperrx   KeyErrorDEFAULT)clsr   r!   r!   r"   get_shared_memory#  s
   zBackend.get_shared_memoryN)__name__
__module____qualname__ADAAMPEREHOPPERr   classmethodstrintr   r!   r!   r!   r"   r     s    r   noner   c                 C   s4   zt  }|| }|t| kW S  ty   Y dS w )NF)r   r   r   	Exception)r   r   device_shared_mem_listmax_shared_memoryr!   r!   r"   check_shared_mem+  s   r   )   rD   r   )device_typerq   c                 C   s
   t | S r$   )r   rp   rq   r!   r!   r"   ro   :  s   
ro   z:Only cuda device is supported for PyTorch version < 2.4.0.c                 C   s   t j| S r$   )rg   r   rp   r   r!   r!   r"   ro   D  s   )Fr3   )r   )r   r   )Krr   r`   loggingosr   enumr   r   typingr   r   r   r   r   r	   rg   r   	packagingr
   sglang.srt.utils.commonr   	getLoggerr   r   getenvCOMPILER_MODEr   r#   r,   r2   r?   r   SUPPRESS_LEVELrh   rc   r|   ri   r}   r   r   r   r   r   r   rp   getattrr   device_platformis_amdis_intel	is_nvidiar   get_device_nameis_intel_alchemistr   get_device_capabilityis_nvidia_hopperenvirongetuse_cuda_graphis_tf32_supportedhasattrlanguageis_gather_supportedr   r   boolr   partialamp
custom_fwdautocast_custom_fwd
custom_bwdautocast_custom_bwdro   r!   r!   r!   r"   <module>   s|    

%
&&*%


	