o
    c۷iF                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZddlmZ d	ZeZd
d Zdd ZG dd dejjjZdd Z dddZ!G dd dZ"G dd dZ#	dddZ$dS )    )annotationsN)Path)cached_propertypartial)DictTupleListOptionalAny)Tensor   )__version__quackc                   C  s   t t  dt S )N_HOME)osgetenvPACKAGE_NAMEupperr   home r   r   E/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/autotuner.pyget_home_dir      r   c                   C  s   t jt dt dS )N.cache)r   pathjoinr   r   r   r   r   r   default_cache_dir    s   r   c                      s   e Zd Z fddZ  ZS )FileCacheManagerc                   sv   t  | tt  dd pt | _| jr7tj	
| j| j| _tj	
| jd| _tj| jdd d S td)N
_CACHE_DIR lockT)exist_okz$Could not create or locate cache dir)super__init__r   r   r   r   stripr   	cache_dirr   r   key	lock_pathmakedirsRuntimeError)selfr'   	__class__r   r   r$   %   s   zFileCacheManager.__init__)__name__
__module____qualname__r$   __classcell__r   r   r,   r   r   $   s    r   c                 C  s   t t| ddS )Nutf-8=)base64	b32encodebytesfromhexdecoderstripr'   r   r   r   _base322   s   r;      c                 C  sv   t jdddt jd}t j  | d }t }t | |k r9tdD ]}|| }q#t j  t | |k sdS dS )zSaturate the GPU to reach thermal steady-state before benchmarking.

    Without this, the first autotuning config gets artificially good numbers
    because the GPU hasn't been power-throttled yet.
    i   cuda)devicedtypei  d   N)torchrandnbfloat16r=   synchronizetimerange)duration_msatargett0_r   r   r   _gpu_warmup7   s   


rL   c                   @  s^   e Zd Z				ddddZedd Zd	d
 Zdd Zej	j
dd Zdd ZdddZdS )	AutotunerNFprune_configs_byOptional[Dict]c                   s  |st  g _n| _t|}| _i  _t|j  _|p,t	
t  dddk _g  _|dur:t| _t jdkrK fdd}	|	 _nd _t jdkr_ fdd}
|
 _nd _d _d	 _d _|r|d
 j _|d j _|d j _| _| _dS )a  
        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
            'perf_model': performance model used to predicate running time with different configs, returns running time
            'top_k': number of configs to bench
            'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs.
        _CACHE_AUTOTUNINGN1r   c                   s    fddj D _d S )Nc                   s   i | ]	}| |   qS r   )clone).0namekwargsr   r   
<dictcomp>k       z9Autotuner.__init__.<locals>._pre_hook.<locals>.<dictcomp>)restore_valuerestore_copiesrU   r+   rU   r   	_pre_hookj   r   z%Autotuner.__init__.<locals>._pre_hookc                   s*    j D ]}| |  j|  qi  _d S N)rY   copy_rZ   )rV   	exceptionrT   r[   r   r   
_post_hooks   s   

z&Autotuner.__init__.<locals>._post_hook      ?
perf_modeltop_kearly_config_prune)AutotuneConfigconfigsinspect	signaturekeysr   list
parameters	arg_namesr   r   r   r   cache_resultsrY   lenpre_hook	post_hookrb   configs_top_krd   getfn	_do_bench)r+   rs   r'   rf   rY   rN   do_benchrm   rh   r\   r`   r   r[   r   r$   H   s>   


zAutotuner.__init__c                 C  s"   | j d u rttjjdddS | j S )N      )warmuprep)rt   r   tritontestingru   r[   r   r   r   ru      s   
zAutotuner.do_benchc             	     s  ddl m} |s
dS tt|ttdd}|dkrdS t }zt|fi |d 	 }| j
|i | W n	 ty@   Y nw t | dk rKdS tt  ddd	k}|rgtd
t| d| d t }	ddl ddlddl}
ddl} fdd} fdd}g }|D ] }t|tr|t|jt| t|jd q|| q| j
j}| j
j}g }t|D ])}|
j|jddg|
j |
j |s|
j!ndd}||j"}|dkr|#  q|| q|sdS dgt| }t$|D ]&\}}||t|  }||j%|||||	 d ||t|   d7  < qt$|D ]\}}t|| D ]}||j" q*q |D ]}|j%&  |'  q7|rVtdt |	 dd dS dS )a"  Pre-compile all configs in parallel subprocesses to populate .so cache.

        cute.compile() is not thread-safe (MLIR thread-local state) and fork after
        CUDA init causes segfaults. So we spawn persistent subprocess workers: each
        has its own CUDA context, creates FakeTensors matching the parent's tensor
        metadata, and compiles with COMPILE_ONLY=True. Workers stay alive to amortize
        import overhead across multiple configs. The parent then loads instantly from
        the .so cache during benchmarking.
        r   )CACHE_ENABLEDNQUACK_COMPILE_WORKERS8r         ?_PRINT_AUTOTUNINGrQ   zPre-compiling z configs with z workersc                   s6     |}| dt| | | |   d S )N<I)dumpswritepackrn   flush)streammsgdatapicklestructr   r   _send   s   

z$Autotuner._precompile.<locals>._sendc                   sB   |  d}t|dk rd S d|d }|r |  |S d S )N   r   r   )readrn   unpackloads)r   headerlengthr   r   r   _recv   s
   
z$Autotuner._precompile.<locals>._recv)shapestrider?   z-mzquack._compile_worker)stdinstdoutstderrREADY)	fn_modulefn_qualnametensor_metarV   config_kwargszPre-compilation done in z.1fs)(quack.cache_utilsr|   minrn   intr   r   rE   dict
all_kwargsrs   	Exceptionr   r   printr   r   
subprocesssys
isinstancer   appendrj   r   r   strr?   r/   r0   rF   Popen
executablePIPEDEVNULLr   kill	enumerater   closewait)r+   rf   argsrV   r|   max_workerst_checkcurrentverboserJ   r   r   r   r   r   argr   r   workersrK   preadypendingiconfigwwir   r   r   _precompile   s   






zAutotuner._precompilec             
     s   t jt  dd dk}|rtdjj d|  | |j	 @ }|r2t
dd| dt|fi | i j fdd	}zj|d
dW S  tyz } z|retd|  tdtdtdgW  Y d }~S d }~ww )Nr   rQ   zAutotuning kernel z with config zConflicting meta-parameters: , z8. Make sure that you don't re-define auto-tuned symbols.c               
     s   j d ur
  zjj i  W n ty3 }  zzjd ur,j| d W  W   d } ~ ww jd urBjd d d S d S )N)r_   )ro   rs   __call__r   rp   )er   r   
full_nargsr+   r   r   kernel_call  s(   




z%Autotuner._bench.<locals>.kernel_call)r   g?g?)	quantileszAutotuning failed with inf)r   environrr   r   r   r   rs   r.   ri   rV   
ValueErrorr   r   r   nargsru   r   float)r+   r   r   metar   	conflictsr   r   r   r   r   _bench  s&   "zAutotuner._benchc                   sj  |s|  d S | j }dd |D }t|tt|ksJ dtt|g| }td|d	 }t
t|}|jd d  d}||}	|	rtjt  dd	sd
d t||D  t|	d*}
t|
d } fdd|D }tj||jd| j|< || _d| _W d    d S 1 sw   Y  d S |  |jt|dd | j D d|d	d d S )Nc                 S  s   g | ]}t |qS r   r   )rS   cr   r   r   
<listcomp>6  s    z.Autotuner.check_disk_cache.<locals>.<listcomp>zConfig strings must be unique-r2      z.autotune.json_FORCE_CACHE_UPDATEFc                 S  s   i | ]\}}||qS r   r   )rS   r   r   r   r   r   rW   ?  s    z.Autotuner.check_disk_cache.<locals>.<dictcomp>rconfigs_timingsc                   s   i | ]	\}} | |qS r   r   )rS   r   timing
str2configr   r   rW   B  rX   r:   r   c                 S  s   g | ]
\}}t ||fqS r   r   )rS   r   timingsr   r   r   r   M  s    )r'   r   )binary)rs   rn   setVERSIONr   hashlibsha256r   encode	hexdigestr   r;   r.   get_filer   r   rr   r   r   zipopenjsonloadbuiltinsr   r   r   
bench_timeputr   items)r+   
tuning_keyrf   bench_fnrs   config_str_list	cache_keyr   	file_namer   cached_configsr   r   r   r   check_disk_cache/  sF   


zAutotuner.check_disk_cachec           
   	     s  t tj_d}tjdkri j}fdd| D   fddjD   D ]'\}}t|t	rY
t|j 
tdd | D  
t|j q2tjvrd}tjjfd	d
}jr| n|  j }njd }|_tt  dd dkr|stt djj djddj d jj i |! }	d _|	S )NTr   c                   s    i | ]\}}| j v r||qS r   )rl   )rS   kvr[   r   r   rW   [       z&Autotuner.__call__.<locals>.<dictcomp>c                   s    g | ]}| v rt  | qS r   r   )rS   r'   )_argsr   r   r   ]  r   z&Autotuner.__call__.<locals>.<listcomp>c                 S  s   g | ]
}|d v r
|ndqS )>   r   r      r   )rS   r   r   r   r   r   b  s    Fc                    s   j  di t  t }  fddD }t }tt  dd dkrD| D ]\}}td| d|d d	d
 q1||  _	t
j||jdj< |_d S )Nrf   c                   s"   i | ]}|j  d |iqS )r   )r   rS   r   )r   rV   r+   r   r   rW   n  s    z9Autotuner.__call__.<locals>.benchmark.<locals>.<dictcomp>r   rQ   [z] -> r   z.3fmsr:   )r   rL   rE   r   r   r   r   r   r   r   r   r   rr   r   r   )bench_startr   	bench_endr   time_)r   r'   rV   pruned_configsr+   r   r   	benchmarki  s   

z%Autotuner.__call__.<locals>.benchmarkr   r   rQ   z autotuning for function z finished after z.2fzs; best config selected: ;)"r   r   rl   r   rn   rf   r   ri   r   r   r   r   r   r   r?   tupler   prune_configsrA   compilerdisablerm   r   best_configr   r   r   r   r   rs   r.   r   r   r   )
r+   r   rV   used_cached_resultall_argsrK   r   r   r   retr   )r   r   r'   rV   r   r+   r   r   V  sV   



zAutotuner.__call__rV   r   return	List[Any]c                   s   j }jrjj jfi }jrTj}t|tr+|dkr+ttj | }n	t|ts4t	dt||krTfdd|D  t
   fdddd | }|S )Nra   zPError while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an intc                   s,   i | ]}|j d i j | qS )r   )rb   r   r   r   )rV   r+   r   r   rW     s    
z+Autotuner.prune_configs.<locals>.<dictcomp>c                   s    |  S r]   r   )x)
est_timingr   r   <lambda>  s    z)Autotuner.prune_configs.<locals>.<lambda>r:   )rf   rd   r   rb   rq   r   r   r   rn   	TypeErrorsortedri   )r+   rV   r   rc   r   )r  rV   r+   r   r    s"   
 zAutotuner.prune_configs)NNNF)rN   rO   )rV   r   r
  r  )r.   r/   r0   r$   r   ru   r   r   rA   r  r  r   r   r  r   r   r   r   rM   G   s    A
u+
&=rM   c                   @  s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )re   z
    An object that represents a possible kernel configuration for the auto-tuner to try.

    :ivar kwargs: a dictionary of meta-parameters to pass to the kernel as keyword arguments.
    :type kwargs: dict[Str, Any]
    c                 K  s
   || _ d S r]   rU   )r+   rV   r   r   r   r$     s   
zAutotuneConfig.__init__c                 C  s   | di | _d S )NrV   )rr   rV   )r+   stater   r   r   __setstate__  s   zAutotuneConfig.__setstate__c                 C  s   | j S r]   rU   r[   r   r   r   r     s   zAutotuneConfig.all_kwargsc                 C  s6   g }| j  D ]\}}|| d|  qd|S )Nz: r   )rV   r   r   r   )r+   resr   r   r   r   r   __str__  s   
zAutotuneConfig.__str__c                 C  s   t t|    S r]   )hashr  r   r   r[   r   r   r   __hash__  s   zAutotuneConfig.__hash__c                 C  s(   t |    }t |   }||kS r]   )r  r   r   )r+   other
self_tupleother_tupler   r   r   __eq__  s   zAutotuneConfig.__eq__N)
r.   r/   r0   __doc__r$   r  r   r  r  r  r   r   r   r   re     s    re   Tc                   s6   dt   d d u rg  fdd}|S )Nzw
    Decorator for auto-tuning a function function.

    .. highlight:: python

    If the environment variable :code:`a  _PRINT_AUTOTUNING` is set to
    :code:`"1"`, we will print a message to stdout after autotuning each
    kernel, including the time spent autotuning and the best configuration.

    :param configs: a list of :code:`AutotuneConfig` objects
    :type configs: list[AutotuneConfig]
    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
    :type key: list[str]
    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
        'perf_model': performance model used to predicate running time with different configs, returns running time
        'top_k': number of configs to bench
        'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs.
    :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
    :type restore_value: list[str]
    :param do_bench: a benchmark function to measure the time of each run.
    :type do_bench: lambda fn, quantiles
    :param cache_results: whether to cache autotune timings to disk.  Defaults to False.
    "type cache_results: bool
    c              	     s   t |  dS )N)rY   rN   ru   rm   )rM   )rs   rm   rf   ru   r'   rN   rY   r   r   	decorator  s   zautotune.<locals>.decorator)r   r   )rf   r'   rN   rY   ru   rm   r  r   r  r   autotune  s   r  )r<   )NNNNT)%
__future__r   r   r   rE   rg   r4   r   r   pathlibr   	functoolsr   r   typingr   r   r   r	   r
   rA   r   rz   r    r   r   r   r   r   runtimer   r   r;   rL   rM   re   r  r   r   r   r   <module>   s8   
  i!