o
    پiPL                     @   s   d dl Z d dlZd dlZd dlZd dl mZ d dlmZmZmZ d dl	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ G dd	 d	Zd
ee fddZG dd dZG dd deZdd ZG dd deZdS )    N)	ExitStack)AnyCallableOptional)patch)compilation_counter)pass_context)torch_releasec                   @   s   e Zd ZU dZeed< 	ddededefdd	Zd
efddZ		dde	j
dee deeef dee dee d
eee ee f fddZ	ddede	j
dee dedee d
efddZdS )CompilerInterfacez@
    The interface for a compiler that can be used by vLLM.
    nameF 	cache_dirdisable_cacheprefixc                 C      dS )a\  
        when the vLLM process uses `cache_dir` as the cache directory,
        the compiler should initialize itself with the cache directory,
        e.g. by re-directing its own cache directory to a sub-directory.

        prefix can be used in combination with cache_dir to figure out the base
        cache directory, e.g. there're multiple parts of model being compiled,
        but we want to share the same cache directory for all of them.

        e.g.
        cache_dir = "/path/to/dir/backbone", prefix = "backbone"
        cache_dir = "/path/to/dir/eagle_head", prefix = "eagle_head"
        N )selfr   r   r   r   r   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/compilation/compiler_interface.pyinitialize_cache   s   z"CompilerInterface.initialize_cachereturnc                 C   r   )a  
        Gather all the relevant information from the vLLM config,
        to compute a hash so that we can cache the compiled model.

        See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
        to check what information
        is already considered by default. This function should only
        consider the information that is specific to the compiler.
        r   r   r   r   r   r   compute_hash/   s   
zCompilerInterface.compute_hashNgraphexample_inputscompiler_configruntime_shapekeyc                 C   r   )ak  
        Compile the graph with the given example inputs and compiler config,
        with a runtime shape. If the `runtime_shape` is None, it means
        the `example_inputs` have a dynamic shape. Otherwise, the
        `runtime_shape` specifies the shape of the inputs. Right now we only
        support one variable shape for all inputs, which is the batchsize
        (number of tokens) during inference.

        Dynamo will make sure `graph(*example_inputs)` is valid.

        The function should return a compiled callable function, as well as
        a handle that can be used to directly load the compiled function.

        The handle should be a plain Python object, preferably a string or a
        file path for readability.

        If the compiler doesn't support caching, it should return None for the
        handle. If the compiler fails to compile the graph, it should return
        None for the compiled function as well.

        `key` is required for StandaloneInductorAdapter, it specifies where to
        save the compiled artifact. The compiled artifact gets saved to
        `cache_dir/key`.
        NNr   )r   r   r   r   r   r   r   r   r   compile;   s    zCompilerInterface.compilehandlegraph_indexc                 C      t d)z
        Load the compiled function from the handle.
        Raises an error if the handle is invalid.

        The handle is the second return value of the `compile` function.
        zcaching is not supportedNotImplementedError)r   r   r   r   r    r   r   r   r   load]   s   zCompilerInterface.loadFr   r   N)__name__
__module____qualname____doc__str__annotations__boolr   r   fxGraphModulelistr   dictr   inttupler   r   r$   r   r   r   r   r
      sR   
 


(r
   r   c                  C   sB   g } ddl m} | }| | ddl m} | }| | | S )Nr   )	CacheBase)	torch_key)torch._inductor.codecacher4   
get_systemappendr5   )factorsr4   system_factorsr5   torch_factorsr   r   r   get_inductor_factorsn   s   

r<   c                   @   s2   e Zd ZdZdddZdd Zdd	 Zd
d ZdS )AlwaysHitShapeEnva  
    Why do we need this class:

    For normal `torch.compile` usage, every compilation will have
    one Dynamo bytecode compilation and one Inductor compilation.
    The Inductor compilation happens under the context of the
    Dynamo bytecode compilation, and that context is used to
    determine the dynamic shape information, etc.

    For our use case, we only run Dynamo bytecode compilation once,
    and run Inductor compilation multiple times with different shapes
    plus a general shape. The compilation for specific shapes happens
    outside of the context of the Dynamo bytecode compilation. At that
    time, we don't have shape environment to provide to Inductor, and
    it will fail the Inductor code cache lookup.

    By providing a dummy shape environment that always hits, we can
    make the Inductor code cache lookup always hit, and we can
    compile the graph for different shapes as needed.

    The following dummy methods are obtained by trial-and-error
    until it works.
    r   Nc                 C   s
   g | _ d S r&   )guardsr   r   r   r   __init__   s   
zAlwaysHitShapeEnv.__init__c                 O   r   )NTr   r   argskwargsr   r   r   evaluate_guards_expression      z,AlwaysHitShapeEnv.evaluate_guards_expressionc                 O   s   g S r&   r   r@   r   r   r   get_pruned_guards   rD   z#AlwaysHitShapeEnv.get_pruned_guardsc                 O   r   )Nr   r   r@   r   r   r   produce_guards_expression   rD   z+AlwaysHitShapeEnv.produce_guards_expression)r   N)r'   r(   r)   r*   r?   rC   rE   rF   r   r   r   r   r=   ~   s    
r=   c                   @   s   e Zd ZdZdZdefddZ	dded	ed
efddZ		dde	j
dee deeef dee dee deee ee f fddZ	ddede	j
dee dedee defddZdejfddZdS )InductorAdaptorzG
    The adaptor for the Inductor compiler, version 2.5, 2.6, 2.7.
    inductorr   c                 C   s,   t  }tjt| dd d d }|S )NF)usedforsecurity
   )r<   hashlibmd5r+   encode	hexdigest)r   r9   hash_strr   r   r   r      s   
zInductorAdaptor.compute_hashFr   r   r   r   c                 C   s   || _ || _|r|d t|  n|| _|rd S tj| jd}tj|dd |tjd< tj| jd}tj|dd |tjd< d S )Ninductor_cacheT)exist_okTORCHINDUCTOR_CACHE_DIRtriton_cacheTRITON_CACHE_DIR)	r   r   lenbase_cache_dirospathjoinmakedirsenviron)r   r   r   r   rP   rS   r   r   r   r      s   
z InductorAdaptor.initialize_cacheNr   r   r   r   r   c              	      s
  t  jd7  _ddlm} i }|d ur|| d|d< d|d< t|| t|}d\dd	lm	}m
  td d
 dkrQ|jd}	fdd}
tjjj}ntdkr_d }	fdd} fdd}dd }dtfdd}t }|	d ur|t|	|
 |td| |td| ddlm} t|dr|td| |td| |  |tjjjdd |tjjjdd  |tjjjdd! t| |||||d"}W d    n1 sw   Y  W d    n1 sw   Y  |ffS )#N   r   )
compile_fxTfx_graph_cacheFfx_graph_remote_cacher   )FxGraphCachecompiled_fx_graph_hash   rb      z+torch._inductor.codecache.FxGraphCache.loadc                     sf   | i |}|j }|jj  js1|jD ]}t|jsq|jjjjr0|jjj  |S q|S r&   )current_callable__code__co_filename
startswithrV   __closure__callablecell_contents)rA   rB   inductor_compiled_graphcompiled_fncell)	file_pathoriginal_loadr   r   r   hijack_load   s   



z,InductorAdaptor.compile.<locals>.hijack_loadrb      c                     s|   t jjj| i |}|}|d ur<|j}|jj  js9|j	D ]}t
|js(q |jj}|jjr8|j  nq |j|S r&   )torch	_inductorr]   compile_fx_innerre   rf   rg   rh   rV   ri   rj   rk   _fx_graph_cache_key)rA   rB   outputrl   rm   rn   code)ro   rO   r   r   r   hijacked_compile_fx_inner  s    

z:InductorAdaptor.compile.<locals>.hijacked_compile_fx_innerc                     s    | i |}|d |S Nr   r   )rA   rB   out)ra   rO   r   r   hijack_compiled_fx_graph_hash  s   z>InductorAdaptor.compile.<locals>.hijack_compiled_fx_graph_hashc                  _   s   d S r&   r   rA   rB   r   r   r   _check_can_cache!  s   z1InductorAdaptor.compile.<locals>._check_can_cacher   c                   S      t  S r&   r=   r   r   r   r   _get_shape_env*  s   z/InductorAdaptor.compile.<locals>._get_shape_envz0torch._inductor.codecache.compiled_fx_graph_hash5torch._inductor.codecache.FxGraphCache._get_shape_envAOTAutogradCacher   Mtorch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_envz7torch._inductor.codecache.FxGraphCache._check_can_cache)r_   )enable_autograd_cache)enable_remote_autograd_cache)inner_compileconfig_patches)r   num_inductor_compilestorch._inductor.compile_fxr]   updateset_inductor_configcopydeepcopyr6   r`   ra   r	   r$   rt   ru   rv   r=   r   enter_contextr   -torch._functorch._aot_autograd.autograd_cacher   hasattrmetrics_contextconfig
_functorchr   )r   r   r   r   r   r   r]   current_configr`   original_load_namerq   rz   r}   r   r   stackr   compiled_graphr   )ra   ro   rO   rp   r   r   r      s   


	
	
GzInductorAdaptor.compiler   r    c                    sR  t |tsJ t |d tsJ t |d tsJ |d }ddlm} ddlm} t b}	|	t	ddd  t
|drF|	t	d	d
d  |	|   td d dkrf|||dd  d useJ dn!tdkrddlm}
 |
|}|||dd |\ } d usJ dW d    n1 sw   Y  ddlm} || fdd}|S )Nr   r\   r   )r`   r   c                  _   r   r&   r   r~   r   r   r   <lambda>      z&InductorAdaptor.load.<locals>.<lambda>r   r   c                  _   r   r&   r   r~   r   r   r   r     r   rb   rc   TFzMInductor cache lookup failed. Please removethe cache directory and try again.rr   )CompiledFxGraphConstantsWithGm)graph_returns_tuplec                     s    t | } |}r|S |d S r{   )r0   )rA   	list_argsgraph_outputrl   returns_tupler   r   r     s
   z,InductorAdaptor.load.<locals>.compiled_graph)
isinstancer3   r+   r   r   r6   r`   r   r   r   r   r   r	   _lookup_graphtorch._inductor.output_coder   r   r   )r   r   r   r   r    r   rO   r   r`   
exit_stackr   	constants_r   r   r   r   r   r$   v  sV   



-
zInductorAdaptor.loadc                 C   s   ddl }|jj S )a  
        This method returns the Dynamo metrics context (if it exists,
        otherwise a null context). It is used by various compile components.
        Present in torch>=2.6, it's used inside FxGraphCache in
        torch==2.6 (but not after). It might also be used in various other
        torch.compile internal functions.

        Because it is re-entrant, we always set it (even if entering via Dynamo
        and the context was already entered). We might want to revisit if it
        should be set at a different level of compilation.

        This is likely a bug in PyTorch: public APIs should not rely on
        manually setting up internal contexts. But we also rely on non-public
        APIs which might not provide these guarantees.
        r   N)torch._dynamo.utils_dynamoutilsget_metrics_context)r   rt   r   r   r   r     s   zInductorAdaptor.metrics_contextr%   r   r&   )r'   r(   r)   r*   r   r+   r   r-   r   r.   r/   r0   r   r1   r   r2   r3   r   r   r$   
contextlibAbstractContextManagerr   r   r   r   r   rG      sV    


 8
NrG   c                 C   s"   t |trd| d< d| d< d S d S )NTmax_autotunecoordinate_descent_tuning)r   r2   )r   r   r   r   r   r     s   
r   c                   @   s   e Zd ZdZ			ddejdee dee	ef de
e de
e	 d	ed
ee
e e
e f fddZ		ddedejdee dede
e d	ed
efddZdS )EagerAdaptereagerNr\   r   r   r   r   r   
num_graphsr   c                 C   s   |d fS r&   r   )r   r   r   r   r   r   r   r   r   r   r        	zEagerAdapter.compiler   r    c                 C   r!   )Nz"eager compilation is not supportedr"   )r   r   r   r   r    r   r   r   r   r   r$     r   zEagerAdapter.load)NNr\   )Nr\   )r'   r(   r)   r   r.   r/   r0   r   r1   r+   r   r2   r3   r   r   r$   r   r   r   r   r     sJ    

r   )r   r   rK   rW   r   typingr   r   r   unittest.mockr   rt   r   torch.fxr.   *sglang.srt.compilation.compilation_counterr   $sglang.srt.compilation.inductor_passr   sglang.srt.utils.commonr	   r
   r0   r<   r=   rG   r   r   r   r   r   r   <module>   s*   Z&  7