o
    ۷i                     @   s  d dl Z d dlZd dlmZmZmZmZmZ d dlm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ d dlmZ eeZ				d4dee	eej j!e
f deeeeef  dee deeeee eee  f  dee dee	ej j!e
ef fddZ"de	eB de#fddZ$de	deeej j! ee% f fddZ&de	eB dee%ej j!B  deeej j! ef fd d!Z'd"ej j!fd#d$Z(dee	eej j!f fd%d&Z)dee*ee% f fd'd(Z+de	e%B eB defd)d*Z,	d5d+ee* d,ee* d-ee* de-e* fd.d/Z.				0d6d+eee*  d,eee*  d-ee* d1ee% de-e* f
d2d3Z/dS )7    N)AnyTupleListUnionOptional)DiffusionPipeline
ModelMixin   )	CacheType)BlockAdapter)BlockAdapterRegister)CachedAdapter)BasicCacheConfig)DBCacheConfig)DBPruneConfig)CalibratorConfig)ParamsModifier   )ParallelismConfig)enable_parallelism)init_loggerpipe_or_adaptercache_configcalibrator_configparams_modifiersparallelism_configreturnc                 K   sB  |du r|du rt d t }nt d i }|dd }dur)|tjkr)| S |dd|dd|dd|dd|d	d|d
d|dd|dd|ddd	}dd | D }|r}t d |durv|jd-i | nt	d-i |}|dur||d< |dur|dddus|dddurt d ddl
m}	 |	|d|d|dd|ddd}|dur||d< |dur||d< |durt| tttjjtfrtj| fi |} ntdt|  dt d | jj d! |durt|tsJ d"d#|jvrt| |jd#< |jd# |_|jd$d }
dur.t| |
|jd$< g }t| tratj| |du d%}|du rSt| d&sNJ d'| j g}n!tj!|d(d)}t"|j }nt#| sntj!| d(d)} t"| j }t$|d*krt d+ | S t$|dkrt d, t%|D ]\}}t&||||< q| S ).u)  
    The `enable_cache` function serves as a unified caching interface designed to optimize the performance
    of diffusion transformer models by implementing an intelligent caching mechanism known as `DBCache`.
    This API is engineered to be compatible with nearly `all` diffusion transformer architectures that
    feature transformer blocks adhering to standard input-output patterns, eliminating the need for
    architecture-specific modifications.

    By strategically caching intermediate outputs of transformer blocks during the diffusion process,
    `DBCache` significantly reduces redundant computations without compromising generation quality.
    The caching mechanism works by tracking residual differences between consecutive steps, allowing
    the model to reuse previously computed features when these differences fall below a configurable
    threshold. This approach maintains a balance between computational efficiency and output precision.

    The default configuration (`F8B0, 8 warmup steps, unlimited cached steps`) is carefully tuned to
    provide an optimal tradeoff for most common use cases. The "F8B0" configuration indicates that
    the first 8 transformer blocks are used to compute stable feature differences, while no final
    blocks are employed for additional fusion. The warmup phase ensures the model establishes
    sufficient feature representation before caching begins, preventing potential degradation of
    output quality.

    This function seamlessly integrates with both standard diffusion pipelines and custom block
    adapters, making it versatile for various deployment scenarios—from research prototyping to
    production environments where inference speed is critical. By abstracting the complexity of
    caching logic behind a simple interface, it enables developers to enhance model performance
    with minimal code changes.

    Args:
        pipe_or_adapter (`DiffusionPipeline`, `BlockAdapter` or `Transformer`, *required*):
            The standard Diffusion Pipeline or custom BlockAdapter (from cache-dit or user-defined).
            For example: cache_dit.enable_cache(FluxPipeline(...)).

        cache_config (`BasicCacheConfig`, *required*, defaults to BasicCacheConfig()):
            Basic DBCache config for cache context, defaults to BasicCacheConfig(). The configurable params listed belows:
                Fn_compute_blocks: (`int`, *required*, defaults to 8):
                    Specifies that `DBCache` uses the**first n**Transformer blocks to fit the information at time step t,
                    enabling the calculation of a more stable L1 difference and delivering more accurate information
                    to subsequent blocks. Please check https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md
                    for more details of DBCache.
                Bn_compute_blocks: (`int`, *required*, defaults to 0):
                    Further fuses approximate information in the **last n** Transformer blocks to enhance
                    prediction accuracy. These blocks act as an auto-scaler for approximate hidden states
                    that use residual cache.
                residual_diff_threshold (`float`, *required*, defaults to 0.08):
                    the value of residual diff threshold, a higher value leads to faster performance at the
                    cost of lower precision.
                max_accumulated_residual_diff_threshold (`float`, *optional*, defaults to None):
                    The maximum accumulated relative l1 diff threshold for Cache. If set, when the
                    accumulated relative l1 diff exceeds this threshold, the caching strategy will be
                    disabled for current step. This is useful for some cases where the input condition
                    changes significantly in a single step. Default None means this feature is disabled.
                max_warmup_steps (`int`, *required*, defaults to 8):
                    DBCache does not apply the caching strategy when the number of running steps is less than
                    or equal to this value, ensuring the model sufficiently learns basic features during warmup.
                warmup_interval (`int`, *required*, defaults to 1):
                    Skip interval in warmup steps, e.g., when warmup_interval is 2, only 0, 2, 4, ... steps
                    in warmup steps will be computed, others will use dynamic cache.
                max_cached_steps (`int`, *required*, defaults to -1):
                    DBCache disables the caching strategy when the previous cached steps exceed this value to
                    prevent precision degradation.
                max_continuous_cached_steps (`int`, *required*, defaults to -1):
                    DBCache disables the caching strategy when the previous continous cached steps exceed this value to
                    prevent precision degradation.
                enable_separate_cfg (`bool`, *required*,  defaults to None):
                    Whether to do separate cfg or not, such as Wan 2.1, Qwen-Image. For model that fused CFG
                    and non-CFG into single forward step, should set enable_separate_cfg as False, for example:
                    CogVideoX, HunyuanVideo, Mochi, etc.
                cfg_compute_first (`bool`, *required*,  defaults to False):
                    Whether to compute cfg forward first, default is False, meaning:
                    0, 2, 4, ..., -> non-CFG step;
                    1, 3, 5, ... -> CFG step.
                cfg_diff_compute_separate (`bool`, *required*,  defaults to True):
                    Whether to compute separate difference values for CFG and non-CFG steps, default is True.
                    If False, we will use the computed difference from the current non-CFG transformer step
                    for the current CFG step.
                num_inference_steps (`int`, *optional*, defaults to None):
                    num_inference_steps for DiffusionPipeline, used to adjust some internal settings
                    for better caching performance. For example, we will refresh the cache once the
                    executed steps exceed num_inference_steps if num_inference_steps is provided.
                steps_computation_mask (`List[int]`, *optional*, defaults to None):
                    This param introduce LeMiCa/EasyCache style compute mask for steps. It is a list
                    of length num_inference_steps indicating whether to compute each step or not.
                    1 means must compute, 0 means use dynamic/static cache. If provided, will override
                    other settings to decide whether to compute each step.
                steps_computation_policy (`str`, *optional*, defaults to "dynamic"):
                    The computation policy for steps when using steps_computation_mask. It can be
                    "dynamic" or "static". "dynamic" means using dynamic cache for steps marked as 0
                    in steps_computation_mask, while "static" means using static cache for those steps.

        calibrator_config (`CalibratorConfig`, *optional*, defaults to None):
            Config for calibrator. If calibrator_config is not None, it means the user wants to use DBCache
            with a specific calibrator, such as taylorseer, foca, and so on.

        params_modifiers ('ParamsModifier', *optional*, defaults to None):
            Modify cache context params for specific blocks. The configurable params listed belows:
                cache_config (`BasicCacheConfig`, *required*, defaults to BasicCacheConfig()):
                    The same as 'cache_config' param in cache_dit.enable_cache() interface.
                calibrator_config (`CalibratorConfig`, *optional*, defaults to None):
                    The same as 'calibrator_config' param in cache_dit.enable_cache() interface.
                **kwargs: (`dict`, *optional*, defaults to {}):
                    The same as 'kwargs' param in cache_dit.enable_cache() interface.

        parallelism_config (`ParallelismConfig`, *optional*, defaults to None):
            Config for Parallelism. If parallelism_config is not None, it means the user wants to enable
            parallelism for cache-dit. Please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/parallelism/parallel_config.py
            for more details of ParallelismConfig.
                backend: (`ParallelismBackend`, *required*, defaults to "ParallelismBackend.NATIVE_DIFFUSER"):
                    Parallelism backend, currently only NATIVE_DIFFUSER and NVTIVE_PYTORCH are supported.
                    For context parallelism, only NATIVE_DIFFUSER backend is supported, for tensor parallelism,
                    only NATIVE_PYTORCH backend is supported.
                ulysses_size: (`int`, *optional*, defaults to None):
                    The size of Ulysses cluster. If ulysses_size is not None, enable Ulysses style parallelism.
                    This setting is only valid when backend is NATIVE_DIFFUSER.
                ring_size: (`int`, *optional*, defaults to None):
                    The size of ring for ring parallelism. If ring_size is not None, enable ring attention.
                    This setting is only valid when backend is NATIVE_DIFFUSER.
                tp_size: (`int`, *optional*, defaults to None):
                    The size of tensor parallelism. If tp_size is not None, enable tensor parallelism.
                    This setting is only valid when backend is NATIVE_PYTORCH.
                parallel_kwargs: (`dict`, *optional*, defaults to {}):
                    Additional kwargs for parallelism backends. For example, for NATIVE_DIFFUSER backend,
                    it can include `cp_plan` and `attention_backend` arguments for `Context Parallelism`.

        kwargs (`dict`, *optional*, defaults to {})
            Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/caching/cache_contexts/cache_context.py
            for more details.

    Examples:
    ```py
    >>> import cache_dit
    >>> from diffusers import DiffusionPipeline
    >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
    >>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
    >>> output = pipe(...) # Just call the pipe as normal.
    >>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
    >>> cache_dit.disable_cache(pipe) # Disable cache and run original pipe.
    Nz1cache_config is None, using default DBCacheConfigzParallelism is enabled and cache_config is None. Please manually set cache_config to avoid potential compatibility issues. Otherwise, cache will not be enabled.
cache_typeFn_compute_blocksBn_compute_blocksmax_warmup_stepsmax_cached_stepsmax_continuous_cached_stepsresidual_diff_thresholdenable_separate_cfgcfg_compute_firstcfg_diff_compute_separate)	r   r   r    r!   r"   r#   r$   r%   r&   c                 S   s   i | ]\}}|d ur||qS N ).0kvr(   r(   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/cache_dit/caching/cache_interface.py
<dictcomp>   s    z enable_cache.<locals>.<dictcomp>zManually settup DBCache context without BasicCacheConfig is deprecated and will be removed in the future, please use `cache_config` parameter instead!r   enable_taylorseerenable_encoder_taylorseerzManually settup TaylorSeer calibrator without TaylorSeerCalibratorConfig is deprecated and will be removed in the future, please use `calibrator_config` parameter instead!r	   )TaylorSeerCalibratorConfigtaylorseer_cache_typeresidualtaylorseer_order)enable_calibratorenable_encoder_calibratorcalibrator_cache_typer3   r   r   ztype: zg is not valid, Please pass DiffusionPipeline or BlockAdapterfor the 1's position param: pipe_or_adapterz.cache_config is None, skip enabling cache for .z7parallelism_config should be of type ParallelismConfig.has_controlnetextra_parallel_modules)skip_post_inittransformerz_The given DiffusionPipeline does not have a 'transformer' attribute, cannot enable parallelism.F)uniquer   zJNo transformer is detected in the BlockAdapter, skip enabling parallelism.zhMultiple transformers are detected in the BlockAdapter, all transfomers will be enabled for parallelism.r(   )'loggerinfor   warninggetr
   NONEitemsupdater   cache_contexts.calibratorsr0   
isinstancer   r   torchnnModuler   r   apply
ValueErrortype	__class____name__r   parallel_kwargs_has_controlnet_parse_extra_parallel_modulesr   get_adapterhasattrr;   	normalizeflattenis_normalizedlen	enumerater   )r   r   r   r   r   kwargscontext_kwargsr   deprecated_kwargsr0   extra_parallel_moduletransformersadapterir;   r(   r(   r,   enable_cache   s    0
















r_   c                 C   s6   t | tr	| j}n| }t|drt|ddurdS dS )z+Check if the given pipeline has ControlNet.
controlnetNTF)rE   r   piperR   getattr)r   ra   r(   r(   r,   rO   m  s   
rO   ra   c                 C   sh   | j j}t| dr|ds|dst| ddfS t| dr&t| ddfS t| dr2t| ddfS dS )Ntext_encoder_2Hunyuan	Kandinskytext_encoder_3text_encoder)NN)rL   rM   rR   
startswithrb   )ra   pipe_cls_namer(   r(   r,   _parse_text_encoderx  s   

rj   r[   c                 C   s   t | tr	| j}n| }|sg S g }|D ]@}t |tjjr"|| qt||rJ|dkrAt|\}}|d ur;|| qt	
d q|t|| qt	
d| d q|S )Nrg   zAText encoder not found in the pipeline for extra parallel module.zExtra parallel module name z not found in the pipeline.)rE   r   ra   rF   rG   rH   appendrR   rj   r=   r?   rb   )r   r[   ra   parsed_extra_parallel_modulesmodule_or_namerg   _r(   r(   r,   rP     s.   



rP   r;   c                 K   s   |r>d|vr'ddl m} ||dd\}}t||d< |dur&t||d< nh d}t| | }|r>td	| d
 tj	| fi | dS )a  Refresh cache context for the given transformer. This is useful when
    the users run into transformer-only case with dynamic num_inference_steps.
    For example, when num_inference_steps changes significantly between different
    requests, the cache context should be refreshed to avoid potential
    precision degradation. Usage:
    ```py
    >>> import cache_dit
    >>> from cache_dit import DBCacheConfig
    >>> from diffusers import DiffusionPipeline
    >>> # Init cache context with num_inference_steps=None (default)
    >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")
    >>> pipe = cache_dit.enable_cache(pipe.transformer, cache_config=DBCacheConfig(...))
    >>> # Assume num_inference_steps is 28, and we want to refresh the context
    >>> cache_dit.refresh_context(transformer, num_inference_steps=28, verbose=True)
    >>> output = pipe(...) # Just call the pipe as normal.
    >>> stats = cache_dit.summary(pipe.transformer) # Then, get the summary
    >>> # Update the cache context with new num_inference_steps=50.
    >>> cache_dit.refresh_context(pipe.transformer, num_inference_steps=50, verbose=True)
    >>> output = pipe(...) # Just call the pipe as normal.
    >>> stats = cache_dit.summary(pipe.transformer) # Then, get the summary
    >>> # Update the cache context with new cache_config.
    >>> cache_dit.refresh_context(
        pipe.transformer,
        cache_config=DBCacheConfig(
            residual_diff_threshold=0.1,
            max_warmup_steps=10,
            max_cached_steps=20,
            max_continuous_cached_steps=4,
            num_inference_steps=50,
        ),
        verbose=True,
    )
    >>> output = pipe(...) # Just call the pipe as normal.
    >>> stats = cache_dit.summary(pipe.transformer) # Then, get the summary
    ```
    r   r	   )load_cache_configT)resetNr   >   verboser   r   zIforce_refresh_kwargs contains cache_config, please put the extra kwargs: zF into cache_config directly. Ohtherwise, these kwargs will be ignored.)
utilsro   copydeepcopysetkeysr=   r?   r   maybe_refresh_context)r;   force_refresh_kwargsro   r   r   allowed_keysnot_allowed_keysr(   r(   r,   refresh_context  s0   (

r{   c                 C   s(   | j j}t|  td| d d S )Nz$Acceleration hooks is disabled for: r7   )rL   rM   r   maybe_release_hooksr=   r?   )r   cls_namer(   r(   r,   disable_cache  s   
r~   c                  K   s   t jdi | S )Nr(   )r   supported_pipelines)rX   r(   r(   r,   r      s   r   c                 C   s
   t | S r'   )r   rQ   )ra   r(   r(   r,   rQ     s   
rQ   compute_bins
cache_binstotal_stepsc                 C   s   g }d}|   } |  }|   |  |d ur't| t| |ks&J dnt| t| }||k r`| rE|  }|dg|  ||7 }|rW| }|dg|  ||7 }||kr\n||k s3|d | S )Nr   zDThe sum of compute and cache intervals must be at least total_steps.r	   )rs   reversesumpopextend)r   r   r   maskstepcicair(   r(   r,   _steps_mask  s0   r   mediummask_policyc              
      s  | dur|durt | ||dS |dusJ dg dg dgg dg dgg dg d	gg d
g dgd}dttt  dtfdd dttt  dtdttt  f fdddttttt  f dtdttttt  f ffdd}|dkr| D ]}tt|d t|d }t	|d } t	|d } ||k rt
|D ]@}|d |  tt| | |d d  d7  <  ||kr n |d |  tt|| |d d  d7  <  ||kr nq ||krn|d d  d7  <  ||krn ||k sq~|||}n|dk r|dkr|||}n|dk r|dkr|dkrng dg dgg d g d!gg d"g d#gg d$g d%gd}|d&krmg d'dggg d(ddggg d)dd*ggg d+d*d*ggd}ng d(dggg d)ddggg d+dd*ggg d,d*d*ggd}| D ]}|||}qn1|dk r|d-v sJ d.| d/d*dgdgg}	d0dgd*gg}
|d1kr|	}n|
}||||d}||vrtd2| d3t|  d/|| \} }t | ||d}d|d< |S )4a  
    Define a step computation mask based on compute and cache bins.

    Args:
        compute_bins (`List[int]`, *optional*, defaults to None):
            A list specifying the number of consecutive steps to compute.
            For example, [4, 2] means compute 4 steps, then 2 steps.
        cache_bins (`List[int]`, *optional*, defaults to None):
            A list specifying the number of consecutive steps to cache.
            For example, [2, 4] means cache 2 steps, then 4 steps.
        total_steps (`int`, *optional*, defaults to None):
            Total number of steps for which the mask is generated.
            If provided, the sum of compute_bins and cache_bins must be at
            least total_steps.
        mask_policy (`str`, *optional*, defaults to "medium"):
            Predefined mask policy. Options are "slow", "medium", "fast", "ultra".
            For examples, if total_steps=28, each policy corresponds to specific
            compute and cache bin configurations:
                - "slow": compute_bins=[8, 3, 3, 2, 1, 1], cache_bins=1, 2, 2, 2, 3]
                - "medium": compute_bins=[6, 2, 2, 2, 2, 1], cache_bins=[1, 3, 3, 3, 3]
                - "fast": compute_bins=[6, 1, 1, 1, 1], cache_bins=[1, 3, 4, 5, 4]
                - "ultra": compute_bins=[4, 1, 1, 1, 1], cache_bins=[2, 5, 6, 7]
    Returns:
        `List[int]`: A list representing the step computation mask, where 1
        indicates a compute step and 0 indicates a cache step.
    N)r   r   r   z?total_steps must be provided when using predefined mask_policy.)      r   r   r	   r	   )r	   r   r   r   r   )   r   r   r   r   r	   )r	   r   r   r   r   )r   r	   r	   r	   r	   r	   )r	   r         r   )r   r	   r	   r	   r	   )r   r   r      )slowr   fastultrapolicyr   c                 S   s   t | d t | d  S )Nr   r	   )r   )r   r(   r(   r,   _sum_policyr  s   zsteps_mask.<locals>._sum_policytarget_stepsc                    s   | \}} | |krN|r|d  d8  < |d dkr|    | |kr)	 ||gS |r=|d  d8  < |d dkr=|    | |krH	 ||gS  | |ks
||gS )Nr	   r   )r   )r   r   r   r   )r   r(   r,   _truncate_policyu  s$   z$steps_mask.<locals>._truncate_policypoliciesc                    s(   i }|   D ]\}} ||||< q|S r'   )rB   )r   r   truncated_policiesnamer   )r   r(   r,   _truncate_predefined_policies  s   z1steps_mask.<locals>._truncate_predefined_policies   r   r	   g      ?r      r   )r   r   r   r   r	   )r	   r	   r	   r	   )r   r   r	   r	   r	   )r	   r	   r   r   )r   r	   r	   r	   r	   )r	   r   r   r   )r   r	   r	   r	   r	   )r	   r   r   r   	   )r   r   r	   )r   r	   r	   )r   r	   r	   r   )r   r	   r	   )r   r	   r	   )r   r   zaOnly total_steps=4 or 6 is supported for predefined masks while total_steps < 8. Got total_steps=r7   r   r   zmask_policy z is not valid. Choose from )r   r   intdictstrvaluesminrV   rs   rt   rangemaxrJ   listrv   )r   r   r   r   predefined_policiesr   r   min_bins_lenr^   constant_plicy_4_stepsconstant_plicy_6_stepsconstant_plicycompute_maskr(   )r   r   r,   
steps_mask1  s  !
*
	..





r   )NNNNr'   )NNNr   )0rs   rF   typingr   r   r   r   r   	diffusersr   r   cache_typesr
   block_adaptersr   r   cache_adaptersr   cache_contextsr   r   r   r   params_modifierr   parallelismr   r   cache_dit.loggerr   rM   r=   rG   rH   r_   boolrO   r   rj   rP   r{   r~   r   r   rQ   r   r   r   r(   r(   r(   r,   <module>   s    


  Y

%
D



	
&

