o
    پi>1                     @   sp   d Z ddlmZ ddlmZmZ ddlZddlZddl	m
Z
 er&ddlmZ eG dd dZG d	d
 d
ZdS )a  
TeaCache: Temporal similarity-based caching for diffusion models.

TeaCache accelerates diffusion inference by selectively skipping redundant
computation when consecutive diffusion steps are similar enough. This is
achieved by tracking the L1 distance between modulated inputs across timesteps.

Key concepts:
- Modulated input: The input to transformer blocks after timestep conditioning
- L1 distance: Measures how different consecutive timesteps are
- Threshold: When accumulated L1 distance exceeds threshold, force computation
- CFG support: Separate caches for positive and negative branches

References:
- TeaCache: Accelerating Diffusion Models with Temporal Similarity
  https://arxiv.org/abs/2411.14324
    )	dataclass)TYPE_CHECKINGAnyN)	DiTConfig)TeaCacheParamsc                   @   sN   e Zd ZU dZeed< eed< eed< eed< eed< ee ed< ded	< d
S )TeaCacheContexta  Common context extracted for TeaCache skip decision.

    This context is populated from the forward_batch and forward_context
    during each denoising step, providing all information needed to make
    cache decisions.

    Attributes:
        current_timestep: Current denoising timestep index (0-indexed).
        num_inference_steps: Total number of inference steps.
        do_cfg: Whether classifier-free guidance is enabled.
        is_cfg_negative: True if currently processing negative CFG branch.
        teacache_thresh: Threshold for accumulated L1 distance.
        coefficients: Polynomial coefficients for L1 rescaling.
        teacache_params: Full TeaCacheParams for model-specific access.
    current_timestepnum_inference_stepsdo_cfgis_cfg_negativeteacache_threshcoefficientsr   teacache_paramsN)	__name__
__module____qualname____doc__int__annotations__boolfloatlist r   r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/cache/teacache.pyr       s   
 r   c                
   @   s   e Zd ZU dZh dZee ed< eed< dddZ	dd	d
Z
dejdee dedeeef fddZdejdedee dedef
ddZdedB fddZdejdejddfddZdeeef defddZdejdejfddZdS )TeaCacheMixinar  
    Mixin class providing TeaCache optimization functionality.

    TeaCache accelerates diffusion inference by selectively skipping redundant
    computation when consecutive diffusion steps are similar enough.

    This mixin should be inherited by DiT model classes that want to support
    TeaCache optimization. It provides:
    - State management for tracking L1 distances
    - CFG-aware caching (separate caches for positive/negative branches)
    - Decision logic for when to compute vs. use cache

    Example usage in a DiT model:
        class MyDiT(TeaCacheMixin, BaseDiT):
            def __init__(self, config, **kwargs):
                super().__init__(config, **kwargs)
                self._init_teacache_state()

            def forward(self, hidden_states, timestep, ...):
                ctx = self._get_teacache_context()
                if ctx is not None:
                    # Compute modulated input (model-specific, e.g., after timestep embedding)
                    modulated_input = self._compute_modulated_input(hidden_states, timestep)
                    is_boundary = (ctx.current_timestep == 0 or
                                   ctx.current_timestep >= ctx.num_inference_steps - 1)

                    should_calc = self._compute_teacache_decision(
                        modulated_inp=modulated_input,
                        is_boundary_step=is_boundary,
                        coefficients=ctx.coefficients,
                        teacache_thresh=ctx.teacache_thresh,
                    )

                    if not should_calc:
                        # Use cached residual (must implement retrieve_cached_states)
                        return self.retrieve_cached_states(hidden_states)

                # Normal forward pass...
                output = self._transformer_forward(hidden_states, timestep, ...)

                # Cache states for next step
                if ctx is not None:
                    self.maybe_cache_states(output, hidden_states)

                return output

    Subclass implementation notes:
        - `_compute_modulated_input()`: Model-specific method to compute the input
          after timestep conditioning (used for L1 distance calculation)
        - `retrieve_cached_states()`: Must be overridden to return cached output
        - `maybe_cache_states()`: Override to store states for cache retrieval

    Attributes:
        cnt: Counter for tracking steps.
        enable_teacache: Whether TeaCache is enabled.
        previous_modulated_input: Cached modulated input for positive branch.
        previous_residual: Cached residual for positive branch.
        accumulated_rel_l1_distance: Accumulated L1 distance for positive branch.
        is_cfg_negative: Whether currently processing negative CFG branch.
        _supports_cfg_cache: Whether this model supports CFG cache separation.

    CFG-specific attributes (only when _supports_cfg_cache is True):
        previous_modulated_input_negative: Cached input for negative branch.
        previous_residual_negative: Cached residual for negative branch.
        accumulated_rel_l1_distance_negative: L1 distance for negative branch.
    >   wanzimagehunyuan_CFG_SUPPORTED_PREFIXESconfigreturnNc                 C   sX   d| _ d| _| jj | jv | _d| _d| _d| _	d| _
| jr*d| _d| _d| _dS dS )z:Initialize TeaCache state. Call this in subclass __init__.r   TN        F)cntenable_teacacher   prefixlowerr   _supports_cfg_cacheprevious_modulated_inputprevious_residualaccumulated_rel_l1_distancer   !previous_modulated_input_negativeprevious_residual_negative$accumulated_rel_l1_distance_negativeselfr   r   r   _init_teacache_state   s   
z"TeaCacheMixin._init_teacache_statec                 C   sD   d| _ d| _d| _d| _d| _d| _| jr d| _d| _d| _	dS dS )z>Reset all TeaCache state at the start of each generation task.r   Nr!   FT)
r"   r'   r(   r)   r   r#   r&   r*   r+   r,   r-   r   r   r   reset_teacache_state   s   
z"TeaCacheMixin.reset_teacache_statemodulated_inpr   r   c           	      C   s   | j r| jn| j}|du rdS || }|  |     }t|}| j r.| j	n| j
}||| }||kr=dS |dfS )ap  
        Compute L1 distance and decide whether to calculate or use cache.

        Args:
            modulated_inp: Current timestep's modulated input.
            coefficients: Polynomial coefficients for L1 rescaling.
            teacache_thresh: Threshold for cache decision.

        Returns:
            Tuple of (new_accumulated_distance, should_calc).
        Nr!   TF)r   r*   r'   absmeancpuitemnppoly1dr,   r)   )	r.   r1   r   r   prev_modulated_inpdiffrel_l1rescale_funcr)   r   r   r   _compute_l1_and_decide   s"    
z$TeaCacheMixin._compute_l1_and_decideis_boundary_stepc                 C   s`   | j sdS |rd\}}n
| j|||d\}}| js#| | _|| _|S | jr.| | _|| _|S )a  
        Compute cache decision for TeaCache.

        Args:
            modulated_inp: Current timestep's modulated input.
            is_boundary_step: True for boundary timesteps that always compute.
            coefficients: Polynomial coefficients for L1 rescaling.
            teacache_thresh: Threshold for cache decision.

        Returns:
            True if forward computation is needed, False to use cache.
        Tr2   )r1   r   r   )	r#   r=   r   cloner'   r)   r&   r*   r,   )r.   r1   r>   r   r   	new_accumshould_calcr   r   r   _compute_teacache_decision   s"   



z(TeaCacheMixin._compute_teacache_decisionc           	   	   C   s   ddl m} | }|j}|du s|jr|jdu rdS |j}|j}|j}|j}|j}|dkr4| js4| 	  t
|||||j|j|dS )z
        Check TeaCache preconditions and extract common context.

        Returns:
            TeaCacheContext if TeaCache is enabled and properly configured,
            None if should skip TeaCache logic entirely.
        r   )get_forward_contextN)r   r	   r
   r   r   r   r   )6sglang.multimodal_gen.runtime.managers.forward_contextrC   forward_batchr#   r   r   r	   do_classifier_free_guidancer   r0   r   r   r   )	r.   rC   forward_contextrE   r   r   r	   r
   r   r   r   r   _get_teacache_context  s0   
z#TeaCacheMixin._get_teacache_contexthidden_statesoriginal_hidden_statesc                 C      dS )zACache states for later retrieval. Override in subclass if needed.Nr   )r.   rI   rJ   r   r   r   maybe_cache_states0  s   z TeaCacheMixin.maybe_cache_stateskwargsc                 K   rK   )z4Check if forward can be skipped using cached states.Fr   )r.   rM   r   r   r   %should_skip_forward_for_cached_states6  s   z3TeaCacheMixin.should_skip_forward_for_cached_statesc                 C   s   t d)z8Retrieve cached states. Must be implemented by subclass.z)retrieve_cached_states is not implemented)NotImplementedError)r.   rI   r   r   r   retrieve_cached_states:  s   z$TeaCacheMixin.retrieve_cached_states)r    N)r   r   r   r   r   setstrr   r   r/   r0   torchTensorr   r   tupler   r=   rB   r   rH   rL   dictr   rN   rP   r   r   r   r   r   ;   sH   
 E



/
)-
r   )r   dataclassesr   typingr   r   numpyr7   rS   $sglang.multimodal_gen.configs.modelsr   -sglang.multimodal_gen.configs.sample.teacacher   r   r   r   r   r   r   <module>   s   