o
    GiK7                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZmZ eeZdZdZd	Zee Zd
ZdZeG dd dZG dd dZG dd deZdedeee ee f fddZdejjdefddZdejdedefddZ dS )    N)	dataclass   )logging   )HookRegistry	ModelHookStateManagertaylorseer_cache)z^blocks.*attnz^transformer_blocks.*attnz ^single_transformer_blocks.*attn)z"^temporal_transformer_blocks.*attn)z^[^.]*block[^.]*\.[^.]+$)z
^proj_out$c                   @   s   e Zd ZU dZdZeed< dZeed< dZedB ed< dZ	eed	< e
jZe
jdB ed
< dZee dB ed< dZee dB ed< dZeed< defddZdS )TaylorSeerCacheConfiga  
    Configuration for TaylorSeer cache. See: https://huggingface.co/papers/2503.06923

    Attributes:
        cache_interval (`int`, defaults to `5`):
            The interval between full computation steps. After a full computation, the cached (predicted) outputs are
            reused for this many subsequent denoising steps before refreshing with a new full forward pass.

        disable_cache_before_step (`int`, defaults to `3`):
            The denoising step index before which caching is disabled, meaning full computation is performed for the
            initial steps (0 to disable_cache_before_step - 1) to gather data for Taylor series approximations. During
            these steps, Taylor factors are updated, but caching/predictions are not applied. Caching begins at this
            step.

        disable_cache_after_step (`int`, *optional*, defaults to `None`):
            The denoising step index after which caching is disabled. If set, for steps >= this value, all modules run
            full computations without predictions or state updates, ensuring accuracy in later stages if needed.

        max_order (`int`, defaults to `1`):
            The highest order in the Taylor series expansion for approximating module outputs. Higher orders provide
            better approximations but increase computation and memory usage.

        taylor_factors_dtype (`torch.dtype`, defaults to `torch.bfloat16`):
            Data type used for storing and computing Taylor series factors. Lower precision reduces memory but may
            affect stability; higher precision improves accuracy at the cost of more memory.

        skip_predict_identifiers (`list[str]`, *optional*, defaults to `None`):
            Regex patterns (using `re.fullmatch`) for module names to place as "skip" in "cache" mode. In this mode,
            the module computes fully during initial or refresh steps but returns a zero tensor (matching recorded
            shape) during prediction steps to skip computation cheaply.

        cache_identifiers (`list[str]`, *optional*, defaults to `None`):
            Regex patterns (using `re.fullmatch`) for module names to place in Taylor-series caching mode, where
            outputs are approximated and cached for reuse.

        use_lite_mode (`bool`, *optional*, defaults to `False`):
            Enables a lightweight TaylorSeer variant that minimizes memory usage by applying predefined patterns for
            skipping and caching (e.g., skipping blocks and caching projections). This overrides any custom
            `inactive_identifiers` or `active_identifiers`.

    Notes:
        - Patterns are matched using `re.fullmatch` on the module name.
        - If `skip_predict_identifiers` or `cache_identifiers` are provided, only matching modules are hooked.
        - If neither is provided, all attention-like modules are hooked by default.

    Example of inactive and active usage:

    ```py
    def forward(x):
        x = self.module1(x)  # inactive module: returns zeros tensor based on shape recorded during full compute
        x = self.module2(x)  # active module: caches output here, avoiding recomputation of prior steps
        return x
    ```
       cache_interval   disable_cache_before_stepNdisable_cache_after_stepr   	max_ordertaylor_factors_dtypeskip_predict_identifierscache_identifiersFuse_lite_modereturnc                 C   sF   d| j  d| j d| j d| j d| j d| j d| j d| j d	S )
Nz%TaylorSeerCacheConfig(cache_interval=z, disable_cache_before_step=z, disable_cache_after_step=z, max_order=z, taylor_factors_dtype=z, skip_predict_identifiers=z, cache_identifiers=z, use_lite_mode=))r   r   r   r   r   r   r   r   self r   T/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/hooks/taylorseer_cache.py__repr__[   s$   zTaylorSeerCacheConfig.__repr__)__name__
__module____qualname____doc__r   int__annotations__r   r   r   torchbfloat16r   dtyper   liststrr   r   boolr   r   r   r   r   r
      s   
 7r
   c                   @   st   e Zd ZejddfdejdB dedefddZdd
dZ	de
ejdf d	dfddZejjd	eej fddZdS )TaylorSeerStater   Fr   Nr   is_inactivec                 C   s:   || _ || _|| _d| _d | _i | _d | _d | _d| _d S )Nr   )	r   r   r)   module_dtypeslast_update_steptaylor_factorsinactive_shapesdevicecurrent_step)r   r   r   r)   r   r   r   __init__j   s   
zTaylorSeerState.__init__r   c                 C   s"   d| _ d | _i | _d | _d | _d S )Nr*   )r0   r,   r-   r.   r/   r   r   r   r   reset{   s
   
zTaylorSeerState.resetoutputs.c           
         s   t dd |D  _|d j _ jrt dd |D  _nZt|D ]U\}}d|i} jd u }|si j j }|dkr?td j	
|i }t jD ]}|
|}	|	d u rX n|| |	|j | ||d < qK fdd| D  j	|< q" j _d S )	Nc                 s       | ]}|j V  qd S Nr$   .0outputr   r   r   	<genexpr>       z)TaylorSeerState.update.<locals>.<genexpr>r   c                 s   r4   r5   )shaper7   r   r   r   r:      r;   z0Delta step cannot be zero for TaylorSeer update.r   c                    s   i | ]\}}||  jqS r   )tor   )r8   orderfactorr   r   r   
<dictcomp>   s    z*TaylorSeerState.update.<locals>.<dictcomp>)tupler+   r/   r)   r.   	enumerater,   r0   
ValueErrorr-   getranger   r=   r$   items)
r   r3   ifeaturesnew_factorsis_first_update
delta_stepprev_factorsjprevr   r   r   update   s*   

"
zTaylorSeerState.updatec                 C   s  | j d u r	td| j| j  }g }| jr;| jd u rtdtt| jD ]}|t	j
| j| | j| | jd q$|S | jsBtdt| j}t| jd }t|D ]7}| j| }| j| }t	j|d |d}t|D ]}	||	 t|	 }
||	 }||||
  }qk|| qR|S )Nz3Cannot predict without prior initialization/update.z*Inactive shapes not set during prediction.)r$   r/   z'Taylor factors empty during prediction.r   r6   )r,   rC   r0   r)   r.   rE   lenr+   appendr"   zerosr/   r-   
zeros_likemath	factorialr=   )r   step_offsetr3   rG   num_outputs
num_ordersoutput_dtyper-   r9   r>   coeffr?   r   r   r   predict   s<   




zTaylorSeerState.predict)r   N)r   r   r   r"   r#   r$   r    r'   r1   r2   rA   TensorrO   compilerdisabler%   r[   r   r   r   r   r(   i   s&    


r(   c                       s   e Zd ZdZ	ddededejdededB f
 fdd	Zd
ej	j
fddZd
ej	j
ddfddZejjdefddZd
ej	j
fddZ  ZS )TaylorSeerCacheHookTNr   r   r   state_managerr   c                    s,   t    || _|| _|| _|| _|| _d S r5   )superr1   r   r   r   r   r`   )r   r   r   r   r`   r   	__class__r   r   r1      s   

zTaylorSeerCacheHook.__init__modulec                 C   s   |S r5   r   r   rd   r   r   r   initialize_hook   s   z#TaylorSeerCacheHook.initialize_hookr   c                 C   s   | j   dS )z4
        Reset state between sampling runs.
        N)r`   r2   re   r   r   r   reset_state   s   zTaylorSeerCacheHook.reset_statec                 C   sh   | j  }| jd7  _|j}|| jk }|| j d | j dk}| jd uo)|| jk}|p/|p/|}||fS Nr   r   )r`   	get_stater0   r   r   r   )r   stater0   is_warmup_phaseis_compute_intervalis_cooldown_phaseshould_computer   r   r   _measure_should_compute   s   

z+TaylorSeerCacheHook._measure_should_computec           	      O   sj   |   \}}|r#| jj|i |}t|tjr|fn|}|| |S | }t|dkr1|d S t	|S rh   )
ro   fn_reforiginal_forward
isinstancer"   r\   rO   r[   rP   rA   )	r   rd   argskwargsrn   rj   r3   wrapped_outputsoutputs_listr   r   r   new_forward   s   
zTaylorSeerCacheHook.new_forwardr5   )r   r   r   _is_statefulr    r"   r$   r   r1   nnModulerf   rg   r]   r^   r'   ro   rw   __classcell__r   r   rb   r   r_      s&    
r_   configr   c                 C   s8   | j dur| j nd}| jdur| jnd}|pg |pg fS )zV
    Resolve effective inactive and active pattern lists from config + templates.
    N)r   r   )r|   inactive_patternsactive_patternsr   r   r   _resolve_patterns   s   r   rd   c                    s   t |\}}|p	t}|jr!td t}t}|js|jr!t	d | 
 D ]&\ }t fdd|D }t fdd|D }|sD|sDq%t|||d q%dS )a  
    Applies the TaylorSeer cache to a given pipeline (typically the transformer / UNet).

    This function hooks selected modules in the model to enable caching or skipping based on the provided
    configuration, reducing redundant computations in diffusion denoising loops.

    Args:
        module (torch.nn.Module): The model subtree to apply the hooks to.
        config (TaylorSeerCacheConfig): Configuration for the cache.

    Example:
    ```python
    >>> import torch
    >>> from diffusers import FluxPipeline, TaylorSeerCacheConfig

    >>> pipe = FluxPipeline.from_pretrained(
    ...     "black-forest-labs/FLUX.1-dev",
    ...     torch_dtype=torch.bfloat16,
    ... )
    >>> pipe.to("cuda")

    >>> config = TaylorSeerCacheConfig(
    ...     cache_interval=5,
    ...     max_order=1,
    ...     disable_cache_before_step=3,
    ...     taylor_factors_dtype=torch.float32,
    ... )
    >>> pipe.transformer.enable_cache(config)
    ```
    z(Using TaylorSeer Lite variant for cache.z"Lite mode overrides user patterns.c                 3       | ]	}t | V  qd S r5   re	fullmatchr8   patternnamer   r   r:   -      z)apply_taylorseer_cache.<locals>.<genexpr>c                 3   r   r5   r   r   r   r   r   r:   .  r   )rd   r|   r)   N)r   _TRANSFORMER_BLOCK_IDENTIFIERSr   loggerinfo_PROJ_OUT_IDENTIFIERS_BLOCK_IDENTIFIERSr   r   warningnamed_modulesany_apply_taylorseer_cache_hook)rd   r|   r}   r~   	submodulematches_inactivematches_activer   r   r   apply_taylorseer_cache  s&   

r   r)   c                 C   sL   t t|j|j|dd}t| }t|j|j|j|j	|d}|
|t dS )a  
    Registers the TaylorSeer hook on the specified nn.Module.

    Args:
        name: Name of the module.
        module: The nn.Module to be hooked.
        config: Cache configuration.
        is_inactive: Whether this module should operate in "inactive" mode.
    )r   r   r)   )init_kwargs)r   r   r   r   r`   N)r   r(   r   r   r   check_if_exists_or_initializer_   r   r   r   register_hook_TAYLORSEER_CACHE_HOOK)rd   r|   r)   r`   registryhookr   r   r   r   8  s    
	r   )!rT   r   dataclassesr   r"   torch.nnry   utilsr   hooksr   r   r   
get_loggerr   r   r   $_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS%_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERSr   r   r   r
   r(   r_   rA   r%   r&   r   rz   r   r'   r   r   r   r   r   <module>   s6    
O\"26