o
    ٷii                     @   s  U d Z ddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ eG dd dZ			d5d	e	jd
ejdejdejdejeB eB dejdejdejdB dejdB deeef dB dedefddZd	e	jdejdejeB eB dejdejdejdejdejdejdejdejd ed!ejdedefd"d#Z	$	%d6d	e	jd&eej d'ejd(eej d)ed*ededefd+d,Zeeed-Zeeef ed.< d/ed0eddfd1d2Zd/edefd3d4ZdS )7a  
Model-specific extractors for TeaCache.

This module provides a registry of extractor functions that know how to extract
modulated inputs from different transformer architectures. Adding support for
a new model requires only adding a new extractor function to the registry.

With Option B enhancement, extractors now return a CacheContext object containing
all model-specific information needed for generic caching, including preprocessing,
transformer execution, and postprocessing logic.
    )Callable)	dataclass)AnyN)get_forward_contextc                   @   s   e Zd ZU dZejed< ejed< ejdB ed< ejed< eg eejdf f ed< eejge	f ed	< dZ
eee	f dB ed
< dddZdS )CacheContextaU
  
    Context object containing all model-specific information for caching.

    This allows the TeaCacheHook to remain completely generic - all model-specific
    logic is encapsulated in the extractor that returns this context.

    Attributes:
        modulated_input: Tensor used for cache decision (similarity comparison).
            Must be a torch.Tensor extracted from the first transformer block,
            typically after applying normalization and modulation.

        hidden_states: Current hidden states (will be modified by caching).
            Must be a torch.Tensor representing the main image/latent states
            after preprocessing but before transformer blocks.

        encoder_hidden_states: Optional encoder states (for dual-stream models).
            Set to None for single-stream models (e.g., Flux).
            For dual-stream models (e.g., Qwen), contains text encoder outputs.

        temb: Timestep embedding tensor.
            Must be a torch.Tensor containing the timestep conditioning.

        run_transformer_blocks: Callable that executes model-specific transformer blocks.
            Signature: () -> tuple[torch.Tensor, ...]

            Returns:
                tuple containing:
                - [0]: processed hidden_states (required)
                - [1]: processed encoder_hidden_states (optional, only for dual-stream)

            Example for single-stream:
                def run_blocks():
                    h = hidden_states
                    for block in module.transformer_blocks:
                        h = block(h, temb=temb)
                    return (h,)

            Example for dual-stream:
                def run_blocks():
                    h, e = hidden_states, encoder_hidden_states
                    for block in module.transformer_blocks:
                        e, h = block(h, e, temb=temb)
                    return (h, e)

        postprocess: Callable that does model-specific output postprocessing.
            Signature: (torch.Tensor) -> Union[torch.Tensor, Transformer2DModelOutput, tuple]

            Takes the processed hidden_states and applies final transformations
            (normalization, projection) to produce the model output.

            Example:
                def postprocess(h):
                    h = module.norm_out(h, temb)
                    output = module.proj_out(h)
                    return Transformer2DModelOutput(sample=output)

        extra_states: Optional dict for additional model-specific state.
            Use this for models that need to pass additional context beyond
            the standard fields.
    modulated_inputhidden_statesNencoder_hidden_statestemb.run_transformer_blockspostprocessextra_statesreturnc                 C   s:  t | jtjstdt| j t | jtjs"tdt| j | jdur8t | jtjs8tdt| j t | jtjsItdt| j t	| j
sXtdt| j
 t	| jsgtdt| j | jjd | jjd krtd	| jjd  d
| jjd  | jj| jjkrtd| jj d| jj dS )a  
        Validate that the CacheContext contains valid data.

        Raises:
            TypeError: If fields have wrong types
            ValueError: If tensors have invalid properties
            RuntimeError: If callables fail basic invocation tests

        This method should be called after creating a CacheContext to catch
        common developer errors early with clear error messages.
        z*modulated_input must be torch.Tensor, got z(hidden_states must be torch.Tensor, got Nz8encoder_hidden_states must be torch.Tensor or None, got ztemb must be torch.Tensor, got z-run_transformer_blocks must be callable, got z"postprocess must be callable, got r   z4Batch size mismatch: modulated_input has batch size z, but hidden_states has z$Device mismatch: modulated_input on z, hidden_states on )
isinstancer   torchTensor	TypeErrortyper   r	   r
   callabler   r   shape
ValueErrordevice)self r   a/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/diffusion/cache/teacache/extractors.pyvalidatea   s:   



zCacheContext.validate)r   N)__name__
__module____qualname____doc__r   r   __annotations__r   tupler   r   dictstrr   r   r   r   r   r      s   
 
=

r   moduler   r	   encoder_hidden_states_masktimestep
img_shapestxt_seq_lensguidanceadditional_t_condattention_kwargskwargsr   c
                    s,  ddl m  tdrtjdkrtd|jjj	d}
|dur:|j	d }|du rE||n|||j||jdjd }|}|jd	d
d\}}||\}}fdd}|
dd fdd}t|||dS )a  
    Extract cache context for QwenImageTransformer2DModel.

    This is the ONLY Qwen-specific code needed for TeaCache support.
    It encapsulates preprocessing, modulated input extraction, transformer execution,
    and postprocessing logic.

    Args:
        module: QwenImageTransformer2DModel instance
        hidden_states: Input hidden states tensor
        encoder_hidden_states: Text encoder outputs
        encoder_hidden_states_mask: Mask for text encoder
        timestep: Current diffusion timestep
        img_shapes: Image shapes for position embedding
        txt_seq_lens: Text sequence lengths
        guidance: Optional guidance scale for CFG
        additional_t_cond: Optional additional timestep conditioning
        attention_kwargs: Additional attention arguments
        **kwargs: Additional keyword arguments ignored by this extractor

    Returns:
        CacheContext with all information needed for generic caching
    r   )Transformer2DModelOutputtransformer_blocksz#Module must have transformer_blocks)r   dtypeNi  r      dimc               
      s   } }}d}j durAj jdkrAt }|jdurA|jdkrAjd }|j|j }tj||tjj	d}d|dd|jdf< |durK|
 rKd}|durU|
 rUd}jD ]}|| || |d\}} qX| |fS )z$Execute all Qwen transformer blocks.N   r   r/   r   F)r   r	   r%   r
   image_rotary_embjoint_attention_kwargshidden_states_mask)parallel_configsequence_parallel_sizer   sp_original_seq_lensp_padding_sizer   r   onesboolr   allr.   )heencoder_maskr9   ctx
batch_sizepadded_seq_lenblock)r+   r	   r%   r   r7   r$   r
   r   r   r      s>   

	z4extract_qwen_context.<locals>.run_transformer_blocksreturn_dictTc                    s*    | } | }s|fS  |dS )z*Apply Qwen-specific output postprocessing.)sample)norm_outproj_outrA   output)r-   r$   rH   r
   r   r   r     s
   

z)extract_qwen_context.<locals>.postprocessr   r   r	   r
   r   r   )!diffusers.models.modeling_outputsr-   hasattrlenr.   r   img_intor   r/   txt_normtxt_intime_text_embed	pos_embedimg_modchunk	img_norm1getr   )r$   r   r	   r%   r&   r'   r(   r)   r*   r+   r,   rG   img_mod_paramsimg_mod1_img_modulatedr   r   r   )	r-   r+   r	   r%   r   r7   r$   rH   r
   r   extract_qwen_context   s:   $




)r`   x_tpacked_vae_token_indexespacked_vae_position_idspacked_text_idspacked_text_indexespacked_indexespacked_position_idspacked_seqlenskey_values_lenspast_key_valuespacked_key_value_indexesc              
      s   j j|}|tjf|< t|tjs$tj	|g|j
d}| dkr/|d}|}|}|| | }|jjkrN|j}|< } 	f
dd}fdd}t|d|||dS )	a  
    Extract cache context for Bagel model.

    Args:
        module: Bagel instance
        x_t: Latent image input
        timestep: Current timestep
        packed_vae_token_indexes: Indexes for VAE tokens in packed sequence
        packed_vae_position_ids: Position IDs for VAE tokens
        packed_text_ids: Text token IDs
        packed_text_indexes: Indexes for text tokens in packed sequence
        packed_indexes: Global indexes
        packed_position_ids: Global position IDs
        packed_seqlens: Sequence lengths
        key_values_lens: KV cache lengths
        past_key_values: KV cache
        packed_key_value_indexes: KV cache indexes
        **kwargs: Additional keyword arguments

    Returns:
        CacheContext with all information needed for generic caching
    r0   r   c                     sD   i } j rdd} jjd	 ddd	| }|jfS )Ngen)moderb   re   F)	packed_query_sequence
query_lenspacked_query_position_idspacked_query_indexesrj   ri   rk   update_past_key_values	is_causalr   )use_moelanguage_modelforwardrn   )extra_inputsrM   
ri   r$   rf   rk   rg   rh   packed_sequencere   rb   rj   r   r   r   Y  s(   
z5extract_bagel_context.<locals>.run_transformer_blocksc                    s     | }| }|S )N)llm2vae)rA   v_t)r$   rb   r   r   r   p  s   
z*extract_bagel_context.<locals>.postprocessNrN   )ru   modelembed_tokens	new_zerossumhidden_sizer   r   r   tensorr   r4   	unsqueezelatent_pos_embedtime_embeddervae2llmr/   rS   r   )r$   ra   r&   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   r,   packed_text_embeddingpacked_pos_embedpacked_timestep_embedsx_t_embr   r   r   r   rx   r   extract_bagel_context  s0   (


r   r1   r5   xt	cap_feats
patch_sizef_patch_sizec           5         s\  ddl m} tdrtjdkrtdt|}|d j}	|j }
|
 	||\}}}}}}dd |D }t
|}tj|dd}j d  |} | j|t|< t|j|dd}tj|dd\}}t|j|dd}t|j|dd}||d	d
d}||d	d
d}||d	d
d}tj||ftj|	d}t|D ]\}}d||d|f< qjD ]
}||||| }qdd |D }t
|}tj|dd} | } j| t|< t| j|dd}!tj|dd\}"}#t|"j|dd}"t|#j|dd}#||!d	d
d}$||"d	d
d}%||#d	d
d}&tj||ftj|	d}'t|D ]\}}d|'|d|f< q3jD ]
}||$|'|%|&}$qDg }(g })g }*t|D ]M}|| }+|| },|(t|| d|+ |$| d|, g |)t|| d|+ |%| d|, g |*t|| d|+ |&| d|, g qYdd t||D }-t
|-}.||(d	d
d||)d	d
d||*d	d
dtj||.ftj|	dt|-D ]\}}d|d|f< qوjd }/|/ djddd}0d|0d  }1|/ |1 }2 fdd}3 fdd}4t!|2d |3|4|ddS )a  
    Extract cache context for ZImageTransformer2DModel.

    This is the ONLY Z-Image-specific code needed for TeaCache support.
    It encapsulates preprocessing, modulated input extraction, transformer execution,
    and postprocessing logic.

    Args:
        module: ZImageTransformer2DModel instance
        x: List of image tensors per batch item
        t: Timestep tensor
        cap_feats: List of caption feature tensors per batch item
        patch_size: Patch size for patchification (default: 2)
        f_patch_size: Frame patch size (default: 1)
        **kwargs: Additional keyword arguments ignored by this extractor

    Returns:
        CacheContext with all information needed for generic caching
    r   )pad_sequencelayersz(Module must have main transformer layersc                 S      g | ]}t |qS r   rQ   .0r^   r   r   r   
<listcomp>      z*extract_zimage_context.<locals>.<listcomp>r3   -Tg        )batch_firstpadding_valuer6   r5   Nc                 S   r   r   r   r   r   r   r   r     r   c                 S   s   g | ]\}}|| qS r   r   )r   abr   r   r   r     s       r1   g      ?c                     s&   } j D ]
}||  } q| fS )z,Execute all Z-Image main transformer blocks.)r   )rA   layer)adaln_inputr$   unifiedunified_attn_maskunified_cosunified_sinr   r   r     s   
z6extract_zimage_context.<locals>.run_transformer_blocksc                    sB   j  d  |  } t| jdd} | }|i fS )z-Apply Z-Image specific output postprocessing.r   r   r3   )all_final_layerlistunbind
unpatchifyrL   )r   r   r$   r   x_sizer   r   r     s   z+extract_zimage_context.<locals>.postprocess)r   r   r   r   x_item_seqlensr   r   )r   r   r	   r
   r   r   r   )"torch.nn.utils.rnnr   rP   rQ   r   r   r   t_scale
t_embedderpatchify_and_embedmaxr   catall_x_embeddertype_asx_pad_tokenr   splitrope_embedderzerosr?   	enumeratenoise_refinercap_embeddercap_pad_tokencontext_refinerrangeappendzipadaLN_modulationr   rY   attention_norm1r   )5r$   r   r   r   r   r   r,   r   bszr   t_scaled	x_patchescap_feats_processed	x_pos_idscap_pos_idsx_inner_pad_maskcap_inner_pad_maskr   x_max_item_seqlen
x_embeddedx_listx_cosx_sin	x_batchedx_cos_batchedx_sin_batchedx_attn_maskiseq_lenr   cap_item_seqlenscap_max_item_seqlencap_embeddedcap_listcap_coscap_sincap_batchedcap_cos_batchedcap_sin_batchedcap_attn_maskunified_listunified_cos_listunified_sin_listx_lencap_lenunified_item_seqlensunified_max_item_seqlenrG   
mod_params	scale_msar   r   r   r   )	r   r   r$   r   r   r   r   r   r   r   extract_zimage_context  s   






,,0


r   )QwenImageTransformer2DModelBagelZImageTransformer2DModelEXTRACTOR_REGISTRYtransformer_cls_nameextractor_fnc                 C   s   |t | < dS )a  
    Register a new extractor function for a model type.

    This allows extending TeaCache support to new models without modifying
    the core TeaCache code.

    Args:
        transformer_cls_name: Transformer model type identifier (class name or type string)
        extractor_fn: Function with signature (module, *args, **kwargs) -> CacheContext

    Example:
        >>> def extract_flux_context(module, hidden_states, timestep, guidance=None, **kwargs):
        ...     # Preprocessing
        ...     temb = module.time_text_embed(timestep, guidance)
        ...     # Extract modulated input
        ...     modulated = module.transformer_blocks[0].norm1(hidden_states, emb=temb)
        ...     # Define execution
        ...     def run_blocks():
        ...         h = hidden_states
        ...         for block in module.transformer_blocks:
        ...             h = block(h, temb=temb)
        ...         return (h,)
        ...     # Define postprocessing
        ...     def postprocess(h):
        ...         return module.proj_out(module.norm_out(h, temb))
        ...     # Return context
        ...     return CacheContext(modulated, hidden_states, None, temb, run_blocks, postprocess)
        >>> register_extractor("FluxTransformer2DModel", extract_flux_context)
    N)r   )r   r   r   r   r   register_extractorI  s   r   c                 C   s2   | t v rt |  S tt  }td|  d| d)aN  
    Get extractor function for given transformer class.

    This function looks up the extractor based on the exact transformer_cls_name string,
    which should match the transformer type in the pipeline (i.e., pipeline.transformer.__class__.__name__).

    Args:
        transformer_cls_name: Transformer class name (e.g., "QwenImageTransformer2DModel")
                                Must exactly match a key in EXTRACTOR_REGISTRY.

    Returns:
        Extractor function with signature (module, *args, **kwargs) -> CacheContext

    Raises:
        ValueError: If model type not found in registry

    Example:
        >>> # Get extractor for QwenImageTransformer2DModel
        >>> extractor = get_extractor("QwenImageTransformer2DModel")
        >>> ctx = extractor(transformer, hidden_states, encoder_hidden_states, timestep, ...)
    zUnknown model type: 'z'. Available types: zW
To add support for a new model, use register_extractor() or add to EXTRACTOR_REGISTRY.)r   r   keysr   )r   available_typesr   r   r   get_extractorj  s   r   )NNN)r1   r5   )r   collections.abcr   dataclassesr   typingr   r   torch.nnnn#vllm_omni.diffusion.forward_contextr   r   Moduler   floatintr"   r#   r`   
LongTensor	IntTensorr   r   r   r   r    r   r   r   r   r   r   <module>   s    	

 	

j
 B
!