o
    ۷ia8                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ dd	lmZmZ dd
lmZ eeZG dd de
Z		ddejdejdejdejdB dedefddZdejddfddZG dd deZdS )    )chain)AnyN)nn)init_logger)HookRegistry	ModelHook)current_omni_platform   )OffloadBackendOffloadConfig)ModuleDiscoveryc                       s,  e Zd ZdZdZ		ddejdejde	j
dB defd	d
Zdejdejf fddZe	d deeejf deeejf dejdedeeejejf eejeeeef  f f f
ddZejjd deddfddZejjd!ddZdejdededeeef fddZdejdedefddZ  ZS )"LayerwiseOffloadHookaz  Hook for layerwise (transformer-block-wise) CPU offloading.

    The hook instance retains parameters for both the current registered block
    module and those for the next block, as well as flattened CPU tensors which
    record the parameters of the current block module, so that these parameters
    could be re-materialized on device in an overlapping way.
    This hook should be registered to each of the transformer blocks in DiT
    module(s) of the target pipeline.

    Based on implementations from:
    https://github.com/sgl-project/sglang/blob/v0.5.8/python/sglang/multimodal_gen/runtime/utils/layerwise_offload.py
    layerwise_offloadNT
next_blockdevicestream
pin_memoryc                 C   sV   t |tjs
J d|| _|| _|pt | _|| _d | _	i | _
i | _i | _i | _d S )Nz0transformer block must be type `torch.nn.Module`)
isinstancer   Moduler   r   r   current_streamcopy_streamr   _prefetch_donenext_block_parametersnext_block_buffersdtype_cpu_flattened_weightsdtype_metadata)selfr   r   r   r    r   e/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/diffusion/offloader/layerwise_backend.py__init__$   s   
zLayerwiseOffloadHook.__init__modulereturnc                    sl   t  |}t| | _t| | _t| j | _t| j | _	t
| j| j	| j| j\| _| _|S N)superinitialize_hookdictnamed_parametersblock_parametersnamed_buffersblock_buffersr   r   r   r   _to_cpur   r   r   r   )r   r    	__class__r   r   r$   :   s   z$LayerwiseOffloadHook.initialize_hookparamsbufsc              	   C   s  i }i }i }t |  | D ]\}}|j}	|	|vri ||	< |||	 |< q| D ]Z\}	}
tdd |
 D }tj||	d|d}d}|
 D ]7\}}| }||||  |  |	|vrdg ||	< ||	 	||||j
d tjd||	d|_||7 }qG|||	< q)||fS )	a  Helper method to move block parameters and buffers to CPU, flattening by dtype.

        Consolidates parameters and buffers into contiguous CPU tensors grouped by dtype
        for GPU transfers. Replaces original tensors with empty placeholders.

        Returns:
            Tuple of
                flattened CPU tensors by dtype,
                metadata for reconstruction by dtype
        c                 s   s    | ]	\}}|  V  qd S r"   )numel).0_tr   r   r   	<genexpr>j   s    z/LayerwiseOffloadHook._to_cpu.<locals>.<genexpr>cpu)dtyper   r   r   )nameoffsetr/   shaper   r   r5   )r   itemsr5   sumtorchemptyr/   copy_flattenappendr8   data)r-   r.   r   r   dtype_grouped_weightsr   r   r6   param_or_bufr5   name2weightstotal_numel
cpu_tensorcurrent_offsetr/   r   r   r   r*   L   s8   	

zLayerwiseOffloadHook._to_cpunon_blockingc                 C   s  | j t  | j}| j}t }i }t| j - | j	 D ]\}}t
j|j|| jd}|j||d |||< q || j  W d   n1 sJw   Y  | j	 D ]1\}}	|| }|	D ]&}
|
d }||v rl|| n|| }||
d |
d |
d   |
d |_q^qT|| _dS )zCopy layer weights from CPU -> GPU.

        Pre-fetch target block in an asynchronous way with compute - memory copy overlap,
        with non_blocking set to True.
        )r5   r   rH   Nr6   r7   r/   r8   )r   wait_streamr   r   r   r   Eventr   r   r:   r<   r=   r8   r   r>   recordr   viewrA   r   )r   rH   layer_params
layer_bufsevtgpu_weightsr5   
cpu_weight
gpu_weightordered_metadatametadatatarget_nametarget_param_or_bufr   r   r   prefetch_layer   s.   


z#LayerwiseOffloadHook.prefetch_layerc                 C   sz   | j }|durt | d| _ | j D ]\}}tjd| j|j	d|_
q| j D ]\}}tjd| j|j	d|_
q+dS )zFree GPU memory for layer by replacing tensors with empty placeholders.
        This function does not actually offload weights from GPU back to CPU.
        Nr   r9   )r   r   r   
wait_eventr'   r:   r<   r=   r   r5   rA   r)   )r   rP   r1   parambufr   r   r   offload_layer   s   z"LayerwiseOffloadHook.offload_layerargskwargsc                 O   s   | j dd ||fS )NTrI   )rX   )r   r    r]   r^   r   r   r   pre_forward   s   z LayerwiseOffloadHook.pre_forwardoutputc                 C   s   |    |S r"   )r\   )r   r    r`   r   r   r   post_forward   s   z!LayerwiseOffloadHook.post_forwardNT)Tr!   N)__name__
__module____qualname____doc__
_HOOK_NAMEr   r   r<   r   r   Streamboolr   r$   staticmethodr%   str	ParameterTensortupler5   listr   r*   compilerdisablerX   r\   r_   ra   __classcell__r   r   r+   r   r      sD    
,6&$ r   Tr    r   r   r   r   r!   c                 C   s*   t | }t||||}|tj| |S r"   )r   get_or_creater   register_hookrh   )r    r   r   r   r   registryhookr   r   r   apply_block_hook   s   
rx   c                 C   s8   t | dd }|d ur|tj td| jj d S d S )N_hook_registryzRemoved offload hook from %s)getattrremove_hookr   rh   loggerdebugr,   rd   )r    rv   r   r   r   remove_block_hook   s
   r~   c                       s   e Zd ZdZdedejf fddZdej	ddfd	d
Z
dddZedej	dedB fddZedej	deddfddZedej	deej	 fddZ  ZS )LayerWiseOffloadBackenda  Layer-wise (block-level) offloading backend.

    Implements sliding window offloading where only a small number of transformer
    blocks reside on GPU at a time. Blocks are prefetched asynchronously while
    previous blocks compute, and freed after use.
    configr   c                    s"   t  || t | _g | _d S r"   )r#   r   r   ri   r   _blocks)r   r   r   r+   r   r   r      s   

z LayerWiseOffloadBackend.__init__pipeliner!   Nc              
   C   sl  | j r
td d S t|}|jstd d S |jD ]}|| j q|j	d urNz|j	j| jdd W n t
yM } ztd| W Y d }~nd }~ww td|j t|jD ]\}}|j| }td| d|jj d	 t|}t|}	|r|	std
||jj || j qZt|	}
|
dkrtd||jj || j qZ| D ]#\}}||krtd|  q|| j td| d| j  q|	d |	d }}t||| j| j| jj}|jdd t|	d d D ]\}}|	|d |
  }t||| j| j| jj qtd|
 d | j|	 qZt| jdkr2t| jd dkr4d| _ d S d S d S )Nz'LayerWiseOffloadBackend already enabledz@No DiT/transformer modules found, skipping layer-wise offloadingTrI   zFailed to move VAE to GPU: %sz$Applying layer-wise offloading on %szApplying hooks on z ()z@Target layers (blocks) not found. Skipping offloading on %s (%s)r	   z<#Target layers (blocks) <= 1. Skipping offloading on %s (%s)zSkipped blocks module zMoved z to device r   Fz!Layer-wise offloading enabled on z layers (blocks))enabledr|   warningr   discoverditsencoderstor   vae	Exceptionr}   info	dit_names	enumerater,   rd   r   get_blocks_attr_nameget_blocks_from_ditlennamed_childrenrx   r   r   pin_cpu_memoryrX   r   r@   )r   r   modulesencexci
dit_moduledit_nameblocks_attr_nameblocks
num_blocksr6   m
last_blockfirst_blockrw   blockr   r   r   r   enable   sp   







$
zLayerWiseOffloadBackend.enablec                 C   sF   | j sd S | jD ]}|D ]}t| qq| j  d| _ td d S )NFzLayer-wise offloading disabled)r   r   r~   clearr|   r   )r   r   r   r   r   r   rr   8  s   


zLayerWiseOffloadBackend.disablemodelc                 C   s   t | jddS )z6Retrieve blocks attribute name from provided DiT model_layerwise_offload_blocks_attrN)rz   r,   )r   r   r   r   r   D  s   z,LayerWiseOffloadBackend.get_blocks_attr_namer6   c                 C   s"   t | jdst| jd| d S d S )Nr   )hasattrr,   setattr)r   r6   r   r   r   set_blocks_attr_nameI  s   z,LayerWiseOffloadBackend.set_blocks_attr_namec                 C   sh   t | }|du rtd| jj d g S t| |d}|du r0td| d| jj d g S t|S )a/  
        Retrieve a list of blocks from provided DiT model. Blocks attribute name
        are found by `_layerwise_offload_blocks_attr` set to DiT models. For example,

        ```
        class WanTransformer3DModel(nn.Module):
            _layerwise_offload_blocks_attr = "blocks"
        ```
        Nz.No _layerwise_offload_blocks_attr defined for z, skipping layerwise offloadingzBlocks (layers) 'z' not found on )r   r   r|   r   r,   rd   rz   rp   )r   r   r   r   r   r   r   N  s   
z+LayerWiseOffloadBackend.get_blocks_from_ditrc   )rd   re   rf   rg   r   r<   r   r   r   r   r   rr   rk   rl   r   r   rp   r   rs   r   r   r+   r   r      s    
O$r   rb   )	itertoolsr   typingr   r<   r   vllm.loggerr   vllm_omni.diffusion.hooksr   r   vllm_omni.platformsr   baser
   r   module_collectorr   rd   r|   r   r   r   ri   rj   rx   r~   r   r   r   r   r   <module>   s:    7
