o
    پikL                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ eeZG dd dZG dd	 d	Zd
e	jjfddZdS )    N)chain)AnyDictListSetTuple)
ServerArgs)init_loggerc                   @   sZ  e Zd ZdZddddejjdedede	d	e	d
eddfddZ
dededB fddZejjd/ddZd0ddZdedejfddZejjd0dede	ddfddZejjdeddfddZejjd/ddZejjd/d d!Zejjdeddfd"d#Zejjd/d$d%Zejjd&eeejf dee dB fd'd(Zd)d* Zd/d+d,Zd/d-d.ZdS )1LayerwiseOffloadManagera+  A lightweight layerwise CPU offload manager.

    This utility offloads per-layer parameters/buffers from GPU to CPU, and
    supports async H2D prefetch using a dedicated CUDA stream.

    Typical usage:
    - Construct the manager with the target model and the list-like module
      attribute that represents transformer blocks (e.g. ``blocks``).
    - Call :meth:`initialize` once to offload weights and prefetch layer 0.
    - During forward, call :meth:`prefetch_layer` for the next layer and
      :meth:`release_layer` for the finished layer.
    T   )pin_cpu_memoryprefetch_sizemodellayers_attr_str
num_layersenabledr   r   returnNc                C   s   || _ || _|| _|| _ttd|| j| _t|otj	
 | _| js%d S tdtj	 | _tj	 | _tdt| d| _i | _i | _t | _i | _i | _i | _g | _|   d S )Nr   cudaz(^|\.)z\.(\d+)(\.|$))r   r   r   r   minmaxr   booltorchr   is_availabler   devicecurrent_deviceStreamcopy_streamrecompileescape_layer_name_re_consolidated_cpu_weights_weight_metadataset_gpu_layers_prefetch_events_named_parameters_named_buffers_forward_hooks_initialize)selfr   r   r   r   r   r    r+   i/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/utils/layerwise_offload.py__init__   s*   
z LayerwiseOffloadManager.__init__namec                 C   s:   | j |}|s
d S zt|dW S  ty   Y d S w )N   )r    searchintgroup	Exception)r*   r.   mr+   r+   r,   _match_layer_idxG   s   z(LayerwiseOffloadManager._match_layer_idxc                 C   s  | j sd S t| j | _t| j | _i }t| j | j }|D ]#\}}| 	|}|d u s6|| j
kr7q$||i |jg ||f q$| D ]e\}}i | j|< i | j|< | D ]R\}}tdd |D }	tj|	|| jd}
d}|D ]0\}}| }|
|||  |  ||||jd| j| |< tjd| j|d|_||7 }qx|
| j| |< q^qL| jdd	 |   td
| j d| j
  d S )Nc                 s   s    | ]	\}}|  V  qd S N)numel).0_tr+   r+   r,   	<genexpr>i   s    z6LayerwiseOffloadManager._initialize.<locals>.<genexpr>)dtype
pin_memoryr   )r<   offsetr7   shaper   r   r<   Fnon_blockingz?LayerwiseOffloadManager initialized with num prefetched layer: z, total num layers: )r   dictr   named_parametersr&   named_buffersr'   r   itemsr5   r   
setdefaultr<   appendr!   r"   sumr   emptyr   r7   copy_flattenr?   r   dataprepare_for_next_reqregister_forward_hooksloggerinfor   )r*   layer_groupsall_tensorsr.   tensor	layer_idxdtype_to_paramsr<   weightstotal_numel
cpu_buffercurrent_offsetweightr7   r+   r+   r,   r)   P   sR   



z#LayerwiseOffloadManager._initializec                 C   sJ   t | jD ]	}| j||d q|s!| jdur#tj | j dS dS dS )zd
        Prepare for the next round of denoising loop with prefetching the necessary layers
        rB   N)ranger   prefetch_layerr   r   r   current_streamwait_stream)r*   rC   ir+   r+   r,   rO      s
   z,LayerwiseOffloadManager.prepare_for_next_reqc                 C   s&   || j v r| j | }|S | j| }|S )z1get the target model weight/buffer to be replaced)r&   r'   )r*   r.   targetr+   r+   r,   get_target_with_name   s
   


z,LayerwiseOffloadManager.get_target_with_namerV   rC   c                 C   s`  | j r| jdu s| jdu rdS |dk s|| jkrdS || jv r!dS || jvr(dS | jtj	  i }tj
| j) | j|  D ]\}}tj|j|| jd}|j||d |||< qBW d   n1 sfw   Y  tj }|| j || j|< | j|  D ]%\}}	|	d }|| }| |}
||	d |	d |	d   |	d |
_q| j| dS )	z
        idempotent
        Nr   )r<   r   rB   r<   r>   r7   r?   )r   r   r   r   r$   r!   r`   r   r   r_   streamrG   rK   r?   rL   Eventrecordr%   r"   rc   viewrN   add)r*   rV   rC   gpu_buffersr<   rZ   
gpu_buffereventr.   metarb   r+   r+   r,   r^      s@   




	


z&LayerwiseOffloadManager.prefetch_layerc                 C   s   | j r| jdu r
dS | j|d |dkrdS || jvrdS | j|i  D ]\}}| |}t	j
d| j|d d|_q'| j| dS )z
        lightweight release layer weights
        Basically set the reference count to the gpu weight tensor to zero. The weights on cpu is untouched
        Nr   r@   r<   rA   )r   r   r%   popr$   r"   getrG   rc   r   rK   rN   discard)r*   rV   r.   rl   rb   r+   r+   r,   release_layer   s   

z%LayerwiseOffloadManager.release_layerc                 C   sN   | j r| jd u r
d S | jd urtj | j t| jD ]}| 	| qd S r6   )
r   r   r   r   r   r_   r`   listr$   rp   r*   rV   r+   r+   r,   release_all   s   
z#LayerwiseOffloadManager.release_allc                 C   s\   | j r| jdu r
dS | jdurtj | j t| jD ]}|| j	vr+| j
|dd qdS )z Load all layers from CPU to GPU.NFrB   )r   r   r   r   r   r_   r`   r]   r   r$   r^   rr   r+   r+   r,   load_all_layers   s   

z'LayerwiseOffloadManager.load_all_layersc           
      C   s   | j r|| jvr
dS || jvrdS | jdurtj | j | j	|i 
 D ].\}}| |}|j  }|d }| j| | }|d }|d }	||||	  | q(dS )z,Sync a layer's weights from GPU back to CPU.Nr<   r>   r7   )r   r$   r!   r   r   r   r_   r`   r"   rn   rG   rc   rN   rM   cpurL   )
r*   rV   r.   rl   rb   
gpu_weightr<   rZ   r>   r7   r+   r+   r,   sync_layer_to_cpu   s   


z)LayerwiseOffloadManager.sync_layer_to_cpuc                 C   sN   | j r| jdu r
dS | jdurtj | j t| jD ]}| 	| qdS )z5Sync all loaded layers' weights from GPU back to CPU.N)
r   r   r   r   r   r_   r`   rq   r$   rw   rr   r+   r+   r,   sync_all_layers_to_cpu  s   
z.LayerwiseOffloadManager.sync_all_layers_to_cpuweight_dictc              	   C   s  | j sdS t }| D ]~\}}| |}|du rq| j|}|du s(||vr)q|| }t|d t|jkrLtd| dt|d  dt|j |d }|d }	|d }
| j	| | }||	|	|
  
|j|d	  || jv r| |}|j
|j|jd	 || q|S )
a`  Update consolidated CPU buffers with new weights.

        When layerwise offload (--dit-layerwise-offload) is enabled, the
        offload manager replaces GPU parameters with small torch.empty((1,))
        placeholders while real weights live in consolidated pinned CPU
        buffers.

        The refit process writes new weights directly into the CPU buffers,
        bypassing the placeholders.  For any layer that happens to be resident
        on the GPU at update time, the live GPU tensor is also updated.

        Args:
            weight_dict: Mapping of parameter name to new weight tensor.

        Returns:
            Set of parameter names that were successfully updated.

        Raises:
            ValueError: If a weight's shape does not match the recorded
                metadata (i.e., the real shape, not the placeholder shape).
        Nr?   zShape mismatch for z: expected=z	, loaded=r<   r>   r7   )r<   )r   r#   rG   r5   r"   rn   tupler?   
ValueErrorr!   rL   torM   r$   rc   rN   r<   rh   )r*   ry   updated_namesr.   loaded_weightrV   
meta_layerrl   r<   r>   r7   rZ   rb   r+   r+   r,   update_cpu_weights  s>   



z*LayerwiseOffloadManager.update_cpu_weightsc           	      c   sx    t | jD ]3}| j|  D ])\}}|d }|d }|d }|d }| j| | }|||||  |fV  qqdS )a  Yield (name, tensor) pairs from consolidated CPU buffers.

        This reconstructs the original weight tensors (with correct shapes)
        from the flat CPU buffers using stored metadata.  Unlike
        model.named_parameters(), which returns (1,) placeholders
        when offload is enabled, this method returns the real weights and
        can be used for checksum computation.
        r<   r>   r7   r?   N)sortedr"   rG   r!   reshape)	r*   rV   r.   rl   r<   r>   r7   r?   rZ   r+   r+   r,   iter_cpu_weightsU  s   	z(LayerwiseOffloadManager.iter_cpu_weightsc                    s|    j sd S t j j} fdd} fdd} j  t|D ]\}}|||}|||} j	||g q!d S )Nc                        fdd}|S )Nc                    s    dkr
j dd  jv rtj j    j dkr=t j  dj  D ]}|j }j	|dd q.d S d S )Nr   FrB   r/   T)
rO   r%   r   r   r_   
wait_eventr   r]   r   r^   )moduleinputjlayer_to_prefetchra   r*   r+   r,   hookn  s   

zSLayerwiseOffloadManager.register_forward_hooks.<locals>.make_pre_hook.<locals>.hookr+   ra   r   r*   ra   r,   make_pre_hookm  s   zELayerwiseOffloadManager.register_forward_hooks.<locals>.make_pre_hookc                    r   )Nc                    s      d S r6   )rp   )r   r   outputr   r+   r,   r   ~  s   zTLayerwiseOffloadManager.register_forward_hooks.<locals>.make_post_hook.<locals>.hookr+   r   r   r   r,   make_post_hook}  s   zFLayerwiseOffloadManager.register_forward_hooks.<locals>.make_post_hook)
r   getattrr   r   r(   clear	enumerateregister_forward_pre_hookregister_forward_hookextend)r*   layersr   r   ra   layerpre_hook_handlepost_hook_handler+   r   r,   rP   g  s   
	z.LayerwiseOffloadManager.register_forward_hooksc                 C   s"   | j D ]}|  q| j   dS )z$Remove all registered forward hooks.N)r(   remover   )r*   hook_handler+   r+   r,   remove_forward_hooks  s   

z,LayerwiseOffloadManager.remove_forward_hooksr   N)T)__name__
__module____qualname____doc__r   nnModulestrr1   r   r-   r5   compilerdisabler)   rO   Tensorrc   r^   rp   rs   rt   rw   rx   r   r   r   r   rP   r   r+   r+   r+   r,   r
      sZ    	
+	
;	*	

=
%r
   c                   @   sX   e Zd ZU dZee ed< g Zee	 ed< de
fddZdd ZdddZdddZd
S )OffloadableDiTMixinzT
    A mixin that registers forward hooks for a DiT to enable layerwise offload
    layer_nameslayerwise_offload_managersserver_argsc              	   C   s   g | _ | jD ]A}t| |d }|d u st|tjjsqt|}|jdk r1dt	t
|j|d   }nt	|j}t| ||d|j|d}| j | qtd| jj d| j  d S )Ng      ?r   T)r   r   r   r   r   r   zEnabled layerwise offload for z on modules: )r   r   r   
isinstancer   r   
ModuleListlendit_offload_prefetch_sizer1   roundr
   r   rI   rQ   rR   	__class__r   )r*   r   
layer_namemodule_listr   r   managerr+   r+   r,   configure_layerwise_offload  s.   


z/OffloadableDiTMixin.configure_layerwise_offloadc                 C   s*   | j d u rd S | j D ]}|jdd q
d S )NTrB   )r   rO   r*   r   r+   r+   r,   rO     s
   

z(OffloadableDiTMixin.prepare_for_next_reqr   Nc                 C   s4   | j du rdS | j D ]}|jr|  |  q
dS )zCDisable layerwise offload: load all layers to GPU and remove hooks.N)r   r   r   rt   r   r+   r+   r,   disable_offload  s   

z#OffloadableDiTMixin.disable_offloadc                 C   s<   | j du rdS | j D ]}|jr|  |  |  q
dS )zTRe-enable layerwise offload: sync weights to CPU, release layers, and restore hooks.N)r   r   rx   rs   rP   r   r+   r+   r,   enable_offload  s   

z"OffloadableDiTMixin.enable_offloadr   )r   r   r   r   r   r   __annotations__r   rq   r
   r   r   rO   r   r   r+   r+   r+   r,   r     s   
 
	r   r   c                 c   s    g }t | tr| jrdd | jD }|s|  E dH  dS t }|D ]}| D ]\}}|| ||fV  q)q#|  D ]\}}||vrJ||fV  q=dS )aH  Yield (name, tensor) pairs with materialized weights, even under offload.

    When layerwise offload is active, module.named_parameters() returns
    (1,) placeholders for offloaded layers.  This function reads the
    actual data from the offload manager's CPU buffers and chains it with
    the non-offloaded parameters.
    c                 S   s   g | ]}|j r|qS r+   )r   )r8   r4   r+   r+   r,   
<listcomp>  s    z-iter_materialized_weights.<locals>.<listcomp>N)r   r   r   rE   r#   r   rh   )r   offload_managersoffloaded_namesr   r.   rU   paramr+   r+   r,   iter_materialized_weights  s$   

r   )r   	itertoolsr   typingr   r   r   r   r   r   )sglang.multimodal_gen.runtime.server_argsr   1sglang.multimodal_gen.runtime.utils.logging_utilsr	   r   rQ   r
   r   r   r   r   r+   r+   r+   r,   <module>   s       A