o
    ۷iFm                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZmZ d dlmZ d dlmZmZm Z  d dl!m"Z" ee#Z$G dd dZ%dS )    N)OrderedDict)init_logger)BaseLayerWithLoRA)	LoRAModel)LoRALayerWeightsPackedLoRALayerWeights)
PEFTHelper)LoRARequest)get_adapter_absolute_pathget_supported_lora_modulesreplace_submodule)MergedColumnParallelLinearQKVParallelLinear)
LoRAConfig)*_expand_expected_modules_for_packed_layers_match_target_modulesfrom_layer_diffusion)stable_lora_int_idc                   @   s  e Zd ZdZ			d@dejdejdejde	d	e
dB d
efddZdee
 fddZdee
ee
 f fddZde
de	dee
 dB fddZdAdedB d
eddfddZdd Zde	d
efddZed
efddZdedeeef fd d!Zd"ejdee
 fd#d$Zd%eddfd&d'Zd(e	ddfd)d*Zd+ed,e
de e!B dB fd-d.Z"de	d/ede#fd0d1Z$de	d/eddfd2d3Z%dBd4d5Z&dBd6d7Z'dede#fd8d9Z(de	de#fd:d;Z)dee	 fd<d=Z*de	de#fd>d?Z+dS )CDiffusionLoRAManagerzManager for LoRA adapters in diffusion models.

    Reuses vLLM's LoRA infrastructure, adapted for diffusion pipelines.
    Uses LRU cache management similar to LRUCacheLoRAModelManager.
       N      ?pipelinedevicedtypemax_cached_adapters	lora_path
lora_scalec                 C   s   || _ || _|| _|  | _|  | _t| j| j| _|| _	i | _
d| _i | _t | _t | _i | _d| _td|||| |durZtd|| tdt||d}| || dS dS )a0  
        Initialize the DiffusionLoRAManager.

        Args:
            max_cached_adapters: Maximum number of LoRA adapters to keep in the
                CPU-side cache (LRU). This mirrors vLLM's `max_cpu_loras` and is
                exposed to users via `OmniDiffusionConfig.max_cpu_loras`.
        Nr   zcInitializing DiffusionLoRAManager: device=%s, dtype=%s, max_cached_adapters=%d, static_lora_path=%sz:Loading LoRA during initialization from %s with scale %.2fstatic)	lora_namelora_int_idr   )r   r   r   _compute_supported_lora_modules_supported_lora_modules_compute_packed_modules_mapping_packed_modules_mappingr   _expected_lora_modulesr   _registered_adapters_active_adapter_id_adapter_scalesr   _adapter_access_orderset_pinned_adapters_lora_modules_max_lora_rankloggerinfor	   r   set_active_adapter)selfr   r   r   r   r   r   init_request r2   V/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/diffusion/lora/manager.py__init__)   sB   

zDiffusionLoRAManager.__init__returnc                 C   sZ   t t| j}d}| j D ]\}}t|tr#d}||dd  q|r+|d |S )a1  Compute supported LoRA module suffixes for this pipeline.

        vLLM's get_supported_lora_modules() returns suffixes for LinearBase
        modules. After this manager replaces layers with BaseLayerWithLoRA
        wrappers, those LinearBase modules become nested under ".base_layer",
        which would cause get_supported_lora_modules() to return "base_layer".
        To make adapter loading stable across multiple adapters, we also accept
        suffixes from existing BaseLayerWithLoRA wrappers and drop "base_layer"
        when appropriate.
        FT.
base_layer)	r)   r   r   named_modules
isinstancer   addsplitdiscard)r0   	supportedhas_lora_wrappersnamemoduler2   r2   r3   r    l   s   

z4DiffusionLoRAManager._compute_supported_lora_modulesc              	   C   s   i }| j  D ]R}t|dd}t|tsq| D ]?\}}t|tr$|s%qt|ttfr5t	dd |D s6qt|}|s=q|
|}|du rK|||< q||krXtd|||| qq|S )av  Collect packed->sublayer mappings from the diffusion model.

        vLLM models declare `packed_modules_mapping` on the model class. For
        diffusion pipelines, we attach the same mapping on the transformer
        module(s) that implement packed (fused) projections, so LoRA loading can
        accept checkpoints trained against the logical sub-projections.
        packed_modules_mappingNc                 s   s    | ]}t |tV  qd S N)r:   str).0sr2   r2   r3   	<genexpr>   s    zGDiffusionLoRAManager._compute_packed_modules_mapping.<locals>.<genexpr>z=Conflicting packed_modules_mapping for %s: %s vs %s; using %s)r   modulesgetattrr:   dictitemsrD   listtupleallgetr-   warning)r0   mappingrA   packedpacked_name	sub_namessub_names_listexistingr2   r2   r3   r"      s6   
 

z4DiffusionLoRAManager._compute_packed_modules_mappingpacked_module_suffixn_slicesc                 C   s<   | j |}|s
d S t||krtd|t|| d S |S )NzWpacked_modules_mapping[%s] has %d slices but layer expects %d; skipping sublayer lookup)r#   rO   lenr-   rP   )r0   rW   rX   sub_suffixesr2   r2   r3   _get_packed_sublayer_suffixes   s   z2DiffusionLoRAManager._get_packed_sublayer_suffixeslora_requestc              	   C   s   |du rt d |   dS td|r t d |   dS |j}t d||j|j|t	| j
| j || j
vrGt d||j | | n| | | || dS )zSet the active LoRA adapter for the pipeline.

        Args:
            lora_request: The LoRA request, or None to deactivate all adapters.
            lora_scale: The external scale for the LoRA adapter.
        Nz8No lora_request provided, deactivating all LoRA adaptersg        zDReceived a request with LoRA scale 0; deactivating all LoRA adapterszMSetting active adapter: id=%d, name=%s, path=%s, scale=%.2f, cache_size=%d/%dz#Loading new adapter: id=%d, name=%s)r-   debug_deactivate_all_adaptersmathiscloserP   r   r   r   rY   r%   r   r.   add_adapter_touch_adapter_info_activate_adapter)r0   r\   r   
adapter_idr2   r2   r3   r/      s.   


	
z'DiffusionLoRAManager.set_active_adapterc                 C   s   t   | j|< | j| dS )z)Update the current caching ordering info.N)timer(   move_to_endr0   rd   r2   r2   r3   rb      s   z(DiffusionLoRAManager._touch_adapter_inford   c                 C   s   t |}|| j|< dS )zUpdate the adapter scale for a given adapter ID. To avoid potential
        issues with using Floats as keys, for now, we round float values to
        3 decimal points.
        N)r   _get_rounded_scaler'   )r0   rd   r   scaler2   r2   r3   _update_adapter_scale   s   
z*DiffusionLoRAManager._update_adapter_scalec                 C   s
   t | dS )zNormalizes a lora scale for use as a key in the _adapter_scales
        dict; for now we just round scales to 3 decimal places.
           )round)r   r2   r2   r3   rh      s   
z'DiffusionLoRAManager._get_rounded_scalec                 C   s   | j stdtd| j  t|j}td| tj|d |jd}t	d|j
|j|j tj|| j ||jd| jd |jd d	}t	d|jt|jt|j  |j D ]}|  qU||fS )	Nz:No supported LoRA modules found in the diffusion pipeline.zSupported LoRA modules: %szResolved LoRA path: %s)max_position_embeddingstensorizer_config_dictz:Loaded PEFT config: r=%d, lora_alpha=%d, target_modules=%scpu)expected_lora_modulespeft_helperlora_model_idr   r   model_vocab_sizern   weights_mapperz4Loaded LoRA model: id=%d, num_modules=%d, modules=%s)r$   
ValueErrorr-   r]   r
   r   r   from_local_dirrn   r.   r
lora_alphatarget_modulesr   from_local_checkpointr   r   idrY   lorasrL   keysvaluesoptimize)r0   r\   r   rq   
lora_modellorar2   r2   r3   _load_adapter   sH   

z"DiffusionLoRAManager._load_adapterrA   c                 C   s(   t |tr	g dS t |trddgS g S )a_  Return a packed_modules_list suitable for vLLM LoRA can_replace_layer().

        Diffusion transformers frequently use packed projection layers like
        QKVParallelLinear (fused QKV). vLLM's LoRA replacement logic relies on
        `packed_modules_list` length to decide between single-slice vs packed
        LoRA layer implementations.
        )qkv01)r:   r   r   )r0   rA   r2   r2   r3   _get_packed_modules_list  s
   

z-DiffusionLoRAManager._get_packed_modules_listrq   c              	      s  |  |j t|dd }d  d t|tr|r|n	t|tr#|r#| dtdtf fdd}t| jd| j	| j
dd}d	D ]}t| j|sFq=t| j|}t|tjsSq=|jdd
D ]\}}t|tsid|dv rjqY| d| }	|	| jv r}td|	 qY| |}
d us d ur||	}|st|
dkr|	d\}}}| |t|
}|d ur|D ]}|r| d| n|}||rd} nq|sqYt|d||
d d}||urt|trt||| || j|	< td|	t|j qYq=d S )Nry   module_namer5   c                    s6   d urdd l }|| d uS  d u rdS t|  S )Nr   T)regexsearchr   )r   retarget_modules_listtarget_modules_patternr2   r3   _matches_target:  s   
zGDiffusionLoRAManager._replace_layers_with_lora.<locals>._matches_targetr   Fmax_lora_rank	max_lorasmax_cpu_loras
lora_dtypefully_sharded_loras)transformertransformer_2dit)remove_duplicater8   r6   z#Layer %s already replaced, skippingT)layerr   lora_configpacked_modules_listmodel_configzReplaced layer: %s -> %s)_ensure_max_lora_rankrw   rI   r:   rD   rL   boolr   r,   r   r   hasattrr   nnModuler9   r   r<   r+   r-   r]   r   rY   
rpartitionr[   r   r   type__name__)r0   rq   ry   r   r   component_name	componentr   rA   full_module_namer   should_replaceprefix_packed_suffixrZ   
sub_suffixsub_full_name
lora_layerr2   r   r3   _replace_layers_with_lora/  st   



z.DiffusionLoRAManager._replace_layers_with_loramin_rankc                 C   s   || j krdS |dkrtd| td| j | || _ | js"dS t| j d| j| jdd}| j D ]
}|j	d|dd q3| j
durV| j
}| j| }d| _
| || dS dS )	a  Ensure LoRA buffers can accommodate adapters up to `min_rank`.

        We allocate per-layer LoRA buffers once when we first replace layers.
        If a later adapter has a larger rank, we need to reinitialize those
        buffers and re-apply the currently active adapter.
        Nr   zInvalid LoRA rank: z"Increasing max LoRA rank: %d -> %dr   Fr   )r   r   r   )r,   ru   r-   r.   r+   r   r   r   r~   create_lora_weightsr&   r'   rc   )r0   r   r   r   	active_idactive_scaler2   r2   r3   r   |  s.   
	

z*DiffusionLoRAManager._ensure_max_lora_rankr   r   c                 C   s`   | |}|dur|S d|v r|ddd n|}| |}|dur$|S |dd }| |S )a  Best-effort lookup for LoRA weights by name.

        Tries:
        - Full module name (e.g. transformer.blocks.0.attn.to_qkv)
        - Relative name without the top-level component (e.g. blocks.0.attn.to_qkv)
        - Suffix-only name (e.g. to_qkv)
        Nr6   r   r7   )get_lorar<   )r0   r   r   lora_weightscomponent_relative_namemodule_suffixr2   r2   r3   _get_lora_weights  s   


z&DiffusionLoRAManager._get_lora_weightsri   c                 C   s,   t |}| j|k}| j||k}|o|S )z?True if the adapter_id is active and the current scale matches.)r   rh   r&   r'   rO   )r0   rd   ri   rounded_scale	is_activematches_scaler2   r2   r3   _is_active_at_scale  s   

z(DiffusionLoRAManager._is_active_at_scalec                    s  |  | rtd|  d S td| | j| }| j D ]L\}}| ||}|d u rt|dd}|dkr|	d\}}	}
| 
|
|}|d u rP|d qg }d}|D ],}|ra| d| n|}| ||}|d urvd}t|trvd }|t|tr|nd  qV|s|d qg }g }|D ]}|d u r|d  |d  q||j ||j   q|jd||d	 td
||  n|d qt|tr|j} fdd|jD }|jd||d	 td|  qt|dd}|dkrQt|dd }|d u r|d qt|}|jjd |kr%td||jjd | |d qttj|jt|dd}|jg| } fdd|D }|jd||d	 td|  q|j  }|jd|j|d	 td||jj|jj  q|| _| |  d S )Nz0Adapter %d already active at scale %.3f skippingzActivating adapter: id=%drX   r   r6   r   FT)indexlora_alora_bz;Activated packed LoRA for %s via submodules=%s (scale=%.2f)c                    s    g | ]}|d u r
d n|  qS rC   r2   rE   bri   r2   r3   
<listcomp>  s    z:DiffusionLoRAManager._activate_adapter.<locals>.<listcomp>z)Activated packed LoRA for %s (scale=%.2f)output_sliceszQSkipping LoRA for %s due to shape mismatch: lora_b[0]=%d != sum(output_slices)=%d)dimc                    s   g | ]}|  qS r2   r2   r   r   r2   r3   r     s    z5Activated fused LoRA for packed layer %s (scale=%.2f)zCActivated LoRA for %s: lora_a shape=%s, lora_b shape=%s, scale=%.2f)r   r-   r]   r.   r%   r+   rK   r   rI   r   r[   
reset_lorar:   r   appendr   r   r   set_lorasumshaperP   rL   torchr<   r&   rj   )r0   rd   ri   r   r   r   r   rX   r   r   r   rZ   	sub_loras	any_foundr   r   sub_loralora_a_listlora_b_listr   totalb_splitsscaled_lora_br2   r   r3   rc     s   














z&DiffusionLoRAManager._activate_adapterc                 C   s@   t dt| j | j D ]}|d qd | _t d d S )Nz$Deactivating all adapters: %d layersr   zAll adapters deactivated)r-   r.   rY   r+   r~   r   r&   r]   )r0   r   r2   r2   r3   r^   5  s
   z-DiffusionLoRAManager._deactivate_all_adaptersc                    s   t  j jd krB fdd j D }|s!td j dS |d }td|t  j j  | t  j jd ks
dS dS )z]Evict unpinned registered adapters until we have room for a new
        adapter to be loaded.r   c                    s   g | ]	}| j vr|qS r2   )r*   )rE   aidr0   r2   r3   r   A  s    z?DiffusionLoRAManager._evict_for_new_adapter.<locals>.<listcomp>zjCache full (%d) but all adapters are pinned; cannot evict. Increase max_cached_adapters or unpin adapters.r   z*Evicting LRU adapter: id=%d (cache: %d/%d)N)	rY   r%   r   r(   r}   r-   rP   r.   remove_adapter)r0   evict_candidateslru_adapter_idr2   r   r3   _evict_for_new_adapter<  s"   
z+DiffusionLoRAManager._evict_for_new_adapterc                 C   s   |j }|| jv rtd| dS td||j |   | |\}}| | || j|< | 	| td|t
| j| j dS )zG
        Add a new adapter to the cache without activating it.
        z'Adapter %d already registered, skippingFz"Adding new adapter: id=%d, name=%sz#Adapter %d added, cache size: %d/%dT)r   r%   r-   r]   r.   r   r   r   rb   r   rY   r   )r0   r\   rd   r   rq   r2   r2   r3   ra   S  s   



z DiffusionLoRAManager.add_adapterc                 C   s   || j vrtd| dS td| | j|kr|   | j |= | j|d | j|d | j	
| td|t| j | j dS )z3
        Remove an adapter from the cache.
        z#Adapter %d not found, cannot removeFzRemoving adapter: id=%dNz%Adapter %d removed, cache size: %d/%dT)r%   r-   r]   r.   r&   r^   r'   popr(   r*   r=   rY   r   rg   r2   r2   r3   r   o  s"   

z#DiffusionLoRAManager.remove_adapterc                 C   s   t | j S )z&Return list of registered adapter ids.)rL   r%   r}   r   r2   r2   r3   list_adapters  s   z"DiffusionLoRAManager.list_adaptersc                 C   sP   || j vrtd| dS | j| t | j|< | j| td| dS )z4Mark an adapter as pinned so it will not be evicted.z Adapter %d not found, cannot pinFz'Pinned adapter id=%d (won't be evicted)T)	r%   r-   r]   r*   r;   re   r(   rf   r.   rg   r2   r2   r3   pin_adapter  s   
z DiffusionLoRAManager.pin_adapter)r   Nr   )r   )r5   N),r   
__module____qualname____doc__r   r   r   r   r   intrD   floatr4   r)   r    rJ   rL   r"   r[   r	   r/   rb   rj   staticmethodrh   rM   r   r   r   r   r   r   r   r   r   r   r   rc   r^   r   ra   r   r   r   r2   r2   r2   r3   r   "   sb    
C$$

1M&


t
r   )&r_   re   collectionsr   r   torch.nnr   vllm.loggerr   vllm.lora.layersr   vllm.lora.lora_modelr   vllm.lora.lora_weightsr   r   vllm.lora.peft_helperr   vllm.lora.requestr	   vllm.lora.utilsr
   r   r   !vllm.model_executor.layers.linearr   r   vllm_omni.config.lorar   vllm_omni.diffusion.lora.utilsr   r   r   vllm_omni.lora.utilsr   r   r-   r   r2   r2   r2   r3   <module>   s$   