o
    3NPiw*                  	   @   s  d Z ddlZddlZddlZddlZej r'dejj_	dejjj
_dejj_ejZdd Zee_zddlm  mZ ejZdddZee_W n	 eyP   Y nw zddlm  mZ ejZddd	Zee_W n eefyr   Y nw ed
ZG dd dZ e  Z!dS )a  
Model management - Hot inference with adaptive loading.

Key improvements:
- community-1 FIRST (better per pyannote docs), 3.1 as fallback
- Lazy loading with compute-aware batching
- Better GPU memory management

=== v7.0 OPTIMIZATION: cuDNN Benchmark + TF32 ===
Enables CUDA optimizations for faster inference:
- cudnn.benchmark=True: Auto-tune convolution algorithms
- TF32 precision: 3x faster matmul on Ampere+ GPUs
Expected gain: 5-10% on diarization (conv-heavy model)
    NTc                  O   s   d|d< t | i |S )NFweights_only_original_torch_load)argskwargs r   P/home/ubuntu/.cursor/worktrees/maya3data__SSH__216.81.248.184_/zxg/src/models.py_patched_torch_load"   s   r	   c                 K      t | |ddS NF)map_locationr   r   )path_or_urlr   r   r   r   r   _patched_pl_load+      r   c                 C   r
   r   r   )r   r   r   r   r   _patched_fabric_load4   r   r   zFastPipelineV6.Modelsc                       sl   e Zd ZdZdZ fddZdd Zdejfdd	Z	d
d Z
dddZdefddZddefddZ  ZS )ModelManagera  
    Singleton model manager - loads all models once, keeps them hot.
    
    Model priority (per user request):
    1. community-1 (better performance per pyannote docs)
    2. speaker-diarization-3.1 (fallback)
    
    Key optimization: Load once, inference many times.
    Nc                    s&   | j d u rt | | _ d| j _| j S )NF)	_instancesuper__new___initialized)cls	__class__r   r   r   I   s   
zModelManager.__new__c                 C   sP   | j rd S d| _ d | _d | _d | _d | _d | _d | _d | _d| _d| _	d | _
d S )NTF)r   device
silero_vadsilero_utilsdiarization_pipelinesegmentation_modelembedding_modelpanns_model_loaded_panns_loaded_model_nameselfr   r   r   __init__O   s   
zModelManager.__init__returnc                 C   s*   | j du rt tj rdnd| _ | j S )z#Get CUDA device or fallback to CPU.Ncudacpu)r   torchr'   is_availabler#   r   r   r   
get_device^   s   
zModelManager.get_devicec                    s  | j rtd| j d dS td td td t }|  }td|  tj rLtj	dj
tjd }td|d	 d
d td tjjddddd\}}|| _|| _td ddlm} ddlm} ddl}	td |j t|	 j  fdd}
|
|_ddg}|D ]B\}}z!td| d |j||jd| _|| _td| d W  n ty } ztd| d|  W Y d}~qd}~ww | jdu rtd | j | td! d| _!td" dd#l"m#} |j$d$d%d&t%|id'| _&td( d)| _ t | }tj r<tj' d	 }tj d	 }td*|d
d+|d
d, td-|d.d/ td dS )0z
        Load all models once into GPU memory.
        
        Order: VAD -> Diarization (community-1 first) -> Segmentation -> Embedding
        u   ✅ Models already loaded: z (hot inference)NzF======================================================================u    🚀 LOADING ALL MODELS INTO GPUzDevice: r   zGPU Memory before: g    eAz.2fzGB freezLoading Silero VAD...zsnakers4/silero-vadr   F)repo_or_dirmodelforce_reloadonnxu   ✅ Silero VAD loaded)Pipeline)SpeakerDiarizationzLoading PyAnnote Diarization...c                    sX   fdd|  D }t| t|  }|r!td|   | g|R i |S )Nc                    s   i | ]\}}| v r||qS r   r   ).0kv)_valid_paramsr   r   
<dictcomp>   s    zCModelManager.load_all.<locals>._patched_sd_init.<locals>.<dictcomp>z!Filtered out unsupported params: )itemssetkeysloggerdebug)r$   r   r   filtered_kwargsremoved_original_sd_initr5   r   r   _patched_sd_init   s
   z/ModelManager.load_all.<locals>._patched_sd_init)z(pyannote/speaker-diarization-community-1zcommunity-1)z pyannote/speaker-diarization-3.1zspeaker-diarization-3.1zTrying z...)tokenu   ✅ Loaded z (preferred model)zFailed to load z: z'Failed to load any diarization pipelineu&   ✅ PyAnnote Diarization loaded on GPUzLoading ECAPA-TDNN...)EncoderClassifierz!speechbrain/spkrec-ecapa-voxcelebz/ephemeral/models/ecapar   )sourcesavedirrun_optsu   ✅ ECAPA-TDNN loadedTzGPU Memory: zGB allocated, zGB reservedu   ✅ All models loaded in .1fzs (STAYING HOT))(r    r:   infor"   timer+   r)   r'   r*   get_device_propertiestotal_memorymemory_reservedhubloadr   r   pyannote.audior0   pyannote.audio.pipelinesr1   inspectr%   r8   	signature
parametersr9   from_pretrainedhf_tokenr   	ExceptionwarningRuntimeErrortor   speechbrain.inference.speakerrB   from_hparamsstrr   memory_allocated)r$   configstartr   freer-   utilsr0   r1   rP   r@   models_to_try
model_name
short_nameerB   elapsed	allocatedreservedr   r>   r   load_alld   s   












zModelManager.load_allFc                 C   s@   t j r|rt  t j  t j  dS t j  dS dS )z
        Clear GPU cache between operations.
        
        Args:
            aggressive: If True, also empty reserved memory and run gc
        N)r)   r'   r*   gccollectempty_cachereset_peak_memory_stats)r$   
aggressiver   r   r   clear_cache   s   

zModelManager.clear_cachec                 C   s4   t j rt jdj}t jd}|| d S dS )zGet current free VRAM in GB.r   i   @g        )r)   r'   r*   rI   rJ   rK   )r$   totalrg   r   r   r   get_vram_free_gb   s
   
zModelManager.get_vram_free_gbr'   r   c              
   C   s   | j r| jdurtd | jS td t }z6ddlm} |dkr0tj	 s0d}t
d ||d	| _d
| _ t | }td| d|dd | jW S  tye } z
td td|d}~w tyy } z	td|   d}~ww )a~  
        Load PANNs CNN14 model for music detection (lazy loading).
        
        === v6.7 FEATURE: Music Detection ===
        PANNs (Pretrained Audio Neural Networks) trained on AudioSet.
        Used to detect music/instruments in audio chunks.
        
        Args:
            device: 'cuda' or 'cpu'
        
        Returns:
            Loaded AudioTagging model
        Nu.   ✅ PANNs CNN14 already loaded (hot inference)z*Loading PANNs CNN14 for music detection...r   )AudioTaggingr'   r(   z'CUDA not available, using CPU for PANNs)r   Tu   ✅ PANNs CNN14 loaded on z in rF   su9   ❌ PANNs not installed. Run: pip install panns-inferencez_panns-inference package required for music detection. Install with: pip install panns-inferenceu   ❌ Failed to load PANNs: )r!   r   r:   rG   rH   panns_inferencerq   r)   r'   r*   rV   ImportErrorerrorrU   )r$   r   r^   rq   re   rd   r   r   r   
load_panns   s4   



zModelManager.load_panns)F)r'   )__name__
__module____qualname____doc__r   r   r%   r)   r   r+   rh   rn   floatrp   r[   rv   __classcell__r   r   r   r   r   =   s    	
pr   )N)"rz   rH   loggingri   r)   r'   r*   backendscudnn	benchmarkmatmul
allow_tf32rM   r   r	   pytorch_lightning.core.savingcoresaving	pl_savingpl_load_orig_pl_loadr   rt   #lightning_fabric.utilities.cloud_io	utilitiescloud_io_load_orig_fabric_loadr   AttributeError	getLoggerr:   r   MODELSr   r   r   r   <module>   s@   







 
_