o
    q^i3                  	   @   s  d Z ddlZddlZeedsdd e_ddlZddlZddlZddlZz'ddl	Z	ddl
Z
ee
drJe
jZe	eZdejvrJdddd	d
Zee
_W n	 eyT   Y nw ej rjdejj_dejjj_dejj_ejZdd Zee_zddlm  mZ ej Z!dddZ"e"e_ W n	 e#y   Y nw zddl$m%  m&Z& e&j'Z(dddZ)e)e&_'W n e#e*fy   Y nw e+dZ,G dd dZ-e- Z.dS )a  
Model management - Hot inference with adaptive loading.

Key improvements:
- community-1 FIRST (better per pyannote docs), 3.1 as fallback
- Lazy loading with compute-aware batching
- Better GPU memory management

=== v7.0 OPTIMIZATION: cuDNN Benchmark + TF32 ===
Enables CUDA optimizations for faster inference:
- cudnn.benchmark=True: Auto-tune convolution algorithms
- TF32 precision: 3x faster matmul on Ampere+ GPUs
Expected gain: 5-10% on diarization (conv-heavy model)
    Nlist_audio_backendsc                   C   s   ddgS )N	soundfilesox r   r   r   L/home/ubuntu/.cursor/worktrees/maya3__SSH__216.81.248.184_/nmo/src/models.py<lambda>   s    r   hf_hub_downloaduse_auth_token)r	   tokenc                 O   s&   |d u r
| d ur
| }t |d|i|S )Nr
   )_orig_hf_hub_download)r	   r
   argskwargsr   r   r   _patched_hf_hub_download)   s   r   Tc                  O   s   d|d< t | i |S )NFweights_only_original_torch_load)r   r   r   r   r   _patched_torch_load?   s   r   c                 K      t | |ddS NF)map_locationr   r   )path_or_urlr   r   r   r   r   _patched_pl_loadH      r   c                 C   r   r   r   )r   r   r   r   r   _patched_fabric_loadQ   r   r   zFastPipelineV6.Modelsc                       sl   e Zd ZdZdZ fddZdd Zdejfdd	Z	d
d Z
dddZdefddZddefddZ  ZS )ModelManagera  
    Singleton model manager - loads all models once, keeps them hot.
    
    Model priority (per user request):
    1. community-1 (better performance per pyannote docs)
    2. speaker-diarization-3.1 (fallback)
    
    Key optimization: Load once, inference many times.
    Nc                    s&   | j d u rt | | _ d| j _| j S )NF)	_instancesuper__new___initialized)cls	__class__r   r   r   f   s   
zModelManager.__new__c                 C   sP   | j rd S d| _ d | _d | _d | _d | _d | _d | _d | _d| _d| _	d | _
d S )NTF)r   device
silero_vadsilero_utilsdiarization_pipelinesegmentation_modelembedding_modelpanns_model_loaded_panns_loaded_model_nameselfr   r   r   __init__l   s   
zModelManager.__init__returnc                 C   s*   | j du rt tj rdnd| _ | j S )z#Get CUDA device or fallback to CPU.Ncudacpu)r"   torchr0   is_availabler,   r   r   r   
get_device{   s   
zModelManager.get_devicec                    s  | j rtd| j d dS t|ddptjdd}|s"td||_	td td	 td t

 }|  }td
|  tj rbtjdjtjd }td|d dd td tjjdddddd\}}|| _|| _td ddlm} ddlm}	 ddl}
td |	j t|
 j   fdd}||	_ddg}|D ]B\}}z!td| d |j!||j	d| _"|| _td | d! W  n t#y } zt$d"| d#|  W Y d}~qd}~ww | j"du rtd$| j"%| td% d| _&td& dd'l'm(} |j)d(d)d*t*|id+| _+td, d-| _ t

 | }tj rStj, d }tj d }td.|dd/|dd0 td1|d2d3 td dS )4z
        Load all models once into GPU memory.
        
        Order: VAD -> Diarization (community-1 first) -> Segmentation -> Embedding
        u   ✅ Models already loaded: z (hot inference)Nhf_tokenHF_TOKEN zHF_TOKEN is required to load pyannote models.
- Set env var HF_TOKEN (do not hard-code it)
- Ensure you've accepted the model terms on Hugging Face:
  - pyannote/speaker-diarization-community-1
  - pyannote/segmentation-3.0 (if needed)
zF======================================================================u    🚀 LOADING ALL MODELS INTO GPUzDevice: r   zGPU Memory before: g    eAz.2fzGB freezLoading Silero VAD...zsnakers4/silero-vadr#   F)repo_or_dirmodelforce_reloadonnxverbosezSilero VAD loaded)Pipeline)SpeakerDiarizationzLoading PyAnnote Diarization...c                    sX   fdd|  D }t| t|  }|r!td|   | g|R i |S )Nc                    s   i | ]\}}| v r||qS r   r   ).0kv)_valid_paramsr   r   
<dictcomp>   s    zCModelManager.load_all.<locals>._patched_sd_init.<locals>.<dictcomp>z!Filtered out unsupported params: )itemssetkeysloggerdebug)r-   r   r   filtered_kwargsremoved_original_sd_initrB   r   r   _patched_sd_init   s
   z/ModelManager.load_all.<locals>._patched_sd_init)z(pyannote/speaker-diarization-community-1zcommunity-1)z pyannote/speaker-diarization-3.1zspeaker-diarization-3.1zTrying z...)r
   u   ✅ Loaded z (preferred model)zFailed to load z: z'Failed to load any diarization pipelineu&   ✅ PyAnnote Diarization loaded on GPUzLoading ECAPA-TDNN...)EncoderClassifierz!speechbrain/spkrec-ecapa-voxcelebz/ephemeral/models/ecapar"   )sourcesavedirrun_optsu   ✅ ECAPA-TDNN loadedTzGPU Memory: zGB allocated, zGB reservedu   ✅ All models loaded in .1fzs (STAYING HOT))-r)   rG   infor+   getattrosenvirongetRuntimeErrorr5   timer4   r2   r0   r3   get_device_propertiestotal_memorymemory_reservedrH   hubloadr#   r$   pyannote.audior=   pyannote.audio.pipelinesr>   inspectr.   rE   	signature
parametersrF   from_pretrainedr%   	Exceptionwarningtor&   speechbrain.inference.speakerrN   from_hparamsstrr'   memory_allocated)r-   configr5   startr"   freer9   utilsr=   r>   ra   rM   models_to_try
model_name
short_nameerN   elapsed	allocatedreservedr   rK   r   load_all   s   












zModelManager.load_allFc                 C   s@   t j r|rt  t j  t j  dS t j  dS dS )z
        Clear GPU cache between operations.
        
        Args:
            aggressive: If True, also empty reserved memory and run gc
        N)r2   r0   r3   gccollectempty_cachereset_peak_memory_stats)r-   
aggressiver   r   r   clear_cache   s   

zModelManager.clear_cachec                 C   s4   t j rt jdj}t jd}|| d S dS )zGet current free VRAM in GB.r   i   @g        )r2   r0   r3   rZ   r[   r\   )r-   totalrv   r   r   r   get_vram_free_gb  s
   
zModelManager.get_vram_free_gbr0   r"   c              
   C   s0  | j r| jdurtd | jS td t }zTddlm} |dkr0tj	 s0d}t
d tjd	}|rNtj|rNtd
|  |||d| _n||d| _d| _ t | }td| d|dd | jW S  ty } z
td td|d}~w ty } z	td|   d}~ww )a  
        Load PANNs CNN14 model for music detection (lazy loading).
        
        === v7.4: Pre-baked model support ===
        If PANNS_MODEL_PATH env var is set, loads from that path (Docker pre-baked).
        Otherwise downloads from default location.
        
        Args:
            device: 'cuda' or 'cpu'
        
        Returns:
            Loaded AudioTagging model
        Nu.   ✅ PANNs CNN14 already loaded (hot inference)z*Loading PANNs CNN14 for music detection...r   )AudioTaggingr0   r1   z'CUDA not available, using CPU for PANNsPANNS_MODEL_PATHz   Using pre-baked model: )checkpoint_pathr"   )r"   Tu   ✅ PANNs CNN14 loaded on z in rR   su9   ❌ PANNs not installed. Run: pip install panns-inferencez_panns-inference package required for music detection. Install with: pip install panns-inferenceu   ❌ Failed to load PANNs: )r*   r(   rG   rS   rY   panns_inferencer   r2   r0   r3   rf   rU   rV   rW   pathexistsImportErrorerrorre   )r-   r"   rm   r   r   rt   rs   r   r   r   
load_panns  s<   



zModelManager.load_panns)F)r0   )__name__
__module____qualname____doc__r   r   r.   r2   r"   r4   rw   r}   floatr   rj   r   __classcell__r   r   r    r   r   Z   s    	
~r   )N)/r   rU   
torchaudiohasattrr   rY   loggingrx   r2   ra   huggingface_hubr   r   rb   _hf_sigrc   r   re   r0   r3   backendscudnn	benchmarkmatmul
allow_tf32r^   r   r   pytorch_lightning.core.savingcoresaving	pl_savingpl_load_orig_pl_loadr   r   #lightning_fabric.utilities.cloud_io	utilitiescloud_io_load_orig_fabric_loadr   AttributeError	getLoggerrG   r   MODELSr   r   r   r   <module>   sb   





	






 
s