o
    3NPi-                     @   s(  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
Z
ddlmZ edZ	ddejded	ed
e	jdedejfddZ	d dedee deeejf fddZdededefddZdee dee	j ded
e	jdeeejf f
ddZdee dee	j d
e	jdeeejf fddZdS )!a  
Speaker embedding extraction with OOM protection and compute monitoring.

=== OPTIMIZATION (v6.2) ===
Key improvements:
- Accept audio buffer: No file re-read when buffer provided
- GPU memory-aware batching: Query torch.cuda.mem_get_info() for dynamic batch sizing
- Adaptive batching based on segment length
- OOM protection with fallback to individual processing
- Memory-efficient processing order (short segments first)
    N)ListDictOptional)MODELSzFastPipelineV6.Embeddings      ?audio_chunksrmax_window_samplesdevicewindow_overlapreturnc                 C   s(  t | }t|d|  }g }td||D ]t}	t|	| |}
| |	|
 }t ||k r*qzDt|d|}tdg|}t   |	||
  }|jdkrW| }|| W d   n1 sfw   Y  ~~W q ty } ztd|  W Y d}~qd}~ww |sdS tj|ddS )a  
    Extract embedding from a long segment by chunking and averaging.
    
    === OPTIMIZATION (v6.1) ===
    Instead of discarding long segments (>10s), we:
    1. Split into overlapping windows
    2. Extract embedding from each window
    3. Average the embeddings
    
    This preserves ALL your data - longer samples are valuable for TTS!
    
    Args:
        audio_chunk: Audio samples (numpy array)
        sr: Sample rate
        max_window_samples: Maximum samples per window
        device: Torch device
        embedding_model: Embedding extraction model
        window_overlap: Overlap ratio between windows (0.5 = 50%)
    
    Returns:
        Averaged embedding vector
       r         ?   NzWindow embedding failed: )axis)lenintrangemintorchtensor	unsqueezetono_gradencode_batchcpunumpyndimsqueezeappend	Exceptionloggerdebugnpmean)r   r   r	   r
   embedding_modelr   total_samplesstep_sampleswindow_embeddings	start_idxend_idxwindowpaddedwav_lensembe r0   T/home/ubuntu/.cursor/worktrees/maya3data__SSH__216.81.248.184_/zxg/src/embeddings.py_extract_embedding_chunked   s4   

r2   
audio_pathsegmentsc           #      C   s  t dt| d t }tjdd t }tj}|dur,|j}|j	}t d nt
| \}	}|	d }i }
td| }|j}g }g }d}t|D ]E\}}|d	d
v r[qOt|d | }t|d | }||| }t||k r||d7 }qOt||kr|||f qO||t|f qO|dkrt d| d |rt dt| d |D ]\}}t|||||}|dur||
|< qtjdd |}|s|
rt d |
S t d |
S |jdd d t||j}t|| d | }t dt| d| d| d ttdt||D ]\}}t|| t|}||| }dd |D }dd |D }tdd |D }t|| d } | d krkt d!|d  d"| d# d$d% t|||||
 tjdd qzt||||||
 W n) ty }! zt  d!|d  d&|! d' t|||||
 W Y d}!~!nd}!~!ww |d d( dkrtjdd qtjdd t | }"t d)|"d*d+t|
 d,t| d- |
S ).u  
    Extract speaker embeddings with adaptive batching to prevent OOM.
    
    === OPTIMIZATION (v6.2) ===
    - Accept audio_buffer: No file re-read when buffer provided
    - GPU memory-aware batching: Use torch.cuda.mem_get_info() for dynamic batch sizing
    - Long segments (>10s) are chunked and averaged - preserving valuable data
    
    Strategy to prevent OOM while preserving ALL data:
    1. Query available VRAM before each batch
    2. Short segments: batch processing for efficiency
    3. Long segments: chunk into windows → extract → average
    4. Adaptive batch size based on VRAM + segment lengths
    5. Aggressive cache clearing between batches
    
    Args:
        audio_path: Path to audio file
        segments: List of segments to extract embeddings for
        config: Pipeline configuration
        audio_buffer: Optional AudioBuffer (avoids file re-read)
    
    Returns:
        Dict mapping segment index to embedding array
    u   🔢 Extracting embeddings (z segments)...T)
aggressiveNz.   Using pre-loaded audio buffer (no file I/O)r   g333333?speaker)OVERLAP
NON_SPEECHstartendr   z   Skipped z segments (too short <0.3s)z   Processing z+ long segments via chunking (no data loss!)z2   All segments were long - processed via chunkingz-   No valid segments for embedding extractionc                 S   s   t | d S )Nr   r   )xr0   r0   r1   <lambda>   s    z,extract_embeddings_batched.<locals>.<lambda>)keyz segments in z batches (size=)c                 S      g | ]}|d  qS )r   r0   .0itemr0   r0   r1   
<listcomp>       z.extract_embeddings_batched.<locals>.<listcomp>c                 S   r@   )r   r0   rA   r0   r0   r1   rD      rE   c                 s   s    | ]}t |V  qd S Nr;   )rB   cr0   r0   r1   	<genexpr>   s    z-extract_embeddings_batched.<locals>.<genexpr>   i ez	   Batch z too large (g    eAz.2fzGB), processing individuallyz	 failed: z, trying individual processing   u   ✅ Embeddings: z.1fzs | /z
 extracted)!r!   infor   timer   clear_cache
get_devicer%   waveform_npsample_rate
torchaudioloadr   r   r   max_embedding_length	enumerategetr   r   r   r2   warningsort_compute_adaptive_batch_sizeembedding_batch_sizer   r   max_process_individually_process_batchr    error)#r3   r4   configaudio_bufferr9   r
   r%   rP   r   waveform
embeddingsmin_samplesmax_samplesshort_items
long_itemsskipped_too_shortisegstart_sample
end_samplechunkr.   valid_items
batch_sizetotal_batches	batch_numbatch_start	batch_endbatch_itemsindicesaudio_chunksmax_lenestimated_sizer/   elapsedr0   r0   r1   extract_embeddings_batched]   s   

"
"(ry   rm   
base_batchc              	   C   s   | s|S t dd | D t|  }zddlm} |t| t||dddd}W n ttfy4   |}Y nw |d	k rAt|d d}n|d
k rH|}nt|d d}t||S )z
    Compute optimal batch size based on segment lengths AND available VRAM.
    
    === OPTIMIZATION (v6.2) ===
    Query torch.cuda.mem_get_info() for dynamic batch sizing instead of
    relying solely on average length heuristics.
    c                 s   s    | ]	}t |d  V  qdS )r   Nr;   rA   r0   r0   r1   rH      s    z/_compute_adaptive_batch_size.<locals>.<genexpr>r   )compute_optimal_batch_size   @   g       @)	num_itemsavg_item_samplesrz   	min_batch	max_batchvram_headroom_gbi>  i  rI   )	sumr   src.audio_bufferr{   r   ImportErrorr    r   r[   )rm   rz   avg_lenr{   
vram_batchlength_batchr0   r0   r1   rY      s,   

rY   rt   ru   rv   rb   c                 C   s  t t||}t t|}t|D ]\}}	|	||dt|	f< t|	| ||< q||}||}t  2 |||  }
|
j	dkrU|
j
d dkrU|
d}
n|
j	dkrc|
|
j
d d}
W d   n1 smw   Y  t| D ]\}}|
j	dkr|
| n|
||< qv~~~
dS )zProcess a batch of segments.Nr   r   r   )r   zerosr   rU   r   r   r   r   r   r   shaper   reshape)rt   ru   rv   r
   r%   rb   r,   r-   jrl   embsidxr0   r0   r1   r]     s$   	




r]   c                 C   s   t | |D ]a\}}z@|d|}tdg|}t  |||  }	|	j	dkr3|	
 }	|	||< W d   n1 sAw   Y  ~~W q tyf }
 ztd| d|
  W Y d}
~
qd}
~
ww dS )z*Process segments one at a time (fallback).r   r   r   Nz#   Individual embedding failed for z: )zipr   r   r   r   r   r   r   r   r   r   r    r!   r^   )rt   ru   r
   r%   rb   r   rl   padded_singlewav_lens_singler.   r/   r0   r0   r1   r\   <  s    


"r\   )r   rF   )__doc__rM   loggingtypingr   r   r   r   r#   r   rR   
src.modelsr   	getLoggerr!   ndarrayr   r
   floatr2   strry   rY   Tensorr]   r\   r0   r0   r0   r1   <module>   sj   
	
H
 )
!