o
    3NPi                     @   s   d Z ddlZddlZddlmZmZ ddlmZ ddlZ	ddl
mZ edZddee d	ed
ededef
ddZdedee deee	jf dee fddZdee dee fddZdS )z9Speaker clustering and merging with temporal constraints.    N)ListDict)defaultdict)cosine_similarityzFastPipelineV5.Clustering皙?segmentsspk1spk2min_overlapreturnc                    sl    fdd| D }fdd| D }|D ]\}}|D ]\}}	t ||}
t||	}||
 |kr2  dS qqdS )zv
    Cannot-link constraint: temporal overlap = different speakers.
    Two speakers can't talk at the same time.
    c                    (   g | ]}|d   kr|d |d fqS speakerstartend .0s)r   r   T/home/ubuntu/.cursor/worktrees/maya3data__SSH__216.81.248.184_/zxg/src/clustering.py
<listcomp>      ( z,speakers_overlap_in_time.<locals>.<listcomp>c                    r   r   r   r   )r	   r   r   r      r   TF)maxmin)r   r   r	   r
   
spk1_times
spk2_timess1_starts1_ends2_starts2_endoverlap_startoverlap_endr   )r   r	   r   speakers_overlap_in_time   s   

r"   
audio_path
embeddingsc              	   C   s  t d|j d t }|s|S tt}t|D ]\}}||v r0||d  |||| f qi }i }	| D ]\}
}dd |D }|rUt	j
|dd||
< t||	|
< q9t| }t|dkrot d	t| d
 |S t|}t	||f}t|D ]"}t|D ]}||krt|||  g|||  gd |||f< qq~dd |D }i }ddddd}g }t|D ]}t|d |D ]}||| || |||f f qq|jdd dd |D ]K\}}}|d  d7  < ||jk r|d  d7  < qt|||r
|d  d7  < q|}||v r|| }||v s|||< |||< |d  d7  < qtt| }dd t|D }|D ]}||d |d }||||d< q>t | }ttdd |D }t dt| d| d|dd t d|  |S ) a  
    Conservative speaker merging with cannot-link constraints.
    
    Strategy:
    1. Compute speaker centroids from embeddings
    2. Build similarity matrix
    3. Merge high-similarity speakers (respecting temporal constraints)
    4. Assign final speaker labels
    u!   🔧 Merging speakers (threshold=z)...r   c                 S   s   g | ]}|d  qS )   r   )r   itemr   r   r   r   ?   s    z"merge_speakers.<locals>.<listcomp>r   )axis   u	   ✅ Only z speaker(s), no merging needed)r   r   c                 S   s   i | ]}||qS r   r   r   r   r   r   
<dictcomp>U   s    z"merge_speakers.<locals>.<dictcomp>)	attemptedblocked_thresholdblocked_overlap
successfulc                 S      | d S )Nr%   r   xr   r   r   <lambda>c       z merge_speakers.<locals>.<lambda>T)keyreverser*   r+   r,   r-   c                 S   s   i | ]\}}|d |dqS )SPEAKER_02dr   )r   ir   r   r   r   r)      s    c                 s   s    | ]}|d  V  qdS )r   Nr   )r   segr   r   r   	<genexpr>   s    z!merge_speakers.<locals>.<genexpr>u   ✅ Merged:     → z speakers (z.1fzs)z
   Stats: )loggerinfocluster_merge_thresholdtimer   list	enumerateappenditemsnpmeanlenkeyszerosranger   sortr"   sortedsetvaluesget)r#   r   r$   configr   speaker_segmentsr7   r8   speaker_embeddingsspeaker_countsr   rB   embsspeakersn
sim_matrixj	merge_map	merged_tostatspairsr   r	   simroot1unique_speakersname_mapmergedelapsedfinal_speakersr   r   r   merge_speakers   s   "


$rb   c           	      C   s  | sg S t | dd d} g }d}| D ]\}|du r#| }|dd q|d |d k}|d |d	  }|dddk}|dddk}|r_|r_|r_||jkr_|d	 |d	< |d	 |d  |d
< q|| | }|dd q|rv|| tdt|  dt| d |S )a\  
    Merge adjacent same-speaker segments with small gaps.
    
    Rules:
    1. Same speaker
    2. Gap <= max_silence_gap

    Important semantics:
    - The "gap" is typically a VAD-derived non-speech region (silence/breath).
      If we MERGE, we do NOT "ignore" that non-speech; we *include* it inside the
      merged [start, end] span by extending `end` to the later segment.
    - This is intentionally conservative by default: larger allowed gaps reduce
      fragmentation, but also increase the amount of silence included in a clip
      and can (rarely) hide missed micro-interjections if upstream diarization
      failed to label them as speech.

    Status handling (v6.8 fix):
    - Preserve any existing `status` (especially 'unusable') on incoming segments.
    - If a segment has no `status` key, default it to 'usable' so downstream code
      (stats, sample generation, UI) can reliably filter by status.
    - Only merge two segments when BOTH are usable. Unusable segments act as
      hard boundaries so we never "smear" low-quality/too-short regions into
      an otherwise clean clip.
    c                 S   r.   )Nr   r   r/   r   r   r   r1      r2   z)merge_adjacent_segments.<locals>.<lambda>)r3   Nstatususabler   r   r   durationz   Adjacent merge: r:   z	 segments)	rJ   copy
setdefaultrM   max_silence_gaprA   r;   r<   rE   )	r   rN   r_   currentr8   same_speakergapcurrent_usable
seg_usabler   r   r   merge_adjacent_segments   s<   


 rn   )r   )__doc__r>   loggingtypingr   r   collectionsr   numpyrC   sklearn.metrics.pairwiser   	getLoggerr;   strfloatboolr"   intndarrayrb   rn   r   r   r   r   <module>   s&   
$
o