o
    pi?,                     @   s   d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ G dd dZdS )    N)DictSequenceUnion)Problem
ResolutionSpecifications)EqualErrorRate)create_rng_for_worker)Segment)SpeakerDiarizationProtocolSpeakerVerificationProtocol)default_collate)Metric)BinaryAUROC)tqdmc                   @   s  e Zd ZdZedefddZejdefddZedefddZejd	efd
dZedefddZ	e	jdefddZ	d'ddZ
deeee eeef f fddZdd Zdd Zd(ddZdefddZdefdd Zd!d" Zd#d$ Zdefd%d&ZdS )))SupervisedRepresentationLearningTaskMixinz6Methods common to most supervised representation tasksreturnc                 C      t | dr| jS | j| j S )Nnum_classes_per_batch_)hasattrr   
batch_sizenum_chunks_per_classself r   Y/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/tasks/embedding/mixins.pynum_classes_per_batch0      
z?SupervisedRepresentationLearningTaskMixin.num_classes_per_batchr   c                 C   
   || _ d S N)r   )r   r   r   r   r   r   6      
c                 C   r   )Nnum_chunks_per_class_)r   r!   r   r   r   r   r   r   r   :   r   z>SupervisedRepresentationLearningTaskMixin.num_chunks_per_classr   c                 C   r   r   )r!   )r   r   r   r   r   r   @   r    c                 C   s   t | dr| jS | j| j S )Nbatch_size_)r   r"   r   r   r   r   r   r   r   D   r   z4SupervisedRepresentationLearningTaskMixin.batch_sizer   c                 C   r   r   )r"   )r   r   r   r   r   r   J   r    Nc              	      s   t   _d jj d}t j |ddD ]A}|d  D ]8} fdd|d |D }|s1qtdd	 |D }| jvrEt	  j|<  j| 
|d
 |d ||d qqttjtj j jt jd _d S )NzLoading z training labelsfile)iterabledescunit
annotationc                    s   g | ]
}|j  jkr|qS r   )durationmin_duration.0segmentr   r   r   
<listcomp>X   s
    zCSupervisedRepresentationLearningTaskMixin.setup.<locals>.<listcomp>c                 s   s    | ]}|j V  qd S r   r(   r*   r   r   r   	<genexpr>c   s    zBSupervisedRepresentationLearningTaskMixin.setup.<locals>.<genexpr>uriaudio)r0   r1   r(   speech_turns)problem
resolutionr(   r)   classes)dict_trainprotocolnamer   trainlabelslabel_timelinesumlistappendr   r   REPRESENTATIONr   CHUNKr(   r)   sortedspecifications)r   stager%   fklassr2   r(   r   r   r   setupN   s6   


z/SupervisedRepresentationLearningTaskMixin.setupc                 C   s   t dddtddgS )NTF)compute_on_cpu	distances)rH   )r   r   r   r   r   r   default_metricz   s   
z8SupervisedRepresentationLearningTaskMixin.default_metricc                 c   sx   t | j}t| jj}|| j| j}d}	 || |D ]}| jj	|}t
| jD ]}|j| j| dd | j| D dd^}}|j|d dd |d D dd^}	}|	j|k r| jj||	\}
}t|| jjj |
jd  }|d|}t|
||| f}
n||	j|	j| }t||| }| jj||\}
}|
|d	V  |d7 }|| jkr|| j| j}d}q,qq)
zIterate over training samples

        Yields
        ------
        X: (time, channel)
            Audio chunks.
        y: int
            Speaker index.
        r   Tc                 S   s   g | ]}|d  qS r.   r   )r+   rE   r   r   r   r-      s    zKSupervisedRepresentationLearningTaskMixin.train__iter__.<locals>.<listcomp>   )weightskr2   c                 S   s   g | ]}|j qS r   r.   )r+   sr   r   r   r-      s    )Xy)r	   modelr>   rC   r5   uniformr)   r(   shuffleindexranger   choicesr7   r1   cropmathfloorsample_rateshaperandintFpadstartendr
   r   )r   rngr5   batch_durationnum_samplesrF   rP   _r#   speech_turnrO   num_missing_framesleft_pad
start_timechunkr   r   r   train__iter__   sX   





z7SupervisedRepresentationLearningTaskMixin.train__iter__c                 C   s>   t dd | j D }d| j| j  }t| jt|| S )Nc                 s   s"    | ]}|D ]}|d  V  qqdS )r(   Nr   )r+   datadatumr   r   r   r/      s    zISupervisedRepresentationLearningTaskMixin.train__len__.<locals>.<genexpr>      ?)	r=   r7   valuesr)   r(   maxr   rX   ceil)r   r(   avg_chunk_durationr   r   r   train__len__   s
   z6SupervisedRepresentationLearningTaskMixin.train__len__r:   c                 C   sD   t |}|dkr | jjdd | j|d | jjjd}|j|d< |S )Nr:   T)moderO   )samplesrZ   )r   augmentationr:   rQ   hparamsrZ   rt   )r   batchrD   collated	augmentedr   r   r   
collate_fn   s   
z4SupervisedRepresentationLearningTaskMixin.collate_fn	batch_idxc                 C   sT   |d |d }}| j |  ||}t|rd S | j jd|ddddd d|iS )NrO   rP   z
loss/trainFTon_stepon_epochprog_barloggerloss)rQ   	loss_functorchisnanlog)r   rw   r{   rO   rP   r   r   r   r   training_step   s   
	z7SupervisedRepresentationLearningTaskMixin.training_stepprepared_dictc                 C   s&   t | jtrt| j |d< d S d S )N
validation)
isinstancer8   r   r>   development_trial)r   r   r   r   r   prepare_validation   s   z<SupervisedRepresentationLearningTaskMixin.prepare_validationc           
      C   s  t | jtrw| jd | }t }dD ]\}|d|d }| jj|}|| jkrFt	d| d| j  d| d| j  }| jj
||\}}n | j|\}}t| j| jjj |jd  }	t|d|	f}||d|d< q|d	 |d
< |S t | jtr	 d S d S )Nr   )rK      r#   drm   rK   r   rO   	referencerP   )r   r8   r   prepared_datar6   rQ   r1   get_durationr(   r
   rW   rX   rY   rZ   r[   r]   r^   r   )
r   idxtrialrk   r#   r(   middlerO   rd   rf   r   r   r   val__getitem__   s0   
z8SupervisedRepresentationLearningTaskMixin.val__getitem__c                 C   s.   t | jtrt| jd S t | jtrdS d S )Nr   r   )r   r8   r   lenr   r   r   r   r   r   
val__len__  s
   z4SupervisedRepresentationLearningTaskMixin.val__len__c                 C   s   t | jtrLt   | |d  }| |d  }t||}W d    n1 s-w   Y  |d }| j	|| | jj
| jj	ddddd d S d S )NX1X2rP   FTr|   )r   r8   r   r   no_gradrQ   detachr]   cosine_similarityvalidation_metriclog_dict)r   rw   r{   emb1emb2y_predy_truer   r   r   validation_step  s    

z9SupervisedRepresentationLearningTaskMixin.validation_stepr   )r:   )__name__
__module____qualname____doc__propertyintr   setterr   r   rG   r   r   r   r   strrJ   rj   rr   rz   r   r   r   r   r   r   r   r   r   r   +   s4    
,
M
r   )rX   typingr   r   r   r   torch.nn.functionalnn
functionalr]   pyannote.audio.core.taskr   r   r   *pyannote.audio.torchmetrics.classificationr   pyannote.audio.utils.randomr	   pyannote.corer
   pyannote.database.protocolr   r   torch.utils.data._utils.collater   torchmetricsr   torchmetrics.classificationr   r   r   r   r   r   r   <module>   s   