o
    ei8                     @   s   d Z ddlZddlmZ ddlmZ zddlmZ W n ey7   dZ	e	d7 Z	e	d7 Z	e	d	7 Z	e	d
7 Z	ee	w ddl
Z
eeZdd Zdd Zdd Z			dddZdd ZdS )zE
Utilities for training kmeans model.

Author
 * Pooneh Mousavi 2023
    N)tqdm)
get_logger)MiniBatchKMeansz=The optional dependency sklearn is needed to use this module
z=Cannot import sklearn.cluster.MiniBatchKMeans to use KMeans/
z%Please follow the instructions below
z=============================
zpip install -U scikit-learn
c                 C   s\   |  |} | j\}}| || |}}|||| jdd}|| d   dS )a  Extract features (output of SSL model) and acculamte them on cpu to be used for clustering.

    Arguments
    ---------
    batch : tensor
        Single batch of data.
    features_list : list
        accumulate features list.
    ssl_model : torch.nn.Module
        SSL-model used to  extract features used for clustering.
    ssl_layer_num : int
        specify output of which layer of the ssl_model should be used.
    device : str
        `cpu` or `cuda` device.
    )end_dimcpuN)tosigflattenextenddetachnumpy)batchfeatures_list	ssl_modelssl_layer_numdevicewavswav_lensfeats r   V/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/utils/kmeans.pyaccumulate_and_extract_features   s   

r   c
           
      C   sZ   t j|	rtd|	 d t|	S td|	 d t| ||||||||ddddS )	a*  Return a k-means clustering model with specified parameters.

    Arguments
    ---------
    n_clusters : MiniBatchKMeans
        The number of clusters to form as well as the number of centroids to generate.
    init : int
        Method for initialization: {'k-means++'', ''random''}
    max_iter : int
        Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion heuristics.
    batch_size : int
        Size of the mini batches.
    tol : float
        Control early stopping based on the relative center changes as measured by a smoothed, variance-normalized of the mean center squared position changes.
    max_no_improvement :int
        Control early stopping based on the consecutive number of mini batches that does not yield an improvement on the smoothed inertia.
    n_init : int
        Number of random initializations that are tried
    reassignment_ratio : float
        Control the fraction of the maximum number of counts for a center to be reassigned.
    random_state :int
        Determines random number generation for centroid initialization and random reassignment.
    checkpoint_path : str
        Path to saved model.

    Returns
    -------
    MiniBatchKMeans
        a k-means clustering model with specified parameters.
    zThe checkpoint is loaded from .zNo checkpoint is found at z(. New model is initialized for training.   TN)
n_clustersinitmax_iter
batch_sizetolmax_no_improvementn_initreassignment_ratiorandom_stateverbosecompute_labels	init_size)ospathexistsloggerinfojoblibloadr   )
r   r   r   r   r   r    r!   r"   r#   checkpoint_pathr   r   r   fetch_kmeans_model8   s(   *

r/   c                 C   sF   t dt| |D ]}| |||  }t||k r dS ||}qdS )a  Process data in chunks of a specified size.

    Arguments
    ---------
    data : list
        The list of integers to be processed.
    chunk_size : int
        The size of each chunk.
    model : MiniBatchKMeans
        The initial kmeans model for training.
    r   N)rangelenpartial_fit)data
chunk_sizemodelichunkr   r   r   process_chunksy   s   r8     r   
   c              
   C   s  t d g }d}	t|ddk}
|
D ]K}t||||| t||kr.t|||  |	d7 }	g }|	d | dkr]t d|	 d |tj	|d tj	|d	| j
 d
| d}t| | qt||krrt|||  W d   dS W d   dS 1 s}w   Y  dS )a  Train a  Kmeans model .

    Arguments
    ---------
    model : MiniBatchKMeans
        The initial kmeans model for training.
    train_set : Dataloader
        Batches of tarining data.
    ssl_model : torch.nn.Module
        SSL-model used to  extract features used for clustering.
    save_path: string
        Path to save intra-checkpoints and dataloader.
    ssl_layer_num : int
        Specify output of which layer of the ssl_model should be used.
    kmeans_batch_size : int
        Size of the mini batches.
    device : str
        `cpu` or `cuda` device.
    checkpoint_interval: int
        Determine at which iterations to save the checkpoints.
    zStart training kmeans model.r   T)dynamic_ncolsr   z'Saving intra-checkpoints for iteration r   zdataloader-TRAIN.ckptzkmeans-cluster-z-layer-z.ptN)r*   r+   r   r   r1   r8   _speechbrain_saver'   r(   joinr   
save_model)r5   	train_setr   	save_pathr   kmeans_batch_sizer   checkpoint_intervalr   	iterationtr   r.   r   r   r   train   sD   



"rE   c                 C   s   t | t|d dS )zSave a  Kmeans model .

    Arguments
    ---------
    model : MiniBatchKMeans
        The  kmeans model to be saved.
    checkpoint_path : str
        Path to save the model.
    wbN)r,   dumpopen)r5   r.   r   r   r   r>      s   
r>   )r9   r   r:   )__doc__r'   tqdm.contribr   speechbrain.utils.loggerr   sklearn.clusterr   ImportErrorerr_msgr,   __name__r*   r   r/   r8   rE   r>   r   r   r   r   <module>   s0    A
D