o
    {ima                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl Z ddl!m"Z" ddl#Z#dZ$eddd\Z%Z&ee%e&dd\Z%Z&e 'e%Z%e%( Z)ej*dge)d< ej*ej*ge)d< dd Z+dd Z,dd Z-dd Z.d d! Z/d"d# Z0d$d% Z1d&d' Z2d(d) Z3d*d+ Z4d,d- Z5d.d/ Z6d0d1 Z7d2d3 Z8d4d5 Z9d6d7 Z:d8d9 Z;d:d; Z<d<d= Z=d>d? Z>d@dA Z?dBdC Z@dDdE ZAdFdG ZBdHdI ZCdJdK ZDdLdM ZEdNdO ZFdPdQ ZGdRdS ZHdTdU ZIdVdW ZJdXdY ZKdZd[ ZLd\d] ZMd^d_ ZNd`da ZOdbdc ZPddde ZQdfdg ZRdhdi ZSdjdk ZTe jUjVdldmdndo ZWdS )pzi
Tests for HDBSCAN clustering algorithm
Shamelessly based on (i.e. ripped off from) the DBSCAN test code
    N)distance)sparse)stats)check_estimator)assert_array_equalassert_array_almost_equal)HDBSCANhdbscanvalidity_indexapproximate_predictapproximate_predict_scoresmembership_vectorall_points_membership_vectors)
make_blobs)shuffle)StandardScaler)mode)mkdtempwraps)datasets      
   )	n_samplesrandom_state   )r         c                  C   s   t  t} | jd dksJ | jd dksJ | jd dks!J | jd dks*J | jd dks3J ttddttdd }t  t| }t|j| j| sUJ dS )zcTests if nan data are treated as infinite distance from all other points and assigned to -1 clusterr   r   r      r   N)	r   fitX_missing_datalabels_probabilities_listrangenpallclose)modelclean_indicesclean_model r,   N/home/ubuntu/.local/lib/python3.10/site-packages/hdbscan/tests/test_hdbscan.pytest_missing_data1   s   r.   c                       t   fdd}|S )zhTest decorator that skips test if matplotlib not installed.

    Parameters
    ----------
    func
    c                     sX   zdd l }|d dd lm} |  W n ty$   td Y d S w  | i |S )Nr   AggzMatplotlib not available.)
matplotlibusematplotlib.pyplotpyplotfigureImportErrorpytestskip)argskwargsr1   pltfuncr,   r-   run_testF   s   
zif_matplotlib.<locals>.run_testr   r=   r>   r,   r<   r-   if_matplotlib>   s   r@   c                    r/   )z7Test decorator that skips test if pandas not installed.c                     :   zdd l }W n ty   td Y d S w  | i |S )Nr   zPandas not available.)pandasr6   r7   r8   )r9   r:   rB   r<   r,   r-   r>   [      zif_pandas.<locals>.run_testr   r?   r,   r<   r-   	if_pandasX      rD   c                    r/   )z9Test decorator that skips test if networkx not installed.c                     rA   )Nr   zNetworkX not available.)networkxr6   r7   r8   )r9   r:   rF   r<   r,   r-   r>   j   rC   zif_networkx.<locals>.run_testr   r?   r,   r<   r-   if_networkxg   rE   rG   c                  C   sZ   t jdddgddd\} }t jdddd\}}tjjdd	}|d
dd}t| ||gS )Nr   )g      g      @)      ?       @g      ?*   r   centerscluster_stdr   g?)r   noiser   )seed            @)2      )r   r   
make_moonsr'   randomdefault_rnguniformvstack)blobs_moonsrngrN   r,   r,   r-   generate_noisy_datav   s   
r]   c                 C   s   d}t | D ](}|| |k }z
t|dd\}}W n
   t|\}}Y |t||d k7 }qt |D ](}| ||k }z
t|dd\}}W n
   t|\}}Y |t||d k7 }q3|d S )N        T)keepdimsr   rI   )setr   r'   sum)labels1labels2
num_missedlabelmatches
match_mode
mode_countr,   r,   r-   homogeneity   s    ri   c            
      C   s   t t t} | t|  } t| dd\}}}}}}tt|t	d|v  }|t
ks-J tdd| j}tt|t	d|v  }|t
ksHJ t| |ddd}	|	dksVJ d S )Nprecomputedmetricr   rS   )rl   dg333333?)r   
squareformpdistXr'   maxr	   lenr`   int
n_clustersr   r!   r#   r
   )
Dlabelsppersistctreeltreemtreen_clusters_1n_clusters_2validityr,   r,   r-   test_hdbscan_distance_matrix   s   r   c            
      C   s   t t t} | t|  } t|  d}d| | |k< t	
| } |   t| dd\}}}}}}tt|td|v  }|tksDJ tddd| j}tt|td|v  }	|	tks`J d S )NrR   r^   rj   rk   r   T)rl   gen_min_span_tree)r   rn   ro   rp   r'   rq   r   scoreatpercentileflattenr   
csr_matrixeliminate_zerosr	   rr   r`   rs   rt   r   r!   r#   )
ru   	thresholdrv   rw   rx   ry   rz   r{   r|   r}   r,   r,   r-   #test_hdbscan_sparse_distance_matrix   s   
r   c            	      C   s   t t\} }}}}}tt| td| v  }|tksJ t tj} tt| td| v  }|tks5J t	t| }|dks@J d S )Nr   皙?)
r	   rp   rr   r`   rs   rt   r   r!   r#   r
   )	rv   rw   rx   ry   rz   r{   r|   r}   r~   r,   r,   r-   test_hdbscan_feature_vector   s   
r   c                  C      t tdd\} }}}}}tt| td| v  }|tksJ tdddtj} tt| td| v  }|tks:J t	
t t tddd W d    d S 1 sRw   Y  d S )Nprims_kdtree	algorithmr   Tr   r   	russelraor   rl   r	   rp   rr   r`   rs   rt   r   r!   r#   r7   raises
ValueErrorrv   rw   rx   ry   rz   r{   r|   r}   r,   r,   r-   test_hdbscan_prims_kdtree      "r   c                  C   r   )Nprims_balltreer   r   Tr   cosiner   r   r   r,   r,   r-   test_hdbscan_prims_balltree   r   r   c                  C   r   )Nboruvka_kdtreer   r   Tr   r   r   r   r   r,   r,   r-   test_hdbscan_boruvka_kdtree   r   r   c                  C   r   )Nboruvka_balltreer   r   Tr   r   r   r   r   r,   r,   r-   test_hdbscan_boruvka_balltree   s   "r   c                  C   sx   t tdd\} }}}}}tt| td| v  }|tksJ tdddtj} tt| td| v  }|tks:J d S )Ngenericr   r   Tr   )	r	   rp   rr   r`   rs   rt   r   r!   r#   r   r,   r,   r-   test_hdbscan_generic   s   r   c                  C   s>   t  t} | d}tt|td|v  }t|ksJ d S )Ng333333?r   )r   r!   rp   dbscan_clusteringrr   r`   rs   rt   )	clustererrv   r|   r,   r,   r-   test_hdbscan_dbscan_clustering  s   
r   c            
      C   s   t dddd\} }t | } t| \}}}}}}tt|td|v  }|tks+J tddt	
| jd d	| j}tt|td|v  }	|	tksNJ d S )
NrR   r   @   r   r   
n_featuresr   best
seuclideanr   )r   rl   V)r   r   fit_transformr	   rr   r`   rs   rt   r   r'   onesshaper!   r#   )
Hyrv   rw   rx   ry   rz   r{   r|   r}   r,   r,   r-   test_hdbscan_high_dimensional  s   r   c                  C   s   t tdttjd d\} }}}}}tt| td| v  }|tks%J t	dttjd d
tj} tt| td| v  }|tksGJ d S )Nr   r   )rl   r   r   )r	   rp   r'   r   r   rr   r`   rs   rt   r   r!   r#   r   r,   r,   r-   !test_hdbscan_best_balltree_metric  s    r   c                  C   s   t tttd d\} }}}}}tt| td| v  }|dks"J tttd dtj} tt| td| v  }|dksAJ d S )Nr   min_cluster_sizer   r   )r	   rp   rr   r`   rs   r   r!   r#   r   r,   r,   r-   test_hdbscan_no_clusters*  s    r   c                  C   s   t dttd dD ]I} tt| d\}}}}}}dd |D }t|dkr1tt|| ks1J t| dtj	}dd |D }t|dkrStt|| ksSJ q
d S )NrS   r   r   c                 S      g | ]}|d kr|qS r   r,   .0re   r,   r,   r-   
<listcomp>9      z1test_hdbscan_min_cluster_size.<locals>.<listcomp>r   c                 S   r   r   r,   r   r,   r,   r-   r   >  r   )
r&   rr   rp   r	   r'   minbincountr   r!   r#   )r   rv   rw   rx   ry   rz   r{   true_labelsr,   r,   r-   test_hdbscan_min_cluster_size4  s   r   c            	      C   s|   t j} tt| d\}}}}}}tt|td|v  }|tks!J t| d	tj
}tt|td|v  }|tks<J d S )Nrk   r   )r   	euclideanr	   rp   rr   r`   rs   rt   r   r!   r#   )	rl   rv   rw   rx   ry   rz   r{   r|   r}   r,   r,   r-   test_hdbscan_callable_metricC  s   r   c                  C   s    ddgddgg} t  |  d S )NrH   rI   rQ   g      @)r   r!   rp   r,   r,   r-   test_hdbscan_input_listsP  s   r   c            	      C      t  } t| dd\}}}}}}t| dd\}}}}}}t||}|t| jd  dk s-J tdd| }tdd| }t||}|t| jd  dk sOJ d S )Nr   r   r   r   333333?r]   r	   ri   floatr   r   fit_predict	datalabels_primsrw   rx   ry   rz   r{   labels_boruvkanum_mismatchesr,   r,   r-   #test_hdbscan_boruvka_kdtree_matchesU     

r   c            	      C   r   )Nr   r   r   r   r   r   r   r,   r,   r-   %test_hdbscan_boruvka_balltree_matchesj  r   r   c                  C   sB   t ddt} t| jjddddd t| jjdddd d S )	NTr   )rgbReds)select_clusterslabel_clustersselection_palettecmapFnone)log_sizecolorbarr   )r   r!   rp   r@   condensed_tree_plotr   r,   r,   r-   test_condensed_tree_plot  s   


r   c                  C   s@   t ddt} t| jjdd t| jjdddddd	 d S )
NTr   r   )r   Flastpr   r   )vary_line_widthtruncate_moderw   r   r   )r   r!   rp   r@   single_linkage_tree_r   r   r,   r,   r-   test_single_linkage_tree_plot  s
   


r   c                  C   s   t ddt} t| jjdd tdddd\}}t |}t dd|} t| jjdd	d	d
 tdddd\}}t |}t dd|} t| jjdd	d	d
 d S )NTr   r   )	edge_cmaprR   r   r   r   F)r   r   r   (   )	r   r!   rp   r@   minimum_spanning_tree_r   r   r   r   )r   r   r   r,   r,   r-   test_min_span_tree_plot  s   


r   c                  C   s2   t ddt} | j  | j  | j  d S NTr   )r   r!   rp   r   to_numpyr   r   r   r,   r,   r-   test_tree_numpy_output_formats  s   

r   c                  C   >   t ddt} t| jj  t| jj  t| jj  d S r   )r   r!   rp   rD   r   	to_pandasr   r   r   r,   r,   r-   test_tree_pandas_output_formats     r   c                  C   r   r   )r   r!   rp   rG   r   to_networkxr   r   r   r,   r,   r-   !test_tree_networkx_output_formats  r   r   c                  C   s&   t ddt} | j}|d usJ d S r   )r   r!   rp   outlier_scores_)r   scoresr,   r,   r-   test_hdbscan_outliers  s   r   c                  C   s   t ddt} t| tddgg\}}|dksJ t| tddgg\}}|dks.J t| tddgg\}}|d	ksAJ d S )
NTprediction_datag      rP   rS         ?r   r^   r   )r   r!   rp   r   r'   array)r   clusterprobr,   r,   r-    test_hdbscan_approximate_predict  s   r   c                  C   s8  t ddt} tt t| t W d    n1 sw   Y  |   tt t| t	g dg W d    n1 sAw   Y  t
jdd#}t
d t| t	ddgg d	t|d
 jv shJ W d    n1 srw   Y  t ddt} t| t}t|| j | dksJ | dksJ d S )Nr   r   )r   rS   r   T)recordalwaysr   rP   z,Clusterer does not have any defined clustersr   r   r   r   )r   r!   rp   r7   r   r   r   generate_prediction_datar'   r   warningscatch_warningssimplefilterstrmessager   r   r   rq   )r   wr   r,   r,   r-   &test_hdbscan_approximate_predict_score  s$   

r  c                  C   s8   t dddt} t| }t|t| jjj	d  d S )NTr   )r   r   r   )
r   r!   rp   r   r   r'   zerosprediction_data_raw_datar   )r   vectsr,   r,   r-   *test_hdbscan_all_points_membership_vectors   s   r  c                   C   s  t t tdd W d    n1 sw   Y  t t td d W d    n1 s/w   Y  t t ttdd W d    n1 sJw   Y  t t ttdd W d    n1 sew   Y  t t ttdd W d    n1 sw   Y  t t ttdd W d    n1 sw   Y  t t ttd d W d    n1 sw   Y  t t ttddd	 W d    n1 sw   Y  t t ttddd
d W d    n1 sw   Y  t t ttdddd W d    n	1 sw   Y  t t ttdddd W d    n	1 s+w   Y  t t ttddd W d    n	1 sHw   Y  t t ttdd
d W d    n	1 sew   Y  t t ttddd W d    n	1 sw   Y  t t ttddd W d    n	1 sw   Y  t t ttdd W d    n	1 sw   Y  t t ttdd W d    n	1 sw   Y  t t ttdd W d    n	1 sw   Y  t t ttdd d	 W d    n	1 sw   Y  t t ttdd W d    n	1 s,w   Y  t t ttdd W d    n	1 sHw   Y  t t ttdd W d    n	1 sdw   Y  t t ttdd W d    d S 1 sw   Y  d S )Nfailr   r   )min_samplesr   imperialrk   	minkowski)rl   rw   r   )rl   rw   r   r   r   rj   r   )rl   r   )alphasomething_elser   r   )	leaf_size)cluster_selection_epsilon)cluster_selection_persistence)cluster_selection_epsilon_max)r7   r   r   r	   rp   	Exception	TypeErrorr,   r,   r,   r-   test_hdbscan_badargs&  s   $r  c                  C   s@   t t} t | j}tt|td|v  }|dksJ d S )Nr   r   )	r   r   rp   r   r!   r#   rr   r`   rs   )sparse_Xrv   rt   r,   r,   r-   test_hdbscan_sparseW  s   
r  c                  C   sp   t  } t| ddtj}t| dddtj}tt|td|v  }tt|td|v  }||ks6J d S )Nr   )memoryr	  r    )r  r	  r   r   )r   r   r!   rp   r#   rr   r`   rs   )cachedirrb   rc   n_clusters1n_clusters2r,   r,   r-   test_hdbscan_caching`  s   r  c                  C   sn   ddg} t dd| dd\}}t |}t| D ]\}}||}t||dd ||}t||dd qd S )	N)r^   r^   )rQ   rQ   i  r         ?)r   r   rL   rM   r   )decimal)r   r   r!   	enumerateweighted_cluster_centroidr   weighted_cluster_medoid)rL   r   r   r   idxcentercentroidmedoidr,   r,   r-   test_hdbscan_centroids_medoidsj  s   

r%  c                  C   sz   t  t} tt | d W d    n1 sw   Y  tt | d W d    d S 1 s6w   Y  d S )Nr   )r   r!   rp   r7   r   r   r  r   r   r,   r,   r-   )test_hdbscan_no_centroid_medoid_for_noisew  s   "r&  c                  C   s   t jd t jdd} tddddd| }t j|dd	\}}t|dks)J ||d
k dks3J tddddd| }t j|dd	\}}t|dksOJ ||d
k dksYJ d S )Nr      rS   r   r^   eomT)r   r  cluster_selection_methodallow_single_cluster)return_countsr   .   皙?r'   rU   rO   randr   r   uniquerr   )no_structurerv   unique_labelscountsr,   r,   r-   .test_hdbscan_allow_single_cluster_with_epsilon  s0   r4  c                  C   sL   t jd t jdd} tdddd| }t |}t|dks$J d S )	Nr   r'  rS   r   r  r(  )r   r  r)  r   r.  )r1  rv   r2  r,   r,   r-   "test_hdbscan_persistence_threshold  s   
r5  c                  C   s   t dg dddd\} }tddd}||  tt|jtd	d
g tddd}||  tt|jtg d dS )z{Test that reducing the cluster_selection_epsilon_max parameter
    results in more clusters with smaller sizes being found.rR   )r   r   )r   r   )r   r   )r   r   r-  rJ   rK   rI   T)r  r*  r   r   rH   )r   r   r   rS   r   N)r   r   r!   r   r'   r0  r#   r   rY   rZ   r   r,   r,   r-   *test_hdbscan_cluster_selection_epsilon_max  s   


 r8  c                  C   sL   t dg dddd\} }tddd}||  	 td	d
dd}||  d S )NrR   r6  r-  rJ   rK   r   T)max_cluster_sizer*  g=
ףp=?r   )r  r  r*  )r   r   r!   r7  r,   r,   r-   -test_hdbscan_parameters_do_not_trigger_errors  s    


r:  z(need to refactor to meet newer standards)reasonc                   C   s   t t d S )N)r   r   r,   r,   r,   r-   !test_hdbscan_is_sklearn_estimator  s   r<  )X__doc__numpyr'   scipy.spatialr   scipyr   r   sklearn.utils.estimator_checksr   sklearn.utils._testingr   r   r	   r   r
   r   r   r   r   sklearn.datasetsr   sklearn.utilsr   sklearn.preprocessingr   scipy.statsr   tempfiler   	functoolsr   r7   sklearnr   r   rt   rp   r   r   copyr"   nanr.   r@   rD   rG   r]   ri   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r%  r&  r4  r5  r8  r:  markr8   r<  r,   r,   r,   r-   <module>   s    $	
	%
/1	
