From cd19b57742e7edaba1bfaffa3d058dcdf3874add Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 6 Apr 2019 21:37:30 -0400 Subject: [PATCH 01/24] Added kmedoids code --- sklearn_extra/cluster/__init__.py | 5 + sklearn_extra/cluster/k_medoids_.py | 398 ++++++++++++++++++++++++++++ 2 files changed, 403 insertions(+) create mode 100644 sklearn_extra/cluster/__init__.py create mode 100644 sklearn_extra/cluster/k_medoids_.py diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py new file mode 100644 index 00000000..d30e7b64 --- /dev/null +++ b/sklearn_extra/cluster/__init__.py @@ -0,0 +1,5 @@ +from .k_medoids_ import KMedoids + +__all__ = [ + 'KMedoids', +] diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py new file mode 100644 index 00000000..5c26cbc4 --- /dev/null +++ b/sklearn_extra/cluster/k_medoids_.py @@ -0,0 +1,398 @@ +# -*- coding: utf-8 -*- +"""K-medoids clustering""" + +# Authors: Timo Erkkilä +# Antti Lehmussola +# Kornel Kiełczewski +# Zane Dufour +# License: BSD 3 clause + +import warnings + +import numpy as np + +from ..base import BaseEstimator, ClusterMixin, TransformerMixin +from ..metrics.pairwise import pairwise_distances, pairwise_distances_argmin +from ..utils import check_array, check_random_state +from ..utils.extmath import stable_cumsum +from ..utils.validation import check_is_fitted +from ..exceptions import ConvergenceWarning + + +class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): + """k-medoids clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of medoids to + generate. + + metric : string, or callable, optional, default: 'euclidean' + What distance metric to use. See :func:metrics.pairwise_distances + + init : {'random', 'heuristic'}, optional, default: 'heuristic' + Specify medoid initialization method. 
Random selects n_clusters + elements from the dataset, while heuristic picks the n_clusters points + with the smallest sum distance to every other point. + + max_iter : int, optional, default : 300 + Specify the maximum number of iterations when fitting. + + random_state : int, RandomState instance or None, optional + Specify random state for the random number generator. Used to + initialise medoids when init='random'. + + Attributes + ---------- + cluster_centers_ : array, shape = (n_clusters, n_features) + or None if metric == 'precomputed' + Cluster centers, i.e. medoids (elements from the original dataset) + + medoid_indices_ : array, shape = (n_clusters,) + The indices of the medoid rows in X + + labels_ : array, shape = (n_samples,) + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + + Examples + -------- + >>> from sklearn.cluster import KMedoids + >>> import numpy as np + + >>> X = np.asarray([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> kmedoids = KMedoids(n_clusters=2, random_state=0).fit(X) + >>> kmedoids.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> kmedoids.predict([[0,0], [4,4]]) + array([0, 1]) + >>> kmedoids.cluster_centers_ + array([[1, 2], + [4, 2]]) + >>> kmedoids.inertia_ + 8.0 + + References + ---------- + Kaufman, L. and Rousseeuw, P.J., Statistical Data Analysis Based on + the L1–Norm and Related Methods, edited by Y. Dodge, North-Holland, + 405–416. 1987 + + See also + -------- + + KMeans + The KMeans algorithm minimizes the within-cluster sum-of-squares + criterion. It scales well to large number of samples. + + Notes + ----- + Since all pairwise distances are calculated and stored in memory for + the duration of fit, the space complexity is O(n_samples ** 2). 
+ """ + + def __init__(self, n_clusters=8, metric='euclidean', + init='heuristic', max_iter=300, random_state=None): + self.n_clusters = n_clusters + self.metric = metric + self.init = init + self.max_iter = max_iter + self.random_state = random_state + + def _check_nonnegative_int(self, value, desc): + """Validates if value is a valid integer > 0""" + + if (value is None or value <= 0 or + not isinstance(value, (int, np.integer))): + raise ValueError("%s should be a nonnegative integer. " + "%s was given" % (desc, value)) + + def _check_init_args(self): + """Validates the input arguments. """ + + # Check n_clusters and max_iter + self._check_nonnegative_int(self.n_clusters, "n_clusters") + self._check_nonnegative_int(self.max_iter, "max_iter") + + # Check init + init_methods = ['random', 'heuristic', 'k-medoids++'] + if self.init not in init_methods: + raise ValueError("init needs to be one of " + + "the following: " + + "%s" % init_methods) + + def fit(self, X, y=None): + """Fit K-Medoids to the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = (n_samples, n_features), \ + or (n_samples, n_samples) if metric == 'precomputed' + Dataset to cluster. + + y : Ignored + + Returns + ------- + self + """ + random_state_ = check_random_state(self.random_state) + + self._check_init_args() + X = check_array(X, accept_sparse=['csr', 'csc']) + if self.n_clusters > X.shape[0]: + raise ValueError("The number of medoids (%d) must be less " + "than the number of samples %d." 
+ % (self.n_clusters, X.shape[0])) + + D = pairwise_distances(X, metric=self.metric) + medoid_idxs = self._initialize_medoids(D, + self.n_clusters, + random_state_, + ) + labels = None + + # Continue the algorithm as long as + # the medoids keep changing and the maximum number + # of iterations is not exceeded + for self.n_iter_ in range(0, self.max_iter): + old_medoid_idxs = np.copy(medoid_idxs) + labels = np.argmin(D[medoid_idxs, :], axis=0) + + # Update medoids with the new cluster indices + self._update_medoid_idxs_in_place(D, labels, medoid_idxs) + if np.all(old_medoid_idxs == medoid_idxs): + break + elif self.n_iter_ == self.max_iter - 1: + warnings.warn("Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning) + + # Set the resulting instance variables. + if self.metric == "precomputed": + self.cluster_centers_ = None + else: + self.cluster_centers_ = X[medoid_idxs] + + # Expose labels_ which are the assignments of + # the training data to clusters + self.labels_ = labels + self.medoid_indices_ = medoid_idxs + self.inertia_ = self._compute_inertia(self.transform(X)) + + # Return self to enable method chaining + return self + + def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): + """In-place update of the medoid indices""" + + # Update the medoids for each cluster + for k in range(self.n_clusters): + # Extract the distance matrix between the data points + # inside the cluster k + cluster_k_idxs = np.where(labels == k)[0] + + if len(cluster_k_idxs) == 0: + warnings.warn( + "Cluster {k} is empty! 
" + "self.labels_[self.medoid_indices_[{k}]] " + "may not be labeled with " + "its corresponding cluster ({k}).".format(k=k)) + continue + + in_cluster_distances = D[cluster_k_idxs, + cluster_k_idxs[:, np.newaxis]] + + # Calculate all costs from each point to all others in the cluster + in_cluster_all_costs = np.sum(in_cluster_distances, axis=1) + + min_cost_idx = np.argmin(in_cluster_all_costs) + min_cost = in_cluster_all_costs[min_cost_idx] + curr_cost = in_cluster_all_costs[ + np.argmax(cluster_k_idxs == medoid_idxs[k])] + + # Adopt a new medoid if its distance is smaller then the current + if min_cost < curr_cost: + medoid_idxs[k] = cluster_k_idxs[min_cost_idx] + + def transform(self, X): + """Transforms X to cluster-distance space. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Data to transform. + + Returns + ------- + X_new : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + X transformed in the new space of distances to cluster centers. + """ + X = check_array(X, accept_sparse=['csr', 'csc']) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return X[:, self.medoid_indices_] + else: + check_is_fitted(self, "cluster_centers_") + + Y = self.cluster_centers_ + return pairwise_distances(X, Y=Y, + metric=self.metric) + + def predict(self, X): + """Predict the closest cluster for each sample in X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + New data to predict. + + Returns + ------- + labels : array, shape = (n_samples,) + Index of the cluster each sample belongs to. 
+ """ + X = check_array(X, accept_sparse=['csr', 'csc']) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return np.argmin(X[:, self.medoid_indices_], axis=1) + else: + check_is_fitted(self, "cluster_centers_") + + # Return data points to clusters based on which cluster assignment + # yields the smallest distance + return pairwise_distances_argmin(X, Y=self.cluster_centers_, + metric=self.metric) + + def _compute_inertia(self, distances): + """Compute inertia of new samples. Inertia is defined as the sum of the + sample distances to closest cluster centers. + + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + def _initialize_medoids(self, D, n_clusters, random_state_): + """Select initial medoids when beginning clustering.""" + + if self.init == 'random': # Random initialization + # Pick random k medoids as the initial ones. + medoids = random_state_.choice(len(D), n_clusters) + elif self.init == 'k-medoids++': + medoids = self._kpp_init(D, random_state_) + elif self.init == "heuristic": # Initialization by heuristic + # Pick K first data points that have the smallest sum distance + # to every other point. These are the initial medoids. + medoids = np.argpartition(np.sum(D, axis=1), + n_clusters-1)[:n_clusters] + else: + raise ValueError("init value '{init}' not recognized" + .format(init=self.init)) + + return medoids + + def _kpp_init(self, D, random_state_, n_local_trials=None): + """Init n_clusters seeds with a method similar to k-means++ + + Parameters + ----------- + D : array, shape (n_samples, n_samples) + The distance matrix we will use to select medoid indices. 
+ + n_clusters : integer + The number of seeds to choose + + x_squared_norms : array, shape (n_samples,) + Squared Euclidean norm of each data point. + + random_state : RandomState + The generator used to initialize the centers. + + n_local_trials : integer, optional + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Notes + ----- + Selects initial cluster centers for k-medoid clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, + which is the implementation used in the aforementioned paper. + """ + n_samples, _ = D.shape + + centers = np.empty(self.n_clusters, dtype=int) + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. 
+ n_local_trials = 2 + int(np.log(self.n_clusters)) + + center_id = random_state_.randint(n_samples) + centers[0] = center_id + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = D[centers[0], :]**2 + current_pot = closest_dist_sq.sum() + + # pick the remaining self.n_clusters-1 points + for cluster_index in range(1, self.n_clusters): + rand_vals = (random_state_.random_sample(n_local_trials) + * current_pot) + candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), + rand_vals) + + # Compute distances to center candidates + distance_to_candidates = D[candidate_ids, :]**2 + + # Decide which candidate is the best + best_candidate = None + best_pot = None + best_dist_sq = None + for trial in range(n_local_trials): + # Compute potential when including center candidate + new_dist_sq = np.minimum(closest_dist_sq, + distance_to_candidates[trial]) + new_pot = new_dist_sq.sum() + + # Store result if it is the best local trial so far + if (best_candidate is None) or (new_pot < best_pot): + best_candidate = candidate_ids[trial] + best_pot = new_pot + best_dist_sq = new_dist_sq + + centers[cluster_index] = best_candidate + current_pot = best_pot + closest_dist_sq = best_dist_sq + + return centers From 3e184446f40596e2062d92deba06b320f568e470 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Mon, 22 Apr 2019 21:39:39 -0400 Subject: [PATCH 02/24] changed k_medoids_ imports to absolute --- sklearn_extra/cluster/k_medoids_.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py index 5c26cbc4..19a22d8e 100644 --- a/sklearn_extra/cluster/k_medoids_.py +++ b/sklearn_extra/cluster/k_medoids_.py @@ -11,12 +11,12 @@ import numpy as np -from ..base import BaseEstimator, ClusterMixin, TransformerMixin -from ..metrics.pairwise import pairwise_distances, pairwise_distances_argmin -from ..utils import check_array, check_random_state -from 
..utils.extmath import stable_cumsum -from ..utils.validation import check_is_fitted -from ..exceptions import ConvergenceWarning +from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin +from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import check_is_fitted +from sklearn.exceptions import ConvergenceWarning class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): From d4c086c7addd7173935f25c986e18d1072d84149 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sun, 28 Apr 2019 21:53:10 -0400 Subject: [PATCH 03/24] Added .vscode to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 442f8c2a..498fbc60 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ __pycache__/ # C extensions *.so +# Text Editors +.vscode/ + # scikit-learn specific doc/_build/ doc/auto_examples/ From bacc9317fcd622178e34b19789db18c1fadd9afc Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sun, 28 Apr 2019 22:17:03 -0400 Subject: [PATCH 04/24] Add venv to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 498fbc60..7d0d0c2b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ doc/datasets/generated/ # Distribution / packaging .Python +venv/ env/ build/ develop-eggs/ From 0cb8e436691b860be981edae73632ba5737a7e31 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sun, 28 Apr 2019 22:19:59 -0400 Subject: [PATCH 05/24] Added cluster tests --- sklearn_extra/cluster/tests/test_k_medoids.py | 294 ++++++++++++++++++ 1 file changed, 294 insertions(+) create mode 100644 sklearn_extra/cluster/tests/test_k_medoids.py diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py new file mode 100644 index 00000000..f5abf722 --- /dev/null +++ 
b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -0,0 +1,294 @@ +"""Testing for K-Medoids""" +import warnings +import numpy as np +from scipy.sparse import csc_matrix + +from sklearn.datasets import load_iris +from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils.testing import assert_array_equal, assert_equal +from sklearn.utils.testing import assert_raise_message, assert_warns_message +from sklearn.utils.testing import assert_allclose + +from sklearn_extra.cluster import KMedoids +from sklearn.cluster import KMeans + +seed = 0 +X = np.random.RandomState(seed).rand(100, 5) + + +def test_kmedoids_input_validation_and_fit_check(): + rng = np.random.RandomState(seed) + # Invalid parameters + assert_raise_message(ValueError, "n_clusters should be a nonnegative " + "integer. 0 was given", + KMedoids(n_clusters=0).fit, X) + + assert_raise_message(ValueError, "n_clusters should be a nonnegative " + "integer. None was given", + KMedoids(n_clusters=None).fit, X) + + assert_raise_message(ValueError, "max_iter should be a nonnegative " + "integer. 0 was given", + KMedoids(n_clusters=1, max_iter=0).fit, X) + + assert_raise_message(ValueError, "max_iter should be a nonnegative " + "integer. 
None was given", + KMedoids(n_clusters=1, max_iter=None).fit, X) + + assert_raise_message(ValueError, "init needs to be one of the following: " + "['random', 'heuristic', 'k-medoids++']", + KMedoids(init=None).fit, X) + + # Trying to fit 5 samples to 8 clusters + Xsmall = rng.rand(5, 2) + assert_raise_message(ValueError, "The number of medoids (8) must be less " + "than the number of samples 5.", + KMedoids(n_clusters=8).fit, Xsmall) + + +def test_random_deterministic(): + """Random_state should determine 'random' init output.""" + rng = np.random.RandomState(seed) + + X = load_iris()["data"] + D = euclidean_distances(X) + + medoids = KMedoids( + init="random", + )._initialize_medoids(D, 4, rng) + assert_array_equal(medoids, [47, 117, 67, 103]) + + +def test_heuristic_deterministic(): + """Result of heuristic init method should not depend on random state.""" + rng1 = np.random.RandomState(1) + rng2 = np.random.RandomState(2) + X = load_iris()["data"] + D = euclidean_distances(X) + + medoids_1 = KMedoids( + init="heuristic", + )._initialize_medoids(D, 10, rng1) + + medoids_2 = KMedoids( + init="heuristic", + )._initialize_medoids(D, 10, rng2) + + assert_array_equal(medoids_1, medoids_2) + + +def test_update_medoid_idxs_empty_cluster(): + """Label is unchanged for an empty cluster.""" + D = np.zeros((3, 3)) + labels = np.array([0, 0, 0]) + medoid_idxs = np.array([0, 1]) + kmedoids = KMedoids(n_clusters=2) + + # Swallow empty cluster warning + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + kmedoids._update_medoid_idxs_in_place(D, labels, medoid_idxs) + + assert_array_equal(medoid_idxs, [0, 1]) + + +def test_kmedoids_empty_clusters(): + """When a cluster is empty, it should throw a warning.""" + rng = np.random.RandomState(seed) + X = [[1], [1], [1]] + kmedoids = KMedoids(n_clusters=2, random_state=rng) + assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X) + + +def test_kmedoids_pp(): + """Initial clusters should be 
well-separated for k-medoids++""" + rng = np.random.RandomState(seed) + kmedoids = KMedoids(n_clusters=3, + init="k-medoids++", + random_state=rng) + X = [[10, 0], + [11, 0], + [0, 10], + [0, 11], + [10, 10], + [11, 10], + [12, 10], + [10, 11], + ] + D = euclidean_distances(X) + + centers = kmedoids._initialize_medoids(D, 3, random_state_=rng) + + assert len(centers) == 3 + + inter_medoid_distances = D[centers][:, centers] + assert np.all((inter_medoid_distances > 5) | (inter_medoid_distances == 0)) + + +def test_precomputed(): + """Test the 'precomputed' distance metric.""" + rng = np.random.RandomState(seed) + X_1 = [ + [1.0, 0.0], + [1.1, 0.0], + [0.0, 1.0], + [0.0, 1.1] + ] + D_1 = euclidean_distances(X_1) + X_2 = [ + [1.1, 0.0], + [0.0, 0.9] + ] + D_2 = euclidean_distances(X_2, X_1) + + kmedoids = KMedoids(metric="precomputed", + n_clusters=2, + random_state=rng, + ) + kmedoids.fit(D_1) + + assert_allclose(kmedoids.inertia_, 0.2) + assert_array_equal(kmedoids.medoid_indices_, [2, 0]) + assert_array_equal(kmedoids.labels_, [1, 1, 0, 0]) + assert kmedoids.cluster_centers_ is None + + med_1, med_2 = tuple(kmedoids.medoid_indices_) + predictions = kmedoids.predict(D_2) + assert_array_equal(predictions, [med_1 // 2, med_2 // 2]) + + transformed = kmedoids.transform(D_2) + assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_]) + + +def test_kmedoids_fit_naive(): + n_clusters = 3 + metric = 'euclidean' + + model = KMedoids(n_clusters=n_clusters, metric=metric) + Xnaive = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + model.fit(Xnaive) + + assert_array_equal(model.cluster_centers_, + [[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + assert_array_equal(model.labels_, [0, 1, 2]) + assert model.inertia_ == 0. 
+ + # diagonal must be zero, off-diagonals must be positive + X_new = model.transform(Xnaive) + for c in range(n_clusters): + assert X_new[c, c] == 0 + for c2 in range(n_clusters): + if c != c2: + assert X_new[c, c2] > 0 + + +def test_max_iter(): + """Test that warning message is thrown when max_iter is reached.""" + rng = np.random.RandomState(seed) + X_iris = load_iris()['data'] + + model = KMedoids(n_clusters=10, + init='random', + random_state=rng, + max_iter=1, + ) + assert_warns_message(UserWarning, + "Maximum number of iteration reached before", + model.fit, + X_iris, + ) + + +def test_kmedoids_iris(): + """Test kmedoids on the Iris dataset""" + rng = np.random.RandomState(seed) + X_iris = load_iris()['data'] + + ref_model = KMeans(n_clusters=3).fit(X_iris) + + avg_dist_to_closest_centroid = ref_model\ + .transform(X_iris).min(axis=1).mean() + + for init in ['random', 'heuristic', 'k-medoids++']: + distance_metric = 'euclidean' + model = KMedoids(n_clusters=3, + metric=distance_metric, + init=init, + random_state=rng, + ) + model.fit(X_iris) + + # test convergence in reasonable number of steps + assert model.n_iter_ < (len(X_iris) // 10) + + distances = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](X_iris) + avg_dist_to_random_medoid = np.mean(distances.ravel()) + avg_dist_to_closest_medoid = model.inertia_ / X_iris.shape[0] + # We want distance-to-closest-medoid to be reduced from average + # distance by more than 50% + assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid + # When K-Medoids is using Euclidean distance, + # we can compare its performance to + # K-Means. 
We want the average distance to cluster centers + # to be similar between K-Means and K-Medoids + assert_allclose(avg_dist_to_closest_medoid, + avg_dist_to_closest_centroid, rtol=0.1) + + +def test_kmedoids_fit_predict_transform(): + rng = np.random.RandomState(seed) + model = KMedoids(random_state=rng) + + labels1 = model.fit_predict(X) + assert_equal(len(labels1), 100) + assert_array_equal(labels1, model.labels_) + + labels2 = model.predict(X) + assert_array_equal(labels1, labels2) + + Xt1 = model.fit_transform(X) + assert_array_equal(Xt1.shape, (100, model.n_clusters)) + + Xt2 = model.transform(X) + assert_array_equal(Xt1, Xt2) + + +def test_callable_distance_metric(): + rng = np.random.RandomState(seed) + + def my_metric(a, b): + return np.sqrt(np.sum(np.power(a - b, 2))) + + model = KMedoids(random_state=rng, metric=my_metric) + labels1 = model.fit_predict(X) + assert_equal(len(labels1), 100) + assert_array_equal(labels1, model.labels_) + + +def test_outlier_robustness(): + rng = np.random.RandomState(seed) + kmeans = KMeans(n_clusters=2, random_state=rng) + kmedoids = KMedoids(n_clusters=2, random_state=rng) + + X = [[-11, 0], [-10, 0], [-9, 0], + [0, 0], [1, 0], [2, 0], [1000, 0]] + + kmeans.fit(X) + kmedoids.fit(X) + + assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1]) + assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1]) + + +def test_kmedoids_on_sparse_input(): + rng = np.random.RandomState(seed) + model = KMedoids(n_clusters=2, random_state=rng) + row = np.array([1, 0]) + col = np.array([0, 4]) + data = np.array([1, 1]) + X = csc_matrix((data, (row, col)), shape=(2, 5)) + labels = model.fit_predict(X) + assert_equal(len(labels), 2) + assert_array_equal(labels, model.labels_) From 96f3a2ec14d2548c683e666b3ac58ddcca31af14 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sun, 28 Apr 2019 22:20:40 -0400 Subject: [PATCH 06/24] Fix KMedoids docstring --- sklearn_extra/cluster/k_medoids_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py index 19a22d8e..917caa47 100644 --- a/sklearn_extra/cluster/k_medoids_.py +++ b/sklearn_extra/cluster/k_medoids_.py @@ -62,7 +62,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): Examples -------- - >>> from sklearn.cluster import KMedoids + >>> from sklearn_extra.cluster import KMedoids >>> import numpy as np >>> X = np.asarray([[1, 2], [1, 4], [1, 0], From 8d9d9d6498319ca935ceb50d6bb8431e078a8364 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Mon, 29 Apr 2019 22:18:52 -0400 Subject: [PATCH 07/24] Reconfigure _kpp_init tests --- sklearn_extra/cluster/k_medoids_.py | 12 ++++---- sklearn_extra/cluster/tests/test_k_medoids.py | 29 ++++++++++++++++--- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py index 917caa47..b3209929 100644 --- a/sklearn_extra/cluster/k_medoids_.py +++ b/sklearn_extra/cluster/k_medoids_.py @@ -302,7 +302,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_): # Pick random k medoids as the initial ones. medoids = random_state_.choice(len(D), n_clusters) elif self.init == 'k-medoids++': - medoids = self._kpp_init(D, random_state_) + medoids = self._kpp_init(D, n_clusters, random_state_) elif self.init == "heuristic": # Initialization by heuristic # Pick K first data points that have the smallest sum distance # to every other point. These are the initial medoids. 
@@ -314,7 +314,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_): return medoids - def _kpp_init(self, D, random_state_, n_local_trials=None): + def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): """Init n_clusters seeds with a method similar to k-means++ Parameters @@ -349,14 +349,14 @@ def _kpp_init(self, D, random_state_, n_local_trials=None): """ n_samples, _ = D.shape - centers = np.empty(self.n_clusters, dtype=int) + centers = np.empty(n_clusters, dtype=int) # Set the number of local seeding trials if none is given if n_local_trials is None: # This is what Arthur/Vassilvitskii tried, but did not report # specific results for other than mentioning in the conclusion # that it helped. - n_local_trials = 2 + int(np.log(self.n_clusters)) + n_local_trials = 2 + int(np.log(n_clusters)) center_id = random_state_.randint(n_samples) centers[0] = center_id @@ -365,8 +365,8 @@ def _kpp_init(self, D, random_state_, n_local_trials=None): closest_dist_sq = D[centers[0], :]**2 current_pot = closest_dist_sq.sum() - # pick the remaining self.n_clusters-1 points - for cluster_index in range(1, self.n_clusters): + # pick the remaining n_clusters-1 points + for cluster_index in range(1, n_clusters): rand_vals = (random_state_.random_sample(n_local_trials) * current_pot) candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index f5abf722..0a802525 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -1,6 +1,7 @@ """Testing for K-Medoids""" import warnings import numpy as np +from unittest import mock from scipy.sparse import csc_matrix from sklearn.datasets import load_iris @@ -101,12 +102,30 @@ def test_kmedoids_empty_clusters(): assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X) +@mock.patch.object(KMedoids, '_kpp_init', return_value=object()) 
+def test_kpp_called(_kpp_init_mocked): + """KMedoids._kpp_init method should be called by _initialize_medoids""" + D = np.array([[0, 1], [1, 0]]) + n_clusters = 2 + rng = np.random.RandomState(seed) + kmedoids = KMedoids() + kmedoids.init = 'k-medoids++' + # set _kpp_init_mocked.return_value to a singleton + initial_medoids = kmedoids._initialize_medoids( + D, + n_clusters, + rng, + ) + + # assert that _kpp_init was called and its result was returned. + _kpp_init_mocked.assert_called_once_with(D, n_clusters, rng) + assert initial_medoids == _kpp_init_mocked.return_value + + def test_kmedoids_pp(): """Initial clusters should be well-separated for k-medoids++""" rng = np.random.RandomState(seed) - kmedoids = KMedoids(n_clusters=3, - init="k-medoids++", - random_state=rng) + kmedoids = KMedoids() X = [[10, 0], [11, 0], [0, 10], @@ -118,7 +137,9 @@ def test_kmedoids_pp(): ] D = euclidean_distances(X) - centers = kmedoids._initialize_medoids(D, 3, random_state_=rng) + centers = kmedoids._kpp_init(D, + n_clusters=3, + random_state_=rng) assert len(centers) == 3 From 8e534e8a9cf0f1b4a26d079de40eb096f51cfa69 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 11 May 2019 19:33:25 -0400 Subject: [PATCH 08/24] added documentation --- doc/api.rst | 10 +++++++++ doc/user_guide.rst | 56 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/doc/api.rst b/doc/api.rst index e8de935e..fcb9b8a0 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -12,3 +12,13 @@ Kernel approximation :template: class.rst kernel_approximation.Fastfood + +Clustering +==================== + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + cluster.KMedoids + diff --git a/doc/user_guide.rst b/doc/user_guide.rst index a190e568..910339c3 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -6,4 +6,58 @@ User guide ========== -To add. +.. 
_k_medoids: + +K-Medoids +========= + +:class:`KMedoids` is related to the :class:`KMeans` algorithm. While +:class:`KMeans` tries to minimize the within-cluster sum-of-squares, +:class:`KMedoids` tries to minimize the sum of distances between each point and +the medoid of its cluster. The medoid is a data point (unlike the centroid) +which has the least total distance to the other members of its cluster. The use of +a data point to represent each cluster's center allows the use of any distance +metric for clustering. + +:class:`KMedoids` can be more robust to noise and outliers than :class:`KMeans` +as it will choose one of the cluster members as the medoid while +:class:`KMeans` will move the center of the cluster towards the outlier which +might in turn move other points away from the cluster center. + +:class:`KMedoids` is also different from K-Medians, which is analogous to :class:`KMeans` +except that the Manhattan Median is used for each cluster center instead of +the centroid. K-Medians is robust to outliers, but it is limited to the +Manhattan Distance metric and, similar to :class:`KMeans`, it does not guarantee +that the center of each cluster will be a member of the original dataset. + +The complexity of K-Medoids is :math:`O(N^2 K T)` where :math:`N` is the number +of samples, :math:`T` is the number of iterations and :math:`K` is the number of +clusters. This makes it more suitable for smaller datasets in comparison to +:class:`KMeans` which is :math:`O(N K T)`. + +.. topic:: Examples: + + + +**Algorithm description:** +There are several algorithms to compute K-Medoids, though :class:`KMedoids` +currently only supports Partitioning Around Medoids (PAM). The PAM algorithm +uses a greedy search, which may fail to find the global optimum. It consists of +two alternating steps commonly called the +Assignment and Update steps (BUILD and SWAP in Kaufman and Rousseeuw, 1987). 
+ +PAM works as follows: + +* Initialize: Select ``n_clusters`` from the dataset as the medoids using + a heuristic, random, or k-medoids++ approach (configurable using the ``init`` parameter). +* Assignment step: assign each element from the dataset to the closest medoid. +* Update step: Identify the new medoid of each cluster. +* Repeat the assignment and update steps until the medoids stop changing or + the maximum number of iterations ``max_iter`` is reached. + +.. topic:: References: + + * "Clustering by Means of Medoids" + Kaufman, L. and Rousseeuw, P.J., + Statistical Data Analysis Based on the L1–Norm and Related Methods, edited + by Y. Dodge, North-Holland, 405–416. 1987 \ No newline at end of file From 4d615291d91c8d4ca0e5582c2182a0fd1cf60b24 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Thu, 25 Jul 2019 22:33:24 -0400 Subject: [PATCH 09/24] Rename k_medoids_.py -> _k_medoids.py --- sklearn_extra/cluster/__init__.py | 2 +- sklearn_extra/cluster/{k_medoids_.py => _k_medoids.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename sklearn_extra/cluster/{k_medoids_.py => _k_medoids.py} (100%) diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py index d30e7b64..6b7d4c8d 100644 --- a/sklearn_extra/cluster/__init__.py +++ b/sklearn_extra/cluster/__init__.py @@ -1,4 +1,4 @@ -from .k_medoids_ import KMedoids +from ._k_medoids import KMedoids __all__ = [ 'KMedoids', diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/_k_medoids.py similarity index 100% rename from sklearn_extra/cluster/k_medoids_.py rename to sklearn_extra/cluster/_k_medoids.py From 03f9e5492cc872d3d7d4eefe86175f8d77c9d244 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Thu, 25 Jul 2019 22:33:44 -0400 Subject: [PATCH 10/24] Update conf.py to include mathjax --- doc/conf.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index a4cf131d..6e6357df 100644 --- a/doc/conf.py +++ 
b/doc/conf.py @@ -48,13 +48,23 @@ # pngmath / imgmath compatibility layer for different sphinx versions import sphinx from distutils.version import LooseVersion -if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): - extensions.append('sphinx.ext.pngmath') -else: - extensions.append('sphinx.ext.imgmath') +# if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): +# extensions.append('sphinx.ext.pngmath') +# else: +# extensions.append('sphinx.ext.imgmath') autodoc_default_flags = ['members', 'inherited-members'] +# For maths, use mathjax by default and svg if NO_MATHJAX env variable is set +# (useful for viewing the doc offline) +if os.environ.get('NO_MATHJAX'): + extensions.append('sphinx.ext.imgmath') + imgmath_image_format = 'svg' +else: + extensions.append('sphinx.ext.mathjax') + mathjax_path = ('https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/' + 'MathJax.js?config=TeX-AMS_SVG') + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] From 2e952872def7ea7119ee9acbc1de96ebc2008c17 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:17:28 -0400 Subject: [PATCH 11/24] Add KMedoids to test_common.py --- sklearn_extra/tests/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index f22cdab0..f1b1e862 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -3,11 +3,11 @@ from sklearn.utils.estimator_checks import check_estimator from sklearn_extra.kernel_approximation import Fastfood - +from sklearn_extra.cluster import KMedoids @pytest.mark.parametrize( "Estimator", - [Fastfood] + [Fastfood, KMedoids] ) def test_all_estimators(Estimator, request): return check_estimator(Estimator) From 0e1ee5bb3651d8af2db39a8bbc6c0442f81f0903 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:17:37 -0400 Subject: [PATCH 12/24] add 
plot_kmedoids_digits.py --- examples/plot_kmedoids_digits.py | 97 ++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 examples/plot_kmedoids_digits.py diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py new file mode 100644 index 00000000..dbeab2e7 --- /dev/null +++ b/examples/plot_kmedoids_digits.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +""" +============================================================= +A demo of K-Medoids clustering on the handwritten digits data +============================================================= +In this example we compare different pairwise distance +metrics for K-Medoids. +""" +import numpy as np +import matplotlib.pyplot as plt + +from collections import namedtuple +from sklearn.cluster import KMeans +from sklearn_extra.cluster import KMedoids +from sklearn.datasets import load_digits +from sklearn.decomposition import PCA +from sklearn.preprocessing import scale + +print(__doc__) + +# Authors: Timo Erkkilä +# Antti Lehmussola +# Kornel Kiełczewski +# License: BSD 3 clause + +np.random.seed(42) + +digits = load_digits() +data = scale(digits.data) +n_digits = len(np.unique(digits.target)) + +reduced_data = PCA(n_components=2).fit_transform(data) + +# Step size of the mesh. Decrease to increase the quality of the VQ. +h = .02 # point in the mesh [x_min, m_max]x[y_min, y_max]. + +# Plot the decision boundary. 
For that, we will assign a color to each +x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1 +y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) + +plt.figure() +plt.clf() + +plt.suptitle("Comparing multiple K-Medoids metrics to K-Means and each other", + fontsize=14) + +Algorithm = namedtuple('ClusterAlgorithm', ['model', 'description']) + +selected_models = [ + Algorithm(KMedoids(metric='manhattan', + n_clusters=n_digits), + 'KMedoids (manhattan)'), + Algorithm(KMedoids(metric='euclidean', + n_clusters=n_digits), + 'KMedoids (euclidean)'), + Algorithm(KMedoids(metric='cosine', + n_clusters=n_digits), + 'KMedoids (cosine)'), + Algorithm(KMeans(n_clusters=n_digits), + 'KMeans') + ] + +plot_rows = int(np.ceil(len(selected_models) / 2.0)) +plot_cols = 2 + +for i, (model, description) in enumerate(selected_models): + + # Obtain labels for each point in mesh. Use last trained model. 
+ model.fit(reduced_data) + Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + plt.subplot(plot_cols, plot_rows, i + 1) + plt.imshow(Z, interpolation='nearest', + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + cmap=plt.cm.Paired, + aspect='auto', origin='lower') + + plt.plot(reduced_data[:, 0], + reduced_data[:, 1], + 'k.', markersize=2, + alpha=0.3, + ) + # Plot the centroids as a white X + centroids = model.cluster_centers_ + plt.scatter(centroids[:, 0], centroids[:, 1], + marker='x', s=169, linewidths=3, + color='w', zorder=10) + plt.title(description) + plt.xlim(x_min, x_max) + plt.ylim(y_min, y_max) + plt.xticks(()) + plt.yticks(()) + +plt.show() \ No newline at end of file From ee1688be53543e0e263d0d217282c585a6f6d530 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:24:02 -0400 Subject: [PATCH 13/24] Add Examples line to KMedoids docstring --- sklearn_extra/cluster/_k_medoids.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index b3209929..313b70cd 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -95,6 +95,12 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): ----- Since all pairwise distances are calculated and stored in memory for the duration of fit, the space complexity is O(n_samples ** 2). + + Examples + -------- + See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples + of KMedoids with various distance metrics. 
+ """ def __init__(self, n_clusters=8, metric='euclidean', From e96e2b08067dfc661da0607b3614c8a35e3091c5 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:25:51 -0400 Subject: [PATCH 14/24] Remove duplicate examples section in _k_medoids.py docstring --- sklearn_extra/cluster/_k_medoids.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 313b70cd..5dc3aa86 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -78,6 +78,9 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): >>> kmedoids.inertia_ 8.0 + See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples + of KMedoids with various distance metrics. + References ---------- Kaufman, L. and Rousseeuw, P.J., Statistical Data Analysis Based on @@ -98,8 +101,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): Examples -------- - See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples - of KMedoids with various distance metrics. + """ From 07f6e3c2634a2f3b4bf6451dfec7de1c06f774c6 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:26:39 -0400 Subject: [PATCH 15/24] ACTUALLY remove duplicate examples section --- sklearn_extra/cluster/_k_medoids.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 5dc3aa86..706d1f2b 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -98,10 +98,6 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): ----- Since all pairwise distances are calculated and stored in memory for the duration of fit, the space complexity is O(n_samples ** 2). 
- - Examples - -------- - """ From 99108048a0a4c3d72a7db81616da0ed6f3c26b46 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:35:24 -0400 Subject: [PATCH 16/24] Add sphinx gallery of plot_kmedoids_digits.py --- doc/user_guide.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 910339c3..084e838b 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -37,6 +37,8 @@ clusters. This makes it more suitable for smaller datasets in comparison to .. topic:: Examples: + * :ref:`sphx_glr_auto_examples_plot_kmedoids_digits.py`: Applying K-Medoids on digits + with various distance metrics. **Algorithm description:** From 0c8d032d145f889f659de5380479ff72caa55b3f Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Fri, 26 Jul 2019 00:55:19 -0400 Subject: [PATCH 17/24] Added k-medoids++ to help message --- sklearn_extra/cluster/_k_medoids.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 706d1f2b..638076e5 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -33,10 +33,14 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): metric : string, or callable, optional, default: 'euclidean' What distance metric to use. See :func:metrics.pairwise_distances - init : {'random', 'heuristic'}, optional, default: 'heuristic' - Specify medoid initialization method. Random selects n_clusters - elements from the dataset, while heuristic picks the n_clusters points - with the smallest sum distance to every other point. + init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic' + Specify medoid initialization method. 'random' selects n_clusters + elements from the dataset. 'heuristic' picks the n_clusters points + with the smallest sum distance to every other point. 
'k-medoids++' + follows an approach based on k-means++_, and in general, gives initial + medoids which are more separated than those generated by the other methods. + + .. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf max_iter : int, optional, default : 300 Specify the maximum number of iterations when fitting. From 3d7100195d636e5cb7aec6425e0f29bb9a097fe3 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 27 Jul 2019 09:16:25 -0400 Subject: [PATCH 18/24] Run `black` on code --- benchmarks/bench_rbfsampler_fastfood.py | 23 +- doc/conf.py | 204 ++++++++++-------- examples/plot_kmedoids_digits.py | 71 +++--- sklearn_extra/cluster/__init__.py | 4 +- sklearn_extra/cluster/_k_medoids.py | 121 +++++++---- sklearn_extra/cluster/tests/test_k_medoids.py | 175 ++++++++------- sklearn_extra/tests/test_common.py | 6 +- 7 files changed, 328 insertions(+), 276 deletions(-) diff --git a/benchmarks/bench_rbfsampler_fastfood.py b/benchmarks/bench_rbfsampler_fastfood.py index 42bea9b4..11f5df9b 100644 --- a/benchmarks/bench_rbfsampler_fastfood.py +++ b/benchmarks/bench_rbfsampler_fastfood.py @@ -15,9 +15,9 @@ Y /= Y.sum(axis=1)[:, np.newaxis] # calculate feature maps -gamma = 10. 
+gamma = 10.0 sigma = np.sqrt(1 / (2 * gamma)) -number_of_features_to_generate = 4096*4 +number_of_features_to_generate = 4096 * 4 exact_start = datetime.datetime.utcnow() # original rbf kernel method: @@ -27,23 +27,24 @@ exact_spent_time = exact_end - exact_start print("Timimg exact rbf: \t\t", exact_spent_time) -rbf_transform = Fastfood(sigma=sigma, - n_components=number_of_features_to_generate, - tradeoff_mem_accuracy='mem', - random_state=42) +rbf_transform = Fastfood( + sigma=sigma, + n_components=number_of_features_to_generate, + tradeoff_mem_accuracy="mem", + random_state=42, +) _ = rbf_transform.fit(X) fastfood_fast_vec_start = datetime.datetime.utcnow() # Fastfood: approximate kernel mapping _ = rbf_transform.transform(X) _ = rbf_transform.transform(Y) fastfood_fast_vec_end = datetime.datetime.utcnow() -fastfood_fast_vec_spent_time = fastfood_fast_vec_end - \ - fastfood_fast_vec_start +fastfood_fast_vec_spent_time = fastfood_fast_vec_end - fastfood_fast_vec_start print("Timimg fastfood fast vectorized: \t\t", fastfood_fast_vec_spent_time) -rks_rbf_transform = RBFSampler(gamma=gamma, - n_components=number_of_features_to_generate, - random_state=42) +rks_rbf_transform = RBFSampler( + gamma=gamma, n_components=number_of_features_to_generate, random_state=42 +) _ = rks_rbf_transform.fit(X) rks_start = datetime.datetime.utcnow() # Random Kitchens Sinks: approximate kernel mapping diff --git a/doc/conf.py b/doc/conf.py index 4f3e502c..1cbdd442 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -21,24 +21,24 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', - 'numpydoc', - 'sphinx_gallery.gen_gallery', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "numpydoc", + "sphinx_gallery.gen_gallery", ] # this is needed for some reason... @@ -48,44 +48,47 @@ # pngmath / imgmath compatibility layer for different sphinx versions import sphinx from distutils.version import LooseVersion + # if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): # extensions.append('sphinx.ext.pngmath') # else: # extensions.append('sphinx.ext.imgmath') -autodoc_default_flags = ['members', 'inherited-members'] +autodoc_default_flags = ["members", "inherited-members"] # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set # (useful for viewing the doc offline) -if os.environ.get('NO_MATHJAX'): - extensions.append('sphinx.ext.imgmath') - imgmath_image_format = 'svg' +if os.environ.get("NO_MATHJAX"): + extensions.append("sphinx.ext.imgmath") + imgmath_image_format = "svg" else: - extensions.append('sphinx.ext.mathjax') - mathjax_path = ('https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/' - 'MathJax.js?config=TeX-AMS_SVG') + extensions.append("sphinx.ext.mathjax") + mathjax_path = ( + "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/" + "MathJax.js?config=TeX-AMS_SVG" + ) # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] # generate autosummary even if no references autosummary_generate = True # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # Generate the plots for the gallery plot_gallery = True # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'scikit-learn-extra' -copyright = u'2019, scikit-learn-extra developpers' +project = u"scikit-learn-extra" +copyright = u"2019, scikit-learn-extra developpers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -93,177 +96,181 @@ # # The short X.Y version. from sklearn_extra import __version__ + version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build', '_templates'] +exclude_patterns = ["_build", "_templates"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). 
-#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # Custom style -html_style = 'css/project-template.css' +html_style = "css/project-template.css" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. 
-#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'project-templatedoc' +htmlhelp_basename = "project-templatedoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'project-template.tex', u'project-template Documentation', - u'Vighnesh Birodkar', 'manual'), + ( + "index", + "project-template.tex", + u"project-template Documentation", + u"Vighnesh Birodkar", + "manual", + ) ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. 
-#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -271,12 +278,17 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'project-template', u'project-template Documentation', - [u'Vighnesh Birodkar'], 1) + ( + "index", + "project-template", + u"project-template Documentation", + [u"Vighnesh Birodkar"], + 1, + ) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -285,43 +297,51 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'project-template', u'project-template Documentation', - u'Vighnesh Birodkar', 'project-template', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "project-template", + u"project-template Documentation", + u"Vighnesh Birodkar", + "project-template", + "One line description of project.", + "Miscellaneous", + ) ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # Example configuration for intersphinx: refer to the Python standard library. 
# intersphinx configuration intersphinx_mapping = { - 'python': ('https://docs.python.org/{.major}'.format( - sys.version_info), None), - 'numpy': ('https://docs.scipy.org/doc/numpy/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), - 'matplotlib': ('https://matplotlib.org/', None), - 'sklearn': ('http://scikit-learn.org/stable', None) + "python": ( + "https://docs.python.org/{.major}".format(sys.version_info), + None, + ), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "matplotlib": ("https://matplotlib.org/", None), + "sklearn": ("http://scikit-learn.org/stable", None), } # sphinx-gallery configuration sphinx_gallery_conf = { - 'doc_module': 'sklearn_extra', - 'backreferences_dir': os.path.join('generated'), - 'reference_url': { - 'sklearn_extra': None} + "doc_module": "sklearn_extra", + "backreferences_dir": os.path.join("generated"), + "reference_url": {"sklearn_extra": None}, } + def setup(app): # a copy button to copy snippet of code from the documentation - app.add_javascript('js/copybutton.js') + app.add_javascript("js/copybutton.js") diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py index dbeab2e7..c74d9ab2 100644 --- a/examples/plot_kmedoids_digits.py +++ b/examples/plot_kmedoids_digits.py @@ -32,7 +32,7 @@ reduced_data = PCA(n_components=2).fit_transform(data) # Step size of the mesh. Decrease to increase the quality of the VQ. -h = .02 # point in the mesh [x_min, m_max]x[y_min, y_max]. +h = 0.02 # point in the mesh [x_min, m_max]x[y_min, y_max]. # Plot the decision boundary. 
For that, we will assign a color to each x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1 @@ -42,24 +42,27 @@ plt.figure() plt.clf() -plt.suptitle("Comparing multiple K-Medoids metrics to K-Means and each other", - fontsize=14) +plt.suptitle( + "Comparing multiple K-Medoids metrics to K-Means and each other", + fontsize=14, +) -Algorithm = namedtuple('ClusterAlgorithm', ['model', 'description']) +Algorithm = namedtuple("ClusterAlgorithm", ["model", "description"]) selected_models = [ - Algorithm(KMedoids(metric='manhattan', - n_clusters=n_digits), - 'KMedoids (manhattan)'), - Algorithm(KMedoids(metric='euclidean', - n_clusters=n_digits), - 'KMedoids (euclidean)'), - Algorithm(KMedoids(metric='cosine', - n_clusters=n_digits), - 'KMedoids (cosine)'), - Algorithm(KMeans(n_clusters=n_digits), - 'KMeans') - ] + Algorithm( + KMedoids(metric="manhattan", n_clusters=n_digits), + "KMedoids (manhattan)", + ), + Algorithm( + KMedoids(metric="euclidean", n_clusters=n_digits), + "KMedoids (euclidean)", + ), + Algorithm( + KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)" + ), + Algorithm(KMeans(n_clusters=n_digits), "KMeans"), +] plot_rows = int(np.ceil(len(selected_models) / 2.0)) plot_cols = 2 @@ -73,25 +76,33 @@ # Put the result into a color plot Z = Z.reshape(xx.shape) plt.subplot(plot_cols, plot_rows, i + 1) - plt.imshow(Z, interpolation='nearest', - extent=(xx.min(), xx.max(), yy.min(), yy.max()), - cmap=plt.cm.Paired, - aspect='auto', origin='lower') - - plt.plot(reduced_data[:, 0], - reduced_data[:, 1], - 'k.', markersize=2, - alpha=0.3, - ) + plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + cmap=plt.cm.Paired, + aspect="auto", + origin="lower", + ) + + plt.plot( + reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2, alpha=0.3 + ) # Plot the centroids as a white X centroids = model.cluster_centers_ - plt.scatter(centroids[:, 0], centroids[:, 1], - marker='x', s=169, 
linewidths=3, - color='w', zorder=10) + plt.scatter( + centroids[:, 0], + centroids[:, 1], + marker="x", + s=169, + linewidths=3, + color="w", + zorder=10, + ) plt.title(description) plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.xticks(()) plt.yticks(()) -plt.show() \ No newline at end of file +plt.show() diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py index 6b7d4c8d..bbdaaf41 100644 --- a/sklearn_extra/cluster/__init__.py +++ b/sklearn_extra/cluster/__init__.py @@ -1,5 +1,3 @@ from ._k_medoids import KMedoids -__all__ = [ - 'KMedoids', -] +__all__ = ["KMedoids"] diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 638076e5..def6560d 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -12,7 +12,10 @@ import numpy as np from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin -from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin +from sklearn.metrics.pairwise import ( + pairwise_distances, + pairwise_distances_argmin, +) from sklearn.utils import check_array, check_random_state from sklearn.utils.extmath import stable_cumsum from sklearn.utils.validation import check_is_fitted @@ -105,8 +108,14 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): """ - def __init__(self, n_clusters=8, metric='euclidean', - init='heuristic', max_iter=300, random_state=None): + def __init__( + self, + n_clusters=8, + metric="euclidean", + init="heuristic", + max_iter=300, + random_state=None, + ): self.n_clusters = n_clusters self.metric = metric self.init = init @@ -116,10 +125,15 @@ def __init__(self, n_clusters=8, metric='euclidean', def _check_nonnegative_int(self, value, desc): """Validates if value is a valid integer > 0""" - if (value is None or value <= 0 or - not isinstance(value, (int, np.integer))): - raise ValueError("%s should be a nonnegative integer. 
" - "%s was given" % (desc, value)) + if ( + value is None + or value <= 0 + or not isinstance(value, (int, np.integer)) + ): + raise ValueError( + "%s should be a nonnegative integer. " + "%s was given" % (desc, value) + ) def _check_init_args(self): """Validates the input arguments. """ @@ -129,11 +143,13 @@ def _check_init_args(self): self._check_nonnegative_int(self.max_iter, "max_iter") # Check init - init_methods = ['random', 'heuristic', 'k-medoids++'] + init_methods = ["random", "heuristic", "k-medoids++"] if self.init not in init_methods: - raise ValueError("init needs to be one of " + - "the following: " + - "%s" % init_methods) + raise ValueError( + "init needs to be one of " + + "the following: " + + "%s" % init_methods + ) def fit(self, X, y=None): """Fit K-Medoids to the provided data. @@ -153,17 +169,18 @@ def fit(self, X, y=None): random_state_ = check_random_state(self.random_state) self._check_init_args() - X = check_array(X, accept_sparse=['csr', 'csc']) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.n_clusters > X.shape[0]: - raise ValueError("The number of medoids (%d) must be less " - "than the number of samples %d." - % (self.n_clusters, X.shape[0])) + raise ValueError( + "The number of medoids (%d) must be less " + "than the number of samples %d." + % (self.n_clusters, X.shape[0]) + ) D = pairwise_distances(X, metric=self.metric) - medoid_idxs = self._initialize_medoids(D, - self.n_clusters, - random_state_, - ) + medoid_idxs = self._initialize_medoids( + D, self.n_clusters, random_state_ + ) labels = None # Continue the algorithm as long as @@ -178,10 +195,12 @@ def fit(self, X, y=None): if np.all(old_medoid_idxs == medoid_idxs): break elif self.n_iter_ == self.max_iter - 1: - warnings.warn("Maximum number of iteration reached before " - "convergence. Consider increasing max_iter to " - "improve the fit.", - ConvergenceWarning) + warnings.warn( + "Maximum number of iteration reached before " + "convergence. 
Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning, + ) # Set the resulting instance variables. if self.metric == "precomputed": @@ -212,11 +231,13 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): "Cluster {k} is empty! " "self.labels_[self.medoid_indices_[{k}]] " "may not be labeled with " - "its corresponding cluster ({k}).".format(k=k)) + "its corresponding cluster ({k}).".format(k=k) + ) continue - in_cluster_distances = D[cluster_k_idxs, - cluster_k_idxs[:, np.newaxis]] + in_cluster_distances = D[ + cluster_k_idxs, cluster_k_idxs[:, np.newaxis] + ] # Calculate all costs from each point to all others in the cluster in_cluster_all_costs = np.sum(in_cluster_distances, axis=1) @@ -224,7 +245,8 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): min_cost_idx = np.argmin(in_cluster_all_costs) min_cost = in_cluster_all_costs[min_cost_idx] curr_cost = in_cluster_all_costs[ - np.argmax(cluster_k_idxs == medoid_idxs[k])] + np.argmax(cluster_k_idxs == medoid_idxs[k]) + ] # Adopt a new medoid if its distance is smaller then the current if min_cost < curr_cost: @@ -244,7 +266,7 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_samples, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=['csr', 'csc']) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") @@ -253,8 +275,7 @@ def transform(self, X): check_is_fitted(self, "cluster_centers_") Y = self.cluster_centers_ - return pairwise_distances(X, Y=Y, - metric=self.metric) + return pairwise_distances(X, Y=Y, metric=self.metric) def predict(self, X): """Predict the closest cluster for each sample in X. @@ -270,7 +291,7 @@ def predict(self, X): labels : array, shape = (n_samples,) Index of the cluster each sample belongs to. 
""" - X = check_array(X, accept_sparse=['csr', 'csc']) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") @@ -280,8 +301,9 @@ def predict(self, X): # Return data points to clusters based on which cluster assignment # yields the smallest distance - return pairwise_distances_argmin(X, Y=self.cluster_centers_, - metric=self.metric) + return pairwise_distances_argmin( + X, Y=self.cluster_centers_, metric=self.metric + ) def _compute_inertia(self, distances): """Compute inertia of new samples. Inertia is defined as the sum of the @@ -306,19 +328,21 @@ def _compute_inertia(self, distances): def _initialize_medoids(self, D, n_clusters, random_state_): """Select initial mediods when beginning clustering.""" - if self.init == 'random': # Random initialization + if self.init == "random": # Random initialization # Pick random k medoids as the initial ones. medoids = random_state_.choice(len(D), n_clusters) - elif self.init == 'k-medoids++': + elif self.init == "k-medoids++": medoids = self._kpp_init(D, n_clusters, random_state_) elif self.init == "heuristic": # Initialization by heuristic # Pick K first data points that have the smallest sum distance # to every other point. These are the initial medoids. 
- medoids = np.argpartition(np.sum(D, axis=1), - n_clusters-1)[:n_clusters] + medoids = np.argpartition(np.sum(D, axis=1), n_clusters - 1)[ + :n_clusters + ] else: - raise ValueError("init value '{init}' not recognized" - .format(init=self.init)) + raise ValueError( + "init value '{init}' not recognized".format(init=self.init) + ) return medoids @@ -370,18 +394,20 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): centers[0] = center_id # Initialize list of closest distances and calculate current potential - closest_dist_sq = D[centers[0], :]**2 + closest_dist_sq = D[centers[0], :] ** 2 current_pot = closest_dist_sq.sum() # pick the remaining n_clusters-1 points for cluster_index in range(1, n_clusters): - rand_vals = (random_state_.random_sample(n_local_trials) - * current_pot) - candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), - rand_vals) + rand_vals = ( + random_state_.random_sample(n_local_trials) * current_pot + ) + candidate_ids = np.searchsorted( + stable_cumsum(closest_dist_sq), rand_vals + ) # Compute distances to center candidates - distance_to_candidates = D[candidate_ids, :]**2 + distance_to_candidates = D[candidate_ids, :] ** 2 # Decide which candidate is the best best_candidate = None @@ -389,8 +415,9 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): best_dist_sq = None for trial in range(n_local_trials): # Compute potential when including center candidate - new_dist_sq = np.minimum(closest_dist_sq, - distance_to_candidates[trial]) + new_dist_sq = np.minimum( + closest_dist_sq, distance_to_candidates[trial] + ) new_pot = new_dist_sq.sum() # Store result if it is the best local trial so far diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 0a802525..0b125f36 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -21,31 +21,51 @@ def 
test_kmedoids_input_validation_and_fit_check(): rng = np.random.RandomState(seed) # Invalid parameters - assert_raise_message(ValueError, "n_clusters should be a nonnegative " - "integer. 0 was given", - KMedoids(n_clusters=0).fit, X) + assert_raise_message( + ValueError, + "n_clusters should be a nonnegative " "integer. 0 was given", + KMedoids(n_clusters=0).fit, + X, + ) - assert_raise_message(ValueError, "n_clusters should be a nonnegative " - "integer. None was given", - KMedoids(n_clusters=None).fit, X) + assert_raise_message( + ValueError, + "n_clusters should be a nonnegative " "integer. None was given", + KMedoids(n_clusters=None).fit, + X, + ) - assert_raise_message(ValueError, "max_iter should be a nonnegative " - "integer. 0 was given", - KMedoids(n_clusters=1, max_iter=0).fit, X) + assert_raise_message( + ValueError, + "max_iter should be a nonnegative " "integer. 0 was given", + KMedoids(n_clusters=1, max_iter=0).fit, + X, + ) - assert_raise_message(ValueError, "max_iter should be a nonnegative " - "integer. None was given", - KMedoids(n_clusters=1, max_iter=None).fit, X) + assert_raise_message( + ValueError, + "max_iter should be a nonnegative " "integer. 
None was given", + KMedoids(n_clusters=1, max_iter=None).fit, + X, + ) - assert_raise_message(ValueError, "init needs to be one of the following: " - "['random', 'heuristic', 'k-medoids++']", - KMedoids(init=None).fit, X) + assert_raise_message( + ValueError, + "init needs to be one of the following: " + "['random', 'heuristic', 'k-medoids++']", + KMedoids(init=None).fit, + X, + ) # Trying to fit 3 samples to 8 clusters Xsmall = rng.rand(5, 2) - assert_raise_message(ValueError, "The number of medoids (8) must be less " - "than the number of samples 5.", - KMedoids(n_clusters=8).fit, Xsmall) + assert_raise_message( + ValueError, + "The number of medoids (8) must be less " + "than the number of samples 5.", + KMedoids(n_clusters=8).fit, + Xsmall, + ) def test_random_deterministic(): @@ -55,9 +75,7 @@ def test_random_deterministic(): X = load_iris()["data"] D = euclidean_distances(X) - medoids = KMedoids( - init="random", - )._initialize_medoids(D, 4, rng) + medoids = KMedoids(init="random")._initialize_medoids(D, 4, rng) assert_array_equal(medoids, [47, 117, 67, 103]) @@ -68,13 +86,9 @@ def test_heuristic_deterministic(): X = load_iris()["data"] D = euclidean_distances(X) - medoids_1 = KMedoids( - init="heuristic", - )._initialize_medoids(D, 10, rng1) + medoids_1 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng1) - medoids_2 = KMedoids( - init="heuristic", - )._initialize_medoids(D, 10, rng2) + medoids_2 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng2) assert_array_equal(medoids_1, medoids_2) @@ -102,20 +116,16 @@ def test_kmedoids_empty_clusters(): assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X) -@mock.patch.object(KMedoids, '_kpp_init', return_value=object()) +@mock.patch.object(KMedoids, "_kpp_init", return_value=object()) def test_kpp_called(_kpp_init_mocked): """KMedoids._kpp_init method should be called by _initialize_medoids""" D = np.array([[0, 1], [1, 0]]) n_clusters = 2 rng = np.random.RandomState(seed) 
kmedoids = KMedoids() - kmedoids.init = 'k-medoids++' + kmedoids.init = "k-medoids++" # set _kpp_init_mocked.return_value to a singleton - initial_medoids = kmedoids._initialize_medoids( - D, - n_clusters, - rng, - ) + initial_medoids = kmedoids._initialize_medoids(D, n_clusters, rng) # assert that _kpp_init was called and its result was returned. _kpp_init_mocked.assert_called_once_with(D, n_clusters, rng) @@ -126,20 +136,19 @@ def test_kmedoids_pp(): """Initial clusters should be well-separated for k-medoids++""" rng = np.random.RandomState(seed) kmedoids = KMedoids() - X = [[10, 0], - [11, 0], - [0, 10], - [0, 11], - [10, 10], - [11, 10], - [12, 10], - [10, 11], - ] + X = [ + [10, 0], + [11, 0], + [0, 10], + [0, 11], + [10, 10], + [11, 10], + [12, 10], + [10, 11], + ] D = euclidean_distances(X) - centers = kmedoids._kpp_init(D, - n_clusters=3, - random_state_=rng) + centers = kmedoids._kpp_init(D, n_clusters=3, random_state_=rng) assert len(centers) == 3 @@ -150,23 +159,12 @@ def test_kmedoids_pp(): def test_precomputed(): """Test the 'precomputed' distance metric.""" rng = np.random.RandomState(seed) - X_1 = [ - [1.0, 0.0], - [1.1, 0.0], - [0.0, 1.0], - [0.0, 1.1] - ] + X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]] D_1 = euclidean_distances(X_1) - X_2 = [ - [1.1, 0.0], - [0.0, 0.9] - ] + X_2 = [[1.1, 0.0], [0.0, 0.9]] D_2 = euclidean_distances(X_2, X_1) - kmedoids = KMedoids(metric="precomputed", - n_clusters=2, - random_state=rng, - ) + kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng) kmedoids.fit(D_1) assert_allclose(kmedoids.inertia_, 0.2) @@ -184,17 +182,18 @@ def test_precomputed(): def test_kmedoids_fit_naive(): n_clusters = 3 - metric = 'euclidean' + metric = "euclidean" model = KMedoids(n_clusters=n_clusters, metric=metric) Xnaive = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) model.fit(Xnaive) - assert_array_equal(model.cluster_centers_, - [[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + assert_array_equal( + 
model.cluster_centers_, [[1, 0, 0], [0, 1, 0], [0, 0, 1]] + ) assert_array_equal(model.labels_, [0, 1, 2]) - assert model.inertia_ == 0. + assert model.inertia_ == 0.0 # diagonal must be zero, off-diagonals must be positive X_new = model.transform(Xnaive) @@ -208,37 +207,35 @@ def test_kmedoids_fit_naive(): def test_max_iter(): """Test that warning message is thrown when max_iter is reached.""" rng = np.random.RandomState(seed) - X_iris = load_iris()['data'] + X_iris = load_iris()["data"] - model = KMedoids(n_clusters=10, - init='random', - random_state=rng, - max_iter=1, - ) - assert_warns_message(UserWarning, - "Maximum number of iteration reached before", - model.fit, - X_iris, - ) + model = KMedoids( + n_clusters=10, init="random", random_state=rng, max_iter=1 + ) + assert_warns_message( + UserWarning, + "Maximum number of iteration reached before", + model.fit, + X_iris, + ) def test_kmedoids_iris(): """Test kmedoids on the Iris dataset""" rng = np.random.RandomState(seed) - X_iris = load_iris()['data'] + X_iris = load_iris()["data"] ref_model = KMeans(n_clusters=3).fit(X_iris) - avg_dist_to_closest_centroid = ref_model\ - .transform(X_iris).min(axis=1).mean() + avg_dist_to_closest_centroid = ( + ref_model.transform(X_iris).min(axis=1).mean() + ) - for init in ['random', 'heuristic', 'k-medoids++']: - distance_metric = 'euclidean' - model = KMedoids(n_clusters=3, - metric=distance_metric, - init=init, - random_state=rng, - ) + for init in ["random", "heuristic", "k-medoids++"]: + distance_metric = "euclidean" + model = KMedoids( + n_clusters=3, metric=distance_metric, init=init, random_state=rng + ) model.fit(X_iris) # test convergence in reasonable number of steps @@ -254,8 +251,9 @@ def test_kmedoids_iris(): # we can compare its performance to # K-Means. 
We want the average distance to cluster centers # to be similar between K-Means and K-Medoids - assert_allclose(avg_dist_to_closest_medoid, - avg_dist_to_closest_centroid, rtol=0.1) + assert_allclose( + avg_dist_to_closest_medoid, avg_dist_to_closest_centroid, rtol=0.1 + ) def test_kmedoids_fit_predict_transform(): @@ -293,8 +291,7 @@ def test_outlier_robustness(): kmeans = KMeans(n_clusters=2, random_state=rng) kmedoids = KMedoids(n_clusters=2, random_state=rng) - X = [[-11, 0], [-10, 0], [-9, 0], - [0, 0], [1, 0], [2, 0], [1000, 0]] + X = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0], [1000, 0]] kmeans.fit(X) kmedoids.fit(X) diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index f1b1e862..cfcbf9d0 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -5,9 +5,7 @@ from sklearn_extra.kernel_approximation import Fastfood from sklearn_extra.cluster import KMedoids -@pytest.mark.parametrize( - "Estimator", - [Fastfood, KMedoids] -) + +@pytest.mark.parametrize("Estimator", [Fastfood, KMedoids]) def test_all_estimators(Estimator, request): return check_estimator(Estimator) From 182d505ed50e2acbd7ebd8e1d7adcd2cfae017e8 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 27 Jul 2019 16:43:30 -0400 Subject: [PATCH 19/24] Remove commented out math code --- doc/conf.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 1cbdd442..c39936a0 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -45,15 +45,6 @@ # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False -# pngmath / imgmath compatibility layer for different sphinx versions -import sphinx -from distutils.version import LooseVersion - -# if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): -# extensions.append('sphinx.ext.pngmath') -# else: -# extensions.append('sphinx.ext.imgmath') - autodoc_default_flags = ["members", "inherited-members"] # For maths, use mathjax by 
default and svg if NO_MATHJAX env variable is set From 88d9630329e2116dfe42550db8f1cb8398dbf699 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 27 Jul 2019 16:45:24 -0400 Subject: [PATCH 20/24] Remove unnecessary plot_kmedoids_digits.py --- examples/plot_kmedoids_digits.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py index c74d9ab2..28c7659d 100644 --- a/examples/plot_kmedoids_digits.py +++ b/examples/plot_kmedoids_digits.py @@ -9,7 +9,6 @@ import numpy as np import matplotlib.pyplot as plt -from collections import namedtuple from sklearn.cluster import KMeans from sklearn_extra.cluster import KMedoids from sklearn.datasets import load_digits @@ -47,21 +46,18 @@ fontsize=14, ) -Algorithm = namedtuple("ClusterAlgorithm", ["model", "description"]) selected_models = [ - Algorithm( + ( KMedoids(metric="manhattan", n_clusters=n_digits), "KMedoids (manhattan)", ), - Algorithm( + ( KMedoids(metric="euclidean", n_clusters=n_digits), "KMedoids (euclidean)", ), - Algorithm( - KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)" - ), - Algorithm(KMeans(n_clusters=n_digits), "KMeans"), + (KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)"), + (KMeans(n_clusters=n_digits), "KMeans"), ] plot_rows = int(np.ceil(len(selected_models) / 2.0)) From 9405d98fdc34da6bfa58fde4f30c1ff5d03b366a Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 27 Jul 2019 17:39:13 -0400 Subject: [PATCH 21/24] Remove `x_squared_norms` from _kpp_init (copied over from kmeans) --- sklearn_extra/cluster/_k_medoids.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index def6560d..515d1602 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -357,9 +357,6 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): n_clusters : integer The 
number of seeds to choose - x_squared_norms : array, shape (n_samples,) - Squared Euclidean norm of each data point. - random_state : RandomState The generator used to initialize the centers. From 0989f8853c4bee853b4c18634386b24483ab0128 Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 27 Jul 2019 17:44:26 -0400 Subject: [PATCH 22/24] Add comment for _kpp_init mentnioning k_means_._k_init copypasta --- sklearn_extra/cluster/_k_medoids.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 515d1602..ba160970 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -346,6 +346,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_): return medoids + # Copied from sklearn.cluster.k_means_._k_init def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): """Init n_clusters seeds with a method similar to k-means++ From d76d6b87059dd1476a1d58bf4a420690f8a621cf Mon Sep 17 00:00:00 2001 From: Zane Dufour Date: Sat, 27 Jul 2019 21:28:41 -0400 Subject: [PATCH 23/24] update n_samples -> n_query, where appropriate --- sklearn_extra/cluster/_k_medoids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index ba160970..298195d9 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -263,7 +263,7 @@ def transform(self, X): Returns ------- - X_new : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ X = check_array(X, accept_sparse=["csr", "csc"]) @@ -288,7 +288,7 @@ def predict(self, X): Returns ------- - labels : array, shape = (n_samples,) + labels : array, shape = (n_query,) Index of the cluster each sample belongs to. 
""" X = check_array(X, accept_sparse=["csr", "csc"]) From c060b0e5c1fcbf80c0792ef4c4ce96fae5f36f6f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 29 Jul 2019 14:12:11 +0200 Subject: [PATCH 24/24] Add sklearn_extra/cluster/tests/__init__.py --- sklearn_extra/cluster/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sklearn_extra/cluster/tests/__init__.py diff --git a/sklearn_extra/cluster/tests/__init__.py b/sklearn_extra/cluster/tests/__init__.py new file mode 100644 index 00000000..e69de29b