From cd19b57742e7edaba1bfaffa3d058dcdf3874add Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 6 Apr 2019 21:37:30 -0400
Subject: [PATCH 01/24] Added kmedoids code

---
 sklearn_extra/cluster/__init__.py   |   5 +
 sklearn_extra/cluster/k_medoids_.py | 398 ++++++++++++++++++++++++++++
 2 files changed, 403 insertions(+)
 create mode 100644 sklearn_extra/cluster/__init__.py
 create mode 100644 sklearn_extra/cluster/k_medoids_.py

diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py
new file mode 100644
index 00000000..d30e7b64
--- /dev/null
+++ b/sklearn_extra/cluster/__init__.py
@@ -0,0 +1,5 @@
+from .k_medoids_ import KMedoids
+
+__all__ = [
+    'KMedoids',
+]
diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py
new file mode 100644
index 00000000..5c26cbc4
--- /dev/null
+++ b/sklearn_extra/cluster/k_medoids_.py
@@ -0,0 +1,398 @@
+# -*- coding: utf-8 -*-
+"""K-medoids clustering"""
+
+# Authors: Timo Erkkilä <timo.erkkila@gmail.com>
+#          Antti Lehmussola <antti.lehmussola@gmail.com>
+#          Kornel Kiełczewski <kornel.mail@gmail.com>
+#          Zane Dufour <zane.dufour@gmail.com>
+# License: BSD 3 clause
+
+import warnings
+
+import numpy as np
+
+from ..base import BaseEstimator, ClusterMixin, TransformerMixin
+from ..metrics.pairwise import pairwise_distances, pairwise_distances_argmin
+from ..utils import check_array, check_random_state
+from ..utils.extmath import stable_cumsum
+from ..utils.validation import check_is_fitted
+from ..exceptions import ConvergenceWarning
+
+
+class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
+    """k-medoids clustering.
+
+    Read more in the :ref:`User Guide <k_medoids>`.
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of medoids to
+        generate.
+
+    metric : string, or callable, optional, default: 'euclidean'
+        Distance metric to use. See :func:`sklearn.metrics.pairwise_distances`.
+
+    init : {'random', 'heuristic', 'k-medoids++'}, optional, \
+            default: 'heuristic'
+        'random' picks n_clusters elements at random, 'heuristic' picks the
+        smallest-sum-distance points, 'k-medoids++' seeds like k-means++.
+
+    max_iter : int, optional, default : 300
+        Specify the maximum number of iterations when fitting.
+
+    random_state : int, RandomState instance or None, optional
+        Specify random state for the random number generator. Used to
+        initialise medoids when init='random'.
+
+    Attributes
+    ----------
+    cluster_centers_ : array, shape = (n_clusters, n_features)
+            or None if metric == 'precomputed'
+        Cluster centers, i.e. medoids (elements from the original dataset)
+
+    medoid_indices_ : array, shape = (n_clusters,)
+        The indices of the medoid rows in X
+
+    labels_ : array, shape = (n_samples,)
+        Labels of each point
+
+    inertia_ : float
+        Sum of distances of samples to their closest cluster center.
+
+    Examples
+    --------
+    >>> from sklearn.cluster import KMedoids
+    >>> import numpy as np
+
+    >>> X = np.asarray([[1, 2], [1, 4], [1, 0],
+    ...                 [4, 2], [4, 4], [4, 0]])
+    >>> kmedoids = KMedoids(n_clusters=2, random_state=0).fit(X)
+    >>> kmedoids.labels_
+    array([0, 0, 0, 1, 1, 1])
+    >>> kmedoids.predict([[0,0], [4,4]])
+    array([0, 1])
+    >>> kmedoids.cluster_centers_
+    array([[1, 2],
+           [4, 2]])
+    >>> kmedoids.inertia_
+    8.0
+
+    References
+    ----------
+    Kaufman, L. and Rousseeuw, P.J., Statistical Data Analysis Based on
+    the L1–Norm and Related Methods, edited by Y. Dodge, North-Holland,
+    405–416. 1987
+
+    See also
+    --------
+
+    KMeans
+        The KMeans algorithm minimizes the within-cluster sum-of-squares
+        criterion. It scales well to large number of samples.
+
+    Notes
+    -----
+    Since all pairwise distances are calculated and stored in memory for
+    the duration of fit, the space complexity is O(n_samples ** 2).
+    """
+
+    def __init__(self, n_clusters=8, metric='euclidean',
+                 init='heuristic', max_iter=300, random_state=None):
+        self.n_clusters = n_clusters
+        self.metric = metric
+        self.init = init
+        self.max_iter = max_iter
+        self.random_state = random_state
+
+    def _check_nonnegative_int(self, value, desc):
+        """Validates if value is a valid integer > 0"""
+
+        if (value is None or value <= 0 or
+                not isinstance(value, (int, np.integer))):
+            raise ValueError("%s should be a nonnegative integer. "
+                             "%s was given" % (desc, value))
+
+    def _check_init_args(self):
+        """Validates the input arguments. """
+
+        # Check n_clusters and max_iter
+        self._check_nonnegative_int(self.n_clusters, "n_clusters")
+        self._check_nonnegative_int(self.max_iter, "max_iter")
+
+        # Check init
+        init_methods = ['random', 'heuristic', 'k-medoids++']
+        if self.init not in init_methods:
+            raise ValueError("init needs to be one of " +
+                             "the following: " +
+                             "%s" % init_methods)
+
+    def fit(self, X, y=None):
+        """Fit K-Medoids to the provided data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features), \
+                or (n_samples, n_samples) if metric == 'precomputed'
+            Dataset to cluster.
+
+        y : Ignored
+
+        Returns
+        -------
+        self
+        """
+        random_state_ = check_random_state(self.random_state)
+
+        self._check_init_args()
+        X = check_array(X, accept_sparse=['csr', 'csc'])
+        if self.n_clusters > X.shape[0]:
+            raise ValueError("The number of medoids (%d) must be less "
+                             "than the number of samples %d."
+                             % (self.n_clusters, X.shape[0]))
+
+        D = pairwise_distances(X, metric=self.metric)
+        medoid_idxs = self._initialize_medoids(D,
+                                               self.n_clusters,
+                                               random_state_,
+                                               )
+        labels = None
+
+        # Continue the algorithm as long as
+        # the medoids keep changing and the maximum number
+        # of iterations is not exceeded
+        for self.n_iter_ in range(0, self.max_iter):
+            old_medoid_idxs = np.copy(medoid_idxs)
+            labels = np.argmin(D[medoid_idxs, :], axis=0)
+
+            # Update medoids with the new cluster indices
+            self._update_medoid_idxs_in_place(D, labels, medoid_idxs)
+            if np.all(old_medoid_idxs == medoid_idxs):
+                break
+            elif self.n_iter_ == self.max_iter - 1:
+                warnings.warn("Maximum number of iteration reached before "
+                              "convergence. Consider increasing max_iter to "
+                              "improve the fit.",
+                              ConvergenceWarning)
+
+        # Set the resulting instance variables.
+        if self.metric == "precomputed":
+            self.cluster_centers_ = None
+        else:
+            self.cluster_centers_ = X[medoid_idxs]
+
+        # Expose labels_ which are the assignments of
+        # the training data to clusters
+        self.labels_ = labels
+        self.medoid_indices_ = medoid_idxs
+        self.inertia_ = self._compute_inertia(self.transform(X))
+
+        # Return self to enable method chaining
+        return self
+
+    def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
+        """In-place update of the medoid indices"""
+
+        # Update the medoids for each cluster
+        for k in range(self.n_clusters):
+            # Extract the distance matrix between the data points
+            # inside the cluster k
+            cluster_k_idxs = np.where(labels == k)[0]
+
+            if len(cluster_k_idxs) == 0:
+                warnings.warn(
+                    "Cluster {k} is empty! "
+                    "self.labels_[self.medoid_indices_[{k}]] "
+                    "may not be labeled with "
+                    "its corresponding cluster ({k}).".format(k=k))
+                continue
+
+            in_cluster_distances = D[cluster_k_idxs,
+                                     cluster_k_idxs[:, np.newaxis]]
+
+            # Calculate all costs from each point to all others in the cluster
+            in_cluster_all_costs = np.sum(in_cluster_distances, axis=1)
+
+            min_cost_idx = np.argmin(in_cluster_all_costs)
+            min_cost = in_cluster_all_costs[min_cost_idx]
+            curr_cost = in_cluster_all_costs[
+                np.argmax(cluster_k_idxs == medoid_idxs[k])]
+
+            # Adopt a new medoid if its distance is smaller than the current
+            if min_cost < curr_cost:
+                medoid_idxs[k] = cluster_k_idxs[min_cost_idx]
+
+    def transform(self, X):
+        """Transforms X to cluster-distance space.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_query, n_features), \
+                or (n_query, n_indexed) if metric == 'precomputed'
+            Data to transform.
+
+        Returns
+        -------
+        X_new : {array-like, sparse matrix}, shape=(n_samples, n_clusters)
+            X transformed in the new space of distances to cluster centers.
+        """
+        X = check_array(X, accept_sparse=['csr', 'csc'])
+
+        if self.metric == "precomputed":
+            check_is_fitted(self, "medoid_indices_")
+            return X[:, self.medoid_indices_]
+        else:
+            check_is_fitted(self, "cluster_centers_")
+
+            Y = self.cluster_centers_
+            return pairwise_distances(X, Y=Y,
+                                      metric=self.metric)
+
+    def predict(self, X):
+        """Predict the closest cluster for each sample in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_query, n_features), \
+                or (n_query, n_indexed) if metric == 'precomputed'
+            New data to predict.
+
+        Returns
+        -------
+        labels : array, shape = (n_samples,)
+            Index of the cluster each sample belongs to.
+        """
+        X = check_array(X, accept_sparse=['csr', 'csc'])
+
+        if self.metric == "precomputed":
+            check_is_fitted(self, "medoid_indices_")
+            return np.argmin(X[:, self.medoid_indices_], axis=1)
+        else:
+            check_is_fitted(self, "cluster_centers_")
+
+            # Return data points to clusters based on which cluster assignment
+            # yields the smallest distance
+            return pairwise_distances_argmin(X, Y=self.cluster_centers_,
+                                             metric=self.metric)
+
+    def _compute_inertia(self, distances):
+        """Compute inertia of new samples. Inertia is defined as the sum of the
+        sample distances to closest cluster centers.
+
+        Parameters
+        ----------
+        distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters)
+            Distances to cluster centers.
+
+        Returns
+        -------
+        Sum of sample distances to closest cluster centers.
+        """
+
+        # Define inertia as the sum of the sample-distances
+        # to closest cluster centers
+        inertia = np.sum(np.min(distances, axis=1))
+
+        return inertia
+
+    def _initialize_medoids(self, D, n_clusters, random_state_):
+        """Select initial medoids when beginning clustering."""
+
+        if self.init == 'random':  # Random initialization
+            # Pick random k medoids as the initial ones.
+            medoids = random_state_.choice(len(D), n_clusters)
+        elif self.init == 'k-medoids++':
+            medoids = self._kpp_init(D, random_state_)
+        elif self.init == "heuristic":  # Initialization by heuristic
+            # Pick K first data points that have the smallest sum distance
+            # to every other point. These are the initial medoids.
+            medoids = np.argpartition(np.sum(D, axis=1),
+                                      n_clusters-1)[:n_clusters]
+        else:
+            raise ValueError("init value '{init}' not recognized"
+                             .format(init=self.init))
+
+        return medoids
+
+    def _kpp_init(self, D, random_state_, n_local_trials=None):
+        """Init n_clusters seeds with a method similar to k-means++
+
+        Parameters
+        -----------
+        D : array, shape (n_samples, n_samples)
+            The distance matrix we will use to select medoid indices.
+
+        n_clusters : integer
+            The number of seeds to choose
+
+        x_squared_norms : array, shape (n_samples,)
+            Squared Euclidean norm of each data point.
+
+        random_state : RandomState
+            The generator used to initialize the centers.
+
+        n_local_trials : integer, optional
+            The number of seeding trials for each center (except the first),
+            of which the one reducing inertia the most is greedily chosen.
+            Set to None to make the number of trials depend logarithmically
+            on the number of seeds (2+log(k)); this is the default.
+
+        Notes
+        -----
+        Selects initial cluster centers for k-medoid clustering in a smart way
+        to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
+        "k-means++: the advantages of careful seeding". ACM-SIAM symposium
+        on Discrete algorithms. 2007
+
+        Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
+        which is the implementation used in the aforementioned paper.
+        """
+        n_samples, _ = D.shape
+
+        centers = np.empty(self.n_clusters, dtype=int)
+
+        # Set the number of local seeding trials if none is given
+        if n_local_trials is None:
+            # This is what Arthur/Vassilvitskii tried, but did not report
+            # specific results for other than mentioning in the conclusion
+            # that it helped.
+            n_local_trials = 2 + int(np.log(self.n_clusters))
+
+        center_id = random_state_.randint(n_samples)
+        centers[0] = center_id
+
+        # Initialize list of closest distances and calculate current potential
+        closest_dist_sq = D[centers[0], :]**2
+        current_pot = closest_dist_sq.sum()
+
+        # pick the remaining self.n_clusters-1 points
+        for cluster_index in range(1, self.n_clusters):
+            rand_vals = (random_state_.random_sample(n_local_trials)
+                         * current_pot)
+            candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
+                                            rand_vals)
+
+            # Compute distances to center candidates
+            distance_to_candidates = D[candidate_ids, :]**2
+
+            # Decide which candidate is the best
+            best_candidate = None
+            best_pot = None
+            best_dist_sq = None
+            for trial in range(n_local_trials):
+                # Compute potential when including center candidate
+                new_dist_sq = np.minimum(closest_dist_sq,
+                                         distance_to_candidates[trial])
+                new_pot = new_dist_sq.sum()
+
+                # Store result if it is the best local trial so far
+                if (best_candidate is None) or (new_pot < best_pot):
+                    best_candidate = candidate_ids[trial]
+                    best_pot = new_pot
+                    best_dist_sq = new_dist_sq
+
+            centers[cluster_index] = best_candidate
+            current_pot = best_pot
+            closest_dist_sq = best_dist_sq
+
+        return centers

From 3e184446f40596e2062d92deba06b320f568e470 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Mon, 22 Apr 2019 21:39:39 -0400
Subject: [PATCH 02/24] changed k_medoids_ imports to absolute

---
 sklearn_extra/cluster/k_medoids_.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py
index 5c26cbc4..19a22d8e 100644
--- a/sklearn_extra/cluster/k_medoids_.py
+++ b/sklearn_extra/cluster/k_medoids_.py
@@ -11,12 +11,12 @@
 
 import numpy as np
 
-from ..base import BaseEstimator, ClusterMixin, TransformerMixin
-from ..metrics.pairwise import pairwise_distances, pairwise_distances_argmin
-from ..utils import check_array, check_random_state
-from ..utils.extmath import stable_cumsum
-from ..utils.validation import check_is_fitted
-from ..exceptions import ConvergenceWarning
+from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
+from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin
+from sklearn.utils import check_array, check_random_state
+from sklearn.utils.extmath import stable_cumsum
+from sklearn.utils.validation import check_is_fitted
+from sklearn.exceptions import ConvergenceWarning
 
 
 class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):

From d4c086c7addd7173935f25c986e18d1072d84149 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sun, 28 Apr 2019 21:53:10 -0400
Subject: [PATCH 03/24] Added .vscode to .gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 442f8c2a..498fbc60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@ __pycache__/
 # C extensions
 *.so
 
+# Text Editors
+.vscode/
+
 # scikit-learn specific
 doc/_build/
 doc/auto_examples/

From bacc9317fcd622178e34b19789db18c1fadd9afc Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sun, 28 Apr 2019 22:17:03 -0400
Subject: [PATCH 04/24] Add venv to .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 498fbc60..7d0d0c2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ doc/datasets/generated/
 # Distribution / packaging
 
 .Python
+venv/
 env/
 build/
 develop-eggs/

From 0cb8e436691b860be981edae73632ba5737a7e31 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sun, 28 Apr 2019 22:19:59 -0400
Subject: [PATCH 05/24] Added cluster tests

---
 sklearn_extra/cluster/tests/test_k_medoids.py | 294 ++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 sklearn_extra/cluster/tests/test_k_medoids.py

diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py
new file mode 100644
index 00000000..f5abf722
--- /dev/null
+++ b/sklearn_extra/cluster/tests/test_k_medoids.py
@@ -0,0 +1,294 @@
+"""Testing for K-Medoids"""
+import warnings
+import numpy as np
+from scipy.sparse import csc_matrix
+
+from sklearn.datasets import load_iris
+from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
+from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.utils.testing import assert_array_equal, assert_equal
+from sklearn.utils.testing import assert_raise_message, assert_warns_message
+from sklearn.utils.testing import assert_allclose
+
+from sklearn_extra.cluster import KMedoids
+from sklearn.cluster import KMeans
+
+seed = 0
+X = np.random.RandomState(seed).rand(100, 5)
+
+
+def test_kmedoids_input_validation_and_fit_check():
+    rng = np.random.RandomState(seed)
+    # Invalid parameters
+    assert_raise_message(ValueError, "n_clusters should be a nonnegative "
+                                     "integer. 0 was given",
+                         KMedoids(n_clusters=0).fit, X)
+
+    assert_raise_message(ValueError, "n_clusters should be a nonnegative "
+                                     "integer. None was given",
+                         KMedoids(n_clusters=None).fit, X)
+
+    assert_raise_message(ValueError, "max_iter should be a nonnegative "
+                                     "integer. 0 was given",
+                         KMedoids(n_clusters=1, max_iter=0).fit, X)
+
+    assert_raise_message(ValueError, "max_iter should be a nonnegative "
+                                     "integer. None was given",
+                         KMedoids(n_clusters=1, max_iter=None).fit, X)
+
+    assert_raise_message(ValueError, "init needs to be one of the following: "
+                                     "['random', 'heuristic', 'k-medoids++']",
+                         KMedoids(init=None).fit, X)
+
+    # Trying to fit 5 samples to 8 clusters
+    Xsmall = rng.rand(5, 2)
+    assert_raise_message(ValueError, "The number of medoids (8) must be less "
+                                     "than the number of samples 5.",
+                         KMedoids(n_clusters=8).fit, Xsmall)
+
+
+def test_random_deterministic():
+    """Random_state should determine 'random' init output."""
+    rng = np.random.RandomState(seed)
+
+    X = load_iris()["data"]
+    D = euclidean_distances(X)
+
+    medoids = KMedoids(
+        init="random",
+        )._initialize_medoids(D, 4, rng)
+    assert_array_equal(medoids, [47, 117, 67, 103])
+
+
+def test_heuristic_deterministic():
+    """Result of heuristic init method should not depend on random state."""
+    rng1 = np.random.RandomState(1)
+    rng2 = np.random.RandomState(2)
+    X = load_iris()["data"]
+    D = euclidean_distances(X)
+
+    medoids_1 = KMedoids(
+        init="heuristic",
+        )._initialize_medoids(D, 10, rng1)
+
+    medoids_2 = KMedoids(
+        init="heuristic",
+        )._initialize_medoids(D, 10, rng2)
+
+    assert_array_equal(medoids_1, medoids_2)
+
+
+def test_update_medoid_idxs_empty_cluster():
+    """Label is unchanged for an empty cluster."""
+    D = np.zeros((3, 3))
+    labels = np.array([0, 0, 0])
+    medoid_idxs = np.array([0, 1])
+    kmedoids = KMedoids(n_clusters=2)
+
+    # Swallow empty cluster warning
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        kmedoids._update_medoid_idxs_in_place(D, labels, medoid_idxs)
+
+    assert_array_equal(medoid_idxs, [0, 1])
+
+
+def test_kmedoids_empty_clusters():
+    """When a cluster is empty, it should throw a warning."""
+    rng = np.random.RandomState(seed)
+    X = [[1], [1], [1]]
+    kmedoids = KMedoids(n_clusters=2, random_state=rng)
+    assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X)
+
+
+def test_kmedoids_pp():
+    """Initial clusters should be well-separated for k-medoids++"""
+    rng = np.random.RandomState(seed)
+    kmedoids = KMedoids(n_clusters=3,
+                        init="k-medoids++",
+                        random_state=rng)
+    X = [[10, 0],
+         [11, 0],
+         [0, 10],
+         [0, 11],
+         [10, 10],
+         [11, 10],
+         [12, 10],
+         [10, 11],
+         ]
+    D = euclidean_distances(X)
+
+    centers = kmedoids._initialize_medoids(D, 3, random_state_=rng)
+
+    assert len(centers) == 3
+
+    inter_medoid_distances = D[centers][:, centers]
+    assert np.all((inter_medoid_distances > 5) | (inter_medoid_distances == 0))
+
+
+def test_precomputed():
+    """Test the 'precomputed' distance metric."""
+    rng = np.random.RandomState(seed)
+    X_1 = [
+        [1.0, 0.0],
+        [1.1, 0.0],
+        [0.0, 1.0],
+        [0.0, 1.1]
+    ]
+    D_1 = euclidean_distances(X_1)
+    X_2 = [
+        [1.1, 0.0],
+        [0.0, 0.9]
+    ]
+    D_2 = euclidean_distances(X_2, X_1)
+
+    kmedoids = KMedoids(metric="precomputed",
+                        n_clusters=2,
+                        random_state=rng,
+                        )
+    kmedoids.fit(D_1)
+
+    assert_allclose(kmedoids.inertia_, 0.2)
+    assert_array_equal(kmedoids.medoid_indices_, [2, 0])
+    assert_array_equal(kmedoids.labels_, [1, 1, 0, 0])
+    assert kmedoids.cluster_centers_ is None
+
+    med_1, med_2 = tuple(kmedoids.medoid_indices_)
+    predictions = kmedoids.predict(D_2)
+    assert_array_equal(predictions, [med_1 // 2, med_2 // 2])
+
+    transformed = kmedoids.transform(D_2)
+    assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_])
+
+
+def test_kmedoids_fit_naive():
+    n_clusters = 3
+    metric = 'euclidean'
+
+    model = KMedoids(n_clusters=n_clusters, metric=metric)
+    Xnaive = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+
+    model.fit(Xnaive)
+
+    assert_array_equal(model.cluster_centers_,
+                       [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    assert_array_equal(model.labels_, [0, 1, 2])
+    assert model.inertia_ == 0.
+
+    # diagonal must be zero, off-diagonals must be positive
+    X_new = model.transform(Xnaive)
+    for c in range(n_clusters):
+        assert X_new[c, c] == 0
+        for c2 in range(n_clusters):
+            if c != c2:
+                assert X_new[c, c2] > 0
+
+
+def test_max_iter():
+    """Test that warning message is thrown when max_iter is reached."""
+    rng = np.random.RandomState(seed)
+    X_iris = load_iris()['data']
+
+    model = KMedoids(n_clusters=10,
+                     init='random',
+                     random_state=rng,
+                     max_iter=1,
+                     )
+    assert_warns_message(UserWarning,
+                         "Maximum number of iteration reached before",
+                         model.fit,
+                         X_iris,
+                         )
+
+
+def test_kmedoids_iris():
+    """Test kmedoids on the Iris dataset"""
+    rng = np.random.RandomState(seed)
+    X_iris = load_iris()['data']
+
+    ref_model = KMeans(n_clusters=3).fit(X_iris)
+
+    avg_dist_to_closest_centroid = ref_model\
+        .transform(X_iris).min(axis=1).mean()
+
+    for init in ['random', 'heuristic', 'k-medoids++']:
+        distance_metric = 'euclidean'
+        model = KMedoids(n_clusters=3,
+                         metric=distance_metric,
+                         init=init,
+                         random_state=rng,
+                         )
+        model.fit(X_iris)
+
+        # test convergence in reasonable number of steps
+        assert model.n_iter_ < (len(X_iris) // 10)
+
+        distances = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](X_iris)
+        avg_dist_to_random_medoid = np.mean(distances.ravel())
+        avg_dist_to_closest_medoid = model.inertia_ / X_iris.shape[0]
+        # We want distance-to-closest-medoid to be reduced from average
+        # distance by more than 50%
+        assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid
+        # When K-Medoids is using Euclidean distance,
+        # we can compare its performance to
+        # K-Means. We want the average distance to cluster centers
+        # to be similar between K-Means and K-Medoids
+        assert_allclose(avg_dist_to_closest_medoid,
+                        avg_dist_to_closest_centroid, rtol=0.1)
+
+
+def test_kmedoids_fit_predict_transform():
+    rng = np.random.RandomState(seed)
+    model = KMedoids(random_state=rng)
+
+    labels1 = model.fit_predict(X)
+    assert_equal(len(labels1), 100)
+    assert_array_equal(labels1, model.labels_)
+
+    labels2 = model.predict(X)
+    assert_array_equal(labels1, labels2)
+
+    Xt1 = model.fit_transform(X)
+    assert_array_equal(Xt1.shape, (100, model.n_clusters))
+
+    Xt2 = model.transform(X)
+    assert_array_equal(Xt1, Xt2)
+
+
+def test_callable_distance_metric():
+    rng = np.random.RandomState(seed)
+
+    def my_metric(a, b):
+        return np.sqrt(np.sum(np.power(a - b, 2)))
+
+    model = KMedoids(random_state=rng, metric=my_metric)
+    labels1 = model.fit_predict(X)
+    assert_equal(len(labels1), 100)
+    assert_array_equal(labels1, model.labels_)
+
+
+def test_outlier_robustness():
+    rng = np.random.RandomState(seed)
+    kmeans = KMeans(n_clusters=2, random_state=rng)
+    kmedoids = KMedoids(n_clusters=2, random_state=rng)
+
+    X = [[-11, 0], [-10, 0], [-9, 0],
+         [0, 0], [1, 0], [2, 0], [1000, 0]]
+
+    kmeans.fit(X)
+    kmedoids.fit(X)
+
+    assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1])
+    assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1])
+
+
+def test_kmedoids_on_sparse_input():
+    rng = np.random.RandomState(seed)
+    model = KMedoids(n_clusters=2, random_state=rng)
+    row = np.array([1, 0])
+    col = np.array([0, 4])
+    data = np.array([1, 1])
+    X = csc_matrix((data, (row, col)), shape=(2, 5))
+    labels = model.fit_predict(X)
+    assert_equal(len(labels), 2)
+    assert_array_equal(labels, model.labels_)

From 96f3a2ec14d2548c683e666b3ac58ddcca31af14 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sun, 28 Apr 2019 22:20:40 -0400
Subject: [PATCH 06/24] Fix KMedoids docstring

---
 sklearn_extra/cluster/k_medoids_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py
index 19a22d8e..917caa47 100644
--- a/sklearn_extra/cluster/k_medoids_.py
+++ b/sklearn_extra/cluster/k_medoids_.py
@@ -62,7 +62,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
 
     Examples
     --------
-    >>> from sklearn.cluster import KMedoids
+    >>> from sklearn_extra.cluster import KMedoids
     >>> import numpy as np
 
     >>> X = np.asarray([[1, 2], [1, 4], [1, 0],

From 8d9d9d6498319ca935ceb50d6bb8431e078a8364 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Mon, 29 Apr 2019 22:18:52 -0400
Subject: [PATCH 07/24] Reconfigure _kpp_init tests

---
 sklearn_extra/cluster/k_medoids_.py           | 12 ++++----
 sklearn_extra/cluster/tests/test_k_medoids.py | 29 ++++++++++++++++---
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/k_medoids_.py
index 917caa47..b3209929 100644
--- a/sklearn_extra/cluster/k_medoids_.py
+++ b/sklearn_extra/cluster/k_medoids_.py
@@ -302,7 +302,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
             # Pick random k medoids as the initial ones.
             medoids = random_state_.choice(len(D), n_clusters)
         elif self.init == 'k-medoids++':
-            medoids = self._kpp_init(D, random_state_)
+            medoids = self._kpp_init(D, n_clusters, random_state_)
         elif self.init == "heuristic":  # Initialization by heuristic
             # Pick K first data points that have the smallest sum distance
             # to every other point. These are the initial medoids.
@@ -314,7 +314,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
 
         return medoids
 
-    def _kpp_init(self, D, random_state_, n_local_trials=None):
+    def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
         """Init n_clusters seeds with a method similar to k-means++
 
         Parameters
@@ -349,14 +349,14 @@ def _kpp_init(self, D, random_state_, n_local_trials=None):
         """
         n_samples, _ = D.shape
 
-        centers = np.empty(self.n_clusters, dtype=int)
+        centers = np.empty(n_clusters, dtype=int)
 
         # Set the number of local seeding trials if none is given
         if n_local_trials is None:
             # This is what Arthur/Vassilvitskii tried, but did not report
             # specific results for other than mentioning in the conclusion
             # that it helped.
-            n_local_trials = 2 + int(np.log(self.n_clusters))
+            n_local_trials = 2 + int(np.log(n_clusters))
 
         center_id = random_state_.randint(n_samples)
         centers[0] = center_id
@@ -365,8 +365,8 @@ def _kpp_init(self, D, random_state_, n_local_trials=None):
         closest_dist_sq = D[centers[0], :]**2
         current_pot = closest_dist_sq.sum()
 
-        # pick the remaining self.n_clusters-1 points
-        for cluster_index in range(1, self.n_clusters):
+        # pick the remaining n_clusters-1 points
+        for cluster_index in range(1, n_clusters):
             rand_vals = (random_state_.random_sample(n_local_trials)
                          * current_pot)
             candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py
index f5abf722..0a802525 100644
--- a/sklearn_extra/cluster/tests/test_k_medoids.py
+++ b/sklearn_extra/cluster/tests/test_k_medoids.py
@@ -1,6 +1,7 @@
 """Testing for K-Medoids"""
 import warnings
 import numpy as np
+from unittest import mock
 from scipy.sparse import csc_matrix
 
 from sklearn.datasets import load_iris
@@ -101,12 +102,30 @@ def test_kmedoids_empty_clusters():
     assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X)
 
 
+@mock.patch.object(KMedoids, '_kpp_init', return_value=object())
+def test_kpp_called(_kpp_init_mocked):
+    """KMedoids._kpp_init method should be called by _initialize_medoids"""
+    D = np.array([[0, 1], [1, 0]])
+    n_clusters = 2
+    rng = np.random.RandomState(seed)
+    kmedoids = KMedoids()
+    kmedoids.init = 'k-medoids++'
+    # set _kpp_init_mocked.return_value to a singleton
+    initial_medoids = kmedoids._initialize_medoids(
+        D,
+        n_clusters,
+        rng,
+    )
+
+    # assert that _kpp_init was called and its result was returned.
+    _kpp_init_mocked.assert_called_once_with(D, n_clusters, rng)
+    assert initial_medoids == _kpp_init_mocked.return_value
+
+
 def test_kmedoids_pp():
     """Initial clusters should be well-separated for k-medoids++"""
     rng = np.random.RandomState(seed)
-    kmedoids = KMedoids(n_clusters=3,
-                        init="k-medoids++",
-                        random_state=rng)
+    kmedoids = KMedoids()
     X = [[10, 0],
          [11, 0],
          [0, 10],
@@ -118,7 +137,9 @@ def test_kmedoids_pp():
          ]
     D = euclidean_distances(X)
 
-    centers = kmedoids._initialize_medoids(D, 3, random_state_=rng)
+    centers = kmedoids._kpp_init(D,
+                                 n_clusters=3,
+                                 random_state_=rng)
 
     assert len(centers) == 3
 

From 8e534e8a9cf0f1b4a26d079de40eb096f51cfa69 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 11 May 2019 19:33:25 -0400
Subject: [PATCH 08/24] added documentation

---
 doc/api.rst        | 10 +++++++++
 doc/user_guide.rst | 56 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/doc/api.rst b/doc/api.rst
index e8de935e..fcb9b8a0 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -12,3 +12,13 @@ Kernel approximation
    :template: class.rst
 
    kernel_approximation.Fastfood
+
+Clustering
+====================
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   cluster.KMedoids
+
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
index a190e568..910339c3 100644
--- a/doc/user_guide.rst
+++ b/doc/user_guide.rst
@@ -6,4 +6,58 @@
 User guide
 ==========
 
-To add.
+.. _k_medoids:
+
+K-Medoids
+=========
+
+:class:`KMedoids` is related to the :class:`KMeans` algorithm. While
+:class:`KMeans` tries to minimize the within cluster sum-of-squares,
+:class:`KMedoids` tries to minimize the sum of distances between each point and
+the medoid of its cluster. The medoid is a data point (unlike the centroid)
+which has least total distance to the other members of its cluster. The use of
+a data point to represent each cluster's center allows the use of any distance
+metric for clustering.
+
+:class:`KMedoids` can be more robust to noise and outliers than :class:`KMeans`
+as it will choose one of the cluster members as the medoid while
+:class:`KMeans` will move the center of the cluster towards the outlier which
+might in turn move other points away from the cluster center.
+
+:class:`KMedoids` is also different from K-Medians, which is analogous to :class:`KMeans`
+except that the Manhattan Median is used for each cluster center instead of
+the centroid. K-Medians is robust to outliers, but it is limited to the
+Manhattan Distance metric and, similar to :class:`KMeans`, it does not guarantee
+that the center of each cluster will be a member of the original dataset.
+
+The complexity of K-Medoids is :math:`O(N^2 K T)` where :math:`N` is the number
+of samples, :math:`T` is the number of iterations and :math:`K` is the number of
+clusters. This makes it more suitable for smaller datasets in comparison to
+:class:`KMeans` which is :math:`O(N K T)`.
+
+.. topic:: Examples:
+
+
+
+**Algorithm description:**
+There are several algorithms to compute K-Medoids, though :class:`KMedoids`
+currently only supports Partitioning Around Medoids (PAM). The PAM algorithm
+uses a greedy search, which may fail to find the global optimum. It consists of
+two alternating steps commonly called the
+Assignment and Update steps (BUILD and SWAP in Kaufman and Rousseeuw, 1987).
+
+PAM works as follows:
+
+* Initialize: Select ``n_clusters`` points from the dataset as the initial medoids
+  using a heuristic, random, or k-medoids++ approach (configurable using the ``init`` parameter).
+* Assignment step: assign each element from the dataset to the closest medoid.
+* Update step: Identify the new medoid of each cluster.
+* Repeat the assignment and update steps until the medoids stop changing or
+  the maximum number of iterations ``max_iter`` is reached.
+
+.. topic:: References:
+
+ * "Clustering by Means of Medoids"
+   Kaufman, L. and Rousseeuw, P.J.,
+   Statistical Data Analysis Based on the L1-Norm and Related Methods, edited
+   by Y. Dodge, North-Holland, 405-416. 1987
\ No newline at end of file

From 4d615291d91c8d4ca0e5582c2182a0fd1cf60b24 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Thu, 25 Jul 2019 22:33:24 -0400
Subject: [PATCH 09/24] Rename k_medoids_.py -> _k_medoids.py

---
 sklearn_extra/cluster/__init__.py                      | 2 +-
 sklearn_extra/cluster/{k_medoids_.py => _k_medoids.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename sklearn_extra/cluster/{k_medoids_.py => _k_medoids.py} (100%)

diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py
index d30e7b64..6b7d4c8d 100644
--- a/sklearn_extra/cluster/__init__.py
+++ b/sklearn_extra/cluster/__init__.py
@@ -1,4 +1,4 @@
-from .k_medoids_ import KMedoids
+from ._k_medoids import KMedoids
 
 __all__ = [
     'KMedoids',
diff --git a/sklearn_extra/cluster/k_medoids_.py b/sklearn_extra/cluster/_k_medoids.py
similarity index 100%
rename from sklearn_extra/cluster/k_medoids_.py
rename to sklearn_extra/cluster/_k_medoids.py

From 03f9e5492cc872d3d7d4eefe86175f8d77c9d244 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Thu, 25 Jul 2019 22:33:44 -0400
Subject: [PATCH 10/24] Update conf.py to include mathjax

---
 doc/conf.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index a4cf131d..6e6357df 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -48,13 +48,23 @@
 # pngmath / imgmath compatibility layer for different sphinx versions
 import sphinx
 from distutils.version import LooseVersion
-if LooseVersion(sphinx.__version__) < LooseVersion('1.4'):
-    extensions.append('sphinx.ext.pngmath')
-else:
-    extensions.append('sphinx.ext.imgmath')
+# if LooseVersion(sphinx.__version__) < LooseVersion('1.4'):
+#     extensions.append('sphinx.ext.pngmath')
+# else:
+#     extensions.append('sphinx.ext.imgmath')
 
 autodoc_default_flags = ['members', 'inherited-members']
 
+# For maths, use mathjax by default and svg if NO_MATHJAX env variable is set
+# (useful for viewing the doc offline)
+if os.environ.get('NO_MATHJAX'):
+    extensions.append('sphinx.ext.imgmath')
+    imgmath_image_format = 'svg'
+else:
+    extensions.append('sphinx.ext.mathjax')
+    mathjax_path = ('https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/'
+                    'MathJax.js?config=TeX-AMS_SVG')
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 

From 2e952872def7ea7119ee9acbc1de96ebc2008c17 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:17:28 -0400
Subject: [PATCH 11/24] Add KMedoids to test_common.py

---
 sklearn_extra/tests/test_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py
index f22cdab0..f1b1e862 100644
--- a/sklearn_extra/tests/test_common.py
+++ b/sklearn_extra/tests/test_common.py
@@ -3,11 +3,11 @@
 from sklearn.utils.estimator_checks import check_estimator
 
 from sklearn_extra.kernel_approximation import Fastfood
-
+from sklearn_extra.cluster import KMedoids
 
 @pytest.mark.parametrize(
     "Estimator",
-    [Fastfood]
+    [Fastfood, KMedoids]
 )
 def test_all_estimators(Estimator, request):
     return check_estimator(Estimator)

From 0e1ee5bb3651d8af2db39a8bbc6c0442f81f0903 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:17:37 -0400
Subject: [PATCH 12/24] add plot_kmedoids_digits.py

---
 examples/plot_kmedoids_digits.py | 97 ++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 examples/plot_kmedoids_digits.py

diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py
new file mode 100644
index 00000000..dbeab2e7
--- /dev/null
+++ b/examples/plot_kmedoids_digits.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""
+=============================================================
+A demo of K-Medoids clustering on the handwritten digits data
+=============================================================
+In this example we compare different pairwise distance
+metrics for K-Medoids.
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+
+from collections import namedtuple
+from sklearn.cluster import KMeans
+from sklearn_extra.cluster import KMedoids
+from sklearn.datasets import load_digits
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import scale
+
+print(__doc__)
+
+# Authors: Timo Erkkilä <timo.erkkila@gmail.com>
+#          Antti Lehmussola <antti.lehmussola@gmail.com>
+#          Kornel Kiełczewski <kornel.mail@gmail.com>
+# License: BSD 3 clause
+
+np.random.seed(42)
+
+digits = load_digits()
+data = scale(digits.data)
+n_digits = len(np.unique(digits.target))
+
+reduced_data = PCA(n_components=2).fit_transform(data)
+
+# Step size of the mesh. Decrease to increase the quality of the VQ.
+h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].
+
+# Plot the decision boundary. For that, we will assign a color to each
+x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
+y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
+
+plt.figure()
+plt.clf()
+
+plt.suptitle("Comparing multiple K-Medoids metrics to K-Means and each other",
+             fontsize=14)
+
+Algorithm = namedtuple('ClusterAlgorithm', ['model', 'description'])
+
+selected_models = [
+    Algorithm(KMedoids(metric='manhattan',
+                       n_clusters=n_digits),
+              'KMedoids (manhattan)'),
+    Algorithm(KMedoids(metric='euclidean',
+                       n_clusters=n_digits),
+              'KMedoids (euclidean)'),
+    Algorithm(KMedoids(metric='cosine',
+                       n_clusters=n_digits),
+              'KMedoids (cosine)'),
+    Algorithm(KMeans(n_clusters=n_digits),
+              'KMeans')
+    ]
+
+plot_rows = int(np.ceil(len(selected_models) / 2.0))
+plot_cols = 2
+
+for i, (model, description) in enumerate(selected_models):
+
+    # Obtain labels for each point in mesh. Use last trained model.
+    model.fit(reduced_data)
+    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
+
+    # Put the result into a color plot
+    Z = Z.reshape(xx.shape)
+    plt.subplot(plot_cols, plot_rows, i + 1)
+    plt.imshow(Z, interpolation='nearest',
+               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+               cmap=plt.cm.Paired,
+               aspect='auto', origin='lower')
+
+    plt.plot(reduced_data[:, 0],
+             reduced_data[:, 1],
+             'k.', markersize=2,
+             alpha=0.3,
+             )
+    # Plot the centroids as a white X
+    centroids = model.cluster_centers_
+    plt.scatter(centroids[:, 0], centroids[:, 1],
+                marker='x', s=169, linewidths=3,
+                color='w', zorder=10)
+    plt.title(description)
+    plt.xlim(x_min, x_max)
+    plt.ylim(y_min, y_max)
+    plt.xticks(())
+    plt.yticks(())
+
+plt.show()
\ No newline at end of file

From ee1688be53543e0e263d0d217282c585a6f6d530 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:24:02 -0400
Subject: [PATCH 13/24] Add Examples line to KMedoids docstring

---
 sklearn_extra/cluster/_k_medoids.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index b3209929..313b70cd 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -95,6 +95,12 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     -----
     Since all pairwise distances are calculated and stored in memory for
     the duration of fit, the space complexity is O(n_samples ** 2).
+
+    Examples
+    --------
+    See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples
+    of KMedoids with various distance metrics.
+    
     """
 
     def __init__(self, n_clusters=8, metric='euclidean',

From e96e2b08067dfc661da0607b3614c8a35e3091c5 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:25:51 -0400
Subject: [PATCH 14/24] Remove duplicate examples section in _k_medoids.py
 docstring

---
 sklearn_extra/cluster/_k_medoids.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index 313b70cd..5dc3aa86 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -78,6 +78,9 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     >>> kmedoids.inertia_
     8.0
 
+    See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples
+    of KMedoids with various distance metrics.
+
     References
     ----------
     Kaufman, L. and Rousseeuw, P.J., Statistical Data Analysis Based on
@@ -98,8 +101,7 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
 
     Examples
     --------
-    See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples
-    of KMedoids with various distance metrics.
+
     
     """
 

From 07f6e3c2634a2f3b4bf6451dfec7de1c06f774c6 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:26:39 -0400
Subject: [PATCH 15/24] ACTUALLY remove duplicate examples section

---
 sklearn_extra/cluster/_k_medoids.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index 5dc3aa86..706d1f2b 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -98,10 +98,6 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     -----
     Since all pairwise distances are calculated and stored in memory for
     the duration of fit, the space complexity is O(n_samples ** 2).
-
-    Examples
-    --------
-
     
     """
 

From 99108048a0a4c3d72a7db81616da0ed6f3c26b46 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:35:24 -0400
Subject: [PATCH 16/24] Add sphinx gallery of plot_kmedoids_digits.py

---
 doc/user_guide.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/user_guide.rst b/doc/user_guide.rst
index 910339c3..084e838b 100644
--- a/doc/user_guide.rst
+++ b/doc/user_guide.rst
@@ -37,6 +37,8 @@ clusters. This makes it more suitable for smaller datasets in comparison to
 
 .. topic:: Examples:
 
+ * :ref:`sphx_glr_auto_examples_plot_kmedoids_digits.py`: Applying K-Medoids on digits
+   with various distance metrics.
 
 
 **Algorithm description:**

From 0c8d032d145f889f659de5380479ff72caa55b3f Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Fri, 26 Jul 2019 00:55:19 -0400
Subject: [PATCH 17/24] Added k-medoids++ to help message

---
 sklearn_extra/cluster/_k_medoids.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index 706d1f2b..638076e5 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -33,10 +33,14 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     metric : string, or callable, optional, default: 'euclidean'
         What distance metric to use. See :func:metrics.pairwise_distances
 
-    init : {'random', 'heuristic'}, optional, default: 'heuristic'
-        Specify medoid initialization method. Random selects n_clusters
-        elements from the dataset, while heuristic picks the n_clusters points
-        with the smallest sum distance to every other point.
+    init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic'
+        Specify medoid initialization method. 'random' selects n_clusters
+        elements from the dataset. 'heuristic' picks the n_clusters points
+        with the smallest sum distance to every other point. 'k-medoids++'
+        follows an approach based on k-means++_, and in general, gives initial
+        medoids which are more separated than those generated by the other methods.
+        
+        .. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf
 
     max_iter : int, optional, default : 300
         Specify the maximum number of iterations when fitting.

From 3d7100195d636e5cb7aec6425e0f29bb9a097fe3 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 27 Jul 2019 09:16:25 -0400
Subject: [PATCH 18/24] Run `black` on code

---
 benchmarks/bench_rbfsampler_fastfood.py       |  23 +-
 doc/conf.py                                   | 204 ++++++++++--------
 examples/plot_kmedoids_digits.py              |  71 +++---
 sklearn_extra/cluster/__init__.py             |   4 +-
 sklearn_extra/cluster/_k_medoids.py           | 121 +++++++----
 sklearn_extra/cluster/tests/test_k_medoids.py | 175 ++++++++-------
 sklearn_extra/tests/test_common.py            |   6 +-
 7 files changed, 328 insertions(+), 276 deletions(-)

diff --git a/benchmarks/bench_rbfsampler_fastfood.py b/benchmarks/bench_rbfsampler_fastfood.py
index 42bea9b4..11f5df9b 100644
--- a/benchmarks/bench_rbfsampler_fastfood.py
+++ b/benchmarks/bench_rbfsampler_fastfood.py
@@ -15,9 +15,9 @@
 Y /= Y.sum(axis=1)[:, np.newaxis]
 
 # calculate feature maps
-gamma = 10.
+gamma = 10.0
 sigma = np.sqrt(1 / (2 * gamma))
-number_of_features_to_generate = 4096*4
+number_of_features_to_generate = 4096 * 4
 
 exact_start = datetime.datetime.utcnow()
 # original rbf kernel method:
@@ -27,23 +27,24 @@
 exact_spent_time = exact_end - exact_start
 print("Timimg exact rbf: \t\t", exact_spent_time)
 
-rbf_transform = Fastfood(sigma=sigma,
-                         n_components=number_of_features_to_generate,
-                         tradeoff_mem_accuracy='mem',
-                         random_state=42)
+rbf_transform = Fastfood(
+    sigma=sigma,
+    n_components=number_of_features_to_generate,
+    tradeoff_mem_accuracy="mem",
+    random_state=42,
+)
 _ = rbf_transform.fit(X)
 fastfood_fast_vec_start = datetime.datetime.utcnow()
 # Fastfood: approximate kernel mapping
 _ = rbf_transform.transform(X)
 _ = rbf_transform.transform(Y)
 fastfood_fast_vec_end = datetime.datetime.utcnow()
-fastfood_fast_vec_spent_time = fastfood_fast_vec_end - \
-    fastfood_fast_vec_start
+fastfood_fast_vec_spent_time = fastfood_fast_vec_end - fastfood_fast_vec_start
 print("Timimg fastfood fast vectorized: \t\t", fastfood_fast_vec_spent_time)
 
-rks_rbf_transform = RBFSampler(gamma=gamma,
-                               n_components=number_of_features_to_generate,
-                               random_state=42)
+rks_rbf_transform = RBFSampler(
+    gamma=gamma, n_components=number_of_features_to_generate, random_state=42
+)
 _ = rks_rbf_transform.fit(X)
 rks_start = datetime.datetime.utcnow()
 # Random Kitchens Sinks: approximate kernel mapping
diff --git a/doc/conf.py b/doc/conf.py
index 4f3e502c..1cbdd442 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -21,24 +21,24 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
 
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.doctest',
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.viewcode',
-    'numpydoc',
-    'sphinx_gallery.gen_gallery',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.doctest",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.viewcode",
+    "numpydoc",
+    "sphinx_gallery.gen_gallery",
 ]
 
 # this is needed for some reason...
@@ -48,44 +48,47 @@
 # pngmath / imgmath compatibility layer for different sphinx versions
 import sphinx
 from distutils.version import LooseVersion
+
 # if LooseVersion(sphinx.__version__) < LooseVersion('1.4'):
 #     extensions.append('sphinx.ext.pngmath')
 # else:
 #     extensions.append('sphinx.ext.imgmath')
 
-autodoc_default_flags = ['members', 'inherited-members']
+autodoc_default_flags = ["members", "inherited-members"]
 
 # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set
 # (useful for viewing the doc offline)
-if os.environ.get('NO_MATHJAX'):
-    extensions.append('sphinx.ext.imgmath')
-    imgmath_image_format = 'svg'
+if os.environ.get("NO_MATHJAX"):
+    extensions.append("sphinx.ext.imgmath")
+    imgmath_image_format = "svg"
 else:
-    extensions.append('sphinx.ext.mathjax')
-    mathjax_path = ('https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/'
-                    'MathJax.js?config=TeX-AMS_SVG')
+    extensions.append("sphinx.ext.mathjax")
+    mathjax_path = (
+        "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/"
+        "MathJax.js?config=TeX-AMS_SVG"
+    )
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # generate autosummary even if no references
 autosummary_generate = True
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
 
 # Generate the plots for the gallery
 plot_gallery = True
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'scikit-learn-extra'
-copyright = u'2019, scikit-learn-extra developpers'
+project = u"scikit-learn-extra"
+copyright = u"2019, scikit-learn-extra developpers"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -93,177 +96,181 @@
 #
 # The short X.Y version.
 from sklearn_extra import __version__
+
 version = __version__
 # The full version, including alpha/beta/rc tags.
 release = __version__
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
-#language = None
+# language = None
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
-#today = ''
+# today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build', '_templates']
+exclude_patterns = ["_build", "_templates"]
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
-#default_role = None
+# default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
-#show_authors = False
+# show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # Custom style
-html_style = 'css/project-template.css'
+html_style = "css/project-template.css"
 
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 
 # If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
 
 
 # -- Options for HTML output ----------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
 html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
 # The name for this set of Sphinx documents.  If None, it defaults to
 # "<project> v<release> documentation".
-#html_title = None
+# html_title = None
 
 # A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
 # directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
 
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it.  The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 
 # This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'project-templatedoc'
+htmlhelp_basename = "project-templatedoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
 
 latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+    # The paper size ('letterpaper' or 'a4paper').
+    #'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    #'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    #'preamble': '',
 }
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  ('index', 'project-template.tex', u'project-template Documentation',
-   u'Vighnesh Birodkar', 'manual'),
+    (
+        "index",
+        "project-template.tex",
+        u"project-template Documentation",
+        u"Vighnesh Birodkar",
+        "manual",
+    )
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
-#latex_logo = None
+# latex_logo = None
 
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 
 # If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
 
 # If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
 
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 
 # If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
 
 
 # -- Options for manual page output ---------------------------------------
@@ -271,12 +278,17 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'project-template', u'project-template Documentation',
-     [u'Vighnesh Birodkar'], 1)
+    (
+        "index",
+        "project-template",
+        u"project-template Documentation",
+        [u"Vighnesh Birodkar"],
+        1,
+    )
 ]
 
 # If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
 
 
 # -- Options for Texinfo output -------------------------------------------
@@ -285,43 +297,51 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  ('index', 'project-template', u'project-template Documentation',
-   u'Vighnesh Birodkar', 'project-template', 'One line description of project.',
-   'Miscellaneous'),
+    (
+        "index",
+        "project-template",
+        u"project-template Documentation",
+        u"Vighnesh Birodkar",
+        "project-template",
+        "One line description of project.",
+        "Miscellaneous",
+    )
 ]
 
 # Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
 
 # If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
 
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
 
 # If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
 
 
 # Example configuration for intersphinx: refer to the Python standard library.
 # intersphinx configuration
 intersphinx_mapping = {
-    'python': ('https://docs.python.org/{.major}'.format(
-        sys.version_info), None),
-    'numpy': ('https://docs.scipy.org/doc/numpy/', None),
-    'scipy': ('https://docs.scipy.org/doc/scipy/reference', None),
-    'matplotlib': ('https://matplotlib.org/', None),
-    'sklearn': ('http://scikit-learn.org/stable', None)
+    "python": (
+        "https://docs.python.org/{.major}".format(sys.version_info),
+        None,
+    ),
+    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
+    "matplotlib": ("https://matplotlib.org/", None),
+    "sklearn": ("http://scikit-learn.org/stable", None),
 }
 
 # sphinx-gallery configuration
 sphinx_gallery_conf = {
-    'doc_module': 'sklearn_extra',
-    'backreferences_dir': os.path.join('generated'),
-    'reference_url': {
-        'sklearn_extra': None}
+    "doc_module": "sklearn_extra",
+    "backreferences_dir": os.path.join("generated"),
+    "reference_url": {"sklearn_extra": None},
 }
 
+
 def setup(app):
     # a copy button to copy snippet of code from the documentation
-    app.add_javascript('js/copybutton.js')
+    app.add_javascript("js/copybutton.js")
diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py
index dbeab2e7..c74d9ab2 100644
--- a/examples/plot_kmedoids_digits.py
+++ b/examples/plot_kmedoids_digits.py
@@ -32,7 +32,7 @@
 reduced_data = PCA(n_components=2).fit_transform(data)
 
 # Step size of the mesh. Decrease to increase the quality of the VQ.
-h = .02     # point in the mesh [x_min, m_max]x[y_min, y_max].
+h = 0.02  # point in the mesh [x_min, m_max]x[y_min, y_max].
 
 # Plot the decision boundary. For that, we will assign a color to each
 x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
@@ -42,24 +42,27 @@
 plt.figure()
 plt.clf()
 
-plt.suptitle("Comparing multiple K-Medoids metrics to K-Means and each other",
-             fontsize=14)
+plt.suptitle(
+    "Comparing multiple K-Medoids metrics to K-Means and each other",
+    fontsize=14,
+)
 
-Algorithm = namedtuple('ClusterAlgorithm', ['model', 'description'])
+Algorithm = namedtuple("ClusterAlgorithm", ["model", "description"])
 
 selected_models = [
-    Algorithm(KMedoids(metric='manhattan',
-                       n_clusters=n_digits),
-              'KMedoids (manhattan)'),
-    Algorithm(KMedoids(metric='euclidean',
-                       n_clusters=n_digits),
-              'KMedoids (euclidean)'),
-    Algorithm(KMedoids(metric='cosine',
-                       n_clusters=n_digits),
-              'KMedoids (cosine)'),
-    Algorithm(KMeans(n_clusters=n_digits),
-              'KMeans')
-    ]
+    Algorithm(
+        KMedoids(metric="manhattan", n_clusters=n_digits),
+        "KMedoids (manhattan)",
+    ),
+    Algorithm(
+        KMedoids(metric="euclidean", n_clusters=n_digits),
+        "KMedoids (euclidean)",
+    ),
+    Algorithm(
+        KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)"
+    ),
+    Algorithm(KMeans(n_clusters=n_digits), "KMeans"),
+]
 
 plot_rows = int(np.ceil(len(selected_models) / 2.0))
 plot_cols = 2
@@ -73,25 +76,33 @@
     # Put the result into a color plot
     Z = Z.reshape(xx.shape)
     plt.subplot(plot_cols, plot_rows, i + 1)
-    plt.imshow(Z, interpolation='nearest',
-               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-               cmap=plt.cm.Paired,
-               aspect='auto', origin='lower')
-
-    plt.plot(reduced_data[:, 0],
-             reduced_data[:, 1],
-             'k.', markersize=2,
-             alpha=0.3,
-             )
+    plt.imshow(
+        Z,
+        interpolation="nearest",
+        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+        cmap=plt.cm.Paired,
+        aspect="auto",
+        origin="lower",
+    )
+
+    plt.plot(
+        reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2, alpha=0.3
+    )
     # Plot the centroids as a white X
     centroids = model.cluster_centers_
-    plt.scatter(centroids[:, 0], centroids[:, 1],
-                marker='x', s=169, linewidths=3,
-                color='w', zorder=10)
+    plt.scatter(
+        centroids[:, 0],
+        centroids[:, 1],
+        marker="x",
+        s=169,
+        linewidths=3,
+        color="w",
+        zorder=10,
+    )
     plt.title(description)
     plt.xlim(x_min, x_max)
     plt.ylim(y_min, y_max)
     plt.xticks(())
     plt.yticks(())
 
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py
index 6b7d4c8d..bbdaaf41 100644
--- a/sklearn_extra/cluster/__init__.py
+++ b/sklearn_extra/cluster/__init__.py
@@ -1,5 +1,3 @@
 from ._k_medoids import KMedoids
 
-__all__ = [
-    'KMedoids',
-]
+__all__ = ["KMedoids"]
diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index 638076e5..def6560d 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -12,7 +12,10 @@
 import numpy as np
 
 from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
-from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin
+from sklearn.metrics.pairwise import (
+    pairwise_distances,
+    pairwise_distances_argmin,
+)
 from sklearn.utils import check_array, check_random_state
 from sklearn.utils.extmath import stable_cumsum
 from sklearn.utils.validation import check_is_fitted
@@ -105,8 +108,14 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
     
     """
 
-    def __init__(self, n_clusters=8, metric='euclidean',
-                 init='heuristic', max_iter=300, random_state=None):
+    def __init__(
+        self,
+        n_clusters=8,
+        metric="euclidean",
+        init="heuristic",
+        max_iter=300,
+        random_state=None,
+    ):
         self.n_clusters = n_clusters
         self.metric = metric
         self.init = init
@@ -116,10 +125,15 @@ def __init__(self, n_clusters=8, metric='euclidean',
     def _check_nonnegative_int(self, value, desc):
         """Validates if value is a valid integer > 0"""
 
-        if (value is None or value <= 0 or
-                not isinstance(value, (int, np.integer))):
-            raise ValueError("%s should be a nonnegative integer. "
-                             "%s was given" % (desc, value))
+        if (
+            value is None
+            or value <= 0
+            or not isinstance(value, (int, np.integer))
+        ):
+            raise ValueError(
+                "%s should be a nonnegative integer. "
+                "%s was given" % (desc, value)
+            )
 
     def _check_init_args(self):
         """Validates the input arguments. """
@@ -129,11 +143,13 @@ def _check_init_args(self):
         self._check_nonnegative_int(self.max_iter, "max_iter")
 
         # Check init
-        init_methods = ['random', 'heuristic', 'k-medoids++']
+        init_methods = ["random", "heuristic", "k-medoids++"]
         if self.init not in init_methods:
-            raise ValueError("init needs to be one of " +
-                             "the following: " +
-                             "%s" % init_methods)
+            raise ValueError(
+                "init needs to be one of "
+                + "the following: "
+                + "%s" % init_methods
+            )
 
     def fit(self, X, y=None):
         """Fit K-Medoids to the provided data.
@@ -153,17 +169,18 @@ def fit(self, X, y=None):
         random_state_ = check_random_state(self.random_state)
 
         self._check_init_args()
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(X, accept_sparse=["csr", "csc"])
         if self.n_clusters > X.shape[0]:
-            raise ValueError("The number of medoids (%d) must be less "
-                             "than the number of samples %d."
-                             % (self.n_clusters, X.shape[0]))
+            raise ValueError(
+                "The number of medoids (%d) must be less "
+                "than the number of samples %d."
+                % (self.n_clusters, X.shape[0])
+            )
 
         D = pairwise_distances(X, metric=self.metric)
-        medoid_idxs = self._initialize_medoids(D,
-                                               self.n_clusters,
-                                               random_state_,
-                                               )
+        medoid_idxs = self._initialize_medoids(
+            D, self.n_clusters, random_state_
+        )
         labels = None
 
         # Continue the algorithm as long as
@@ -178,10 +195,12 @@ def fit(self, X, y=None):
             if np.all(old_medoid_idxs == medoid_idxs):
                 break
             elif self.n_iter_ == self.max_iter - 1:
-                warnings.warn("Maximum number of iteration reached before "
-                              "convergence. Consider increasing max_iter to "
-                              "improve the fit.",
-                              ConvergenceWarning)
+                warnings.warn(
+                    "Maximum number of iteration reached before "
+                    "convergence. Consider increasing max_iter to "
+                    "improve the fit.",
+                    ConvergenceWarning,
+                )
 
         # Set the resulting instance variables.
         if self.metric == "precomputed":
@@ -212,11 +231,13 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
                     "Cluster {k} is empty! "
                     "self.labels_[self.medoid_indices_[{k}]] "
                     "may not be labeled with "
-                    "its corresponding cluster ({k}).".format(k=k))
+                    "its corresponding cluster ({k}).".format(k=k)
+                )
                 continue
 
-            in_cluster_distances = D[cluster_k_idxs,
-                                     cluster_k_idxs[:, np.newaxis]]
+            in_cluster_distances = D[
+                cluster_k_idxs, cluster_k_idxs[:, np.newaxis]
+            ]
 
             # Calculate all costs from each point to all others in the cluster
             in_cluster_all_costs = np.sum(in_cluster_distances, axis=1)
@@ -224,7 +245,8 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs):
             min_cost_idx = np.argmin(in_cluster_all_costs)
             min_cost = in_cluster_all_costs[min_cost_idx]
             curr_cost = in_cluster_all_costs[
-                np.argmax(cluster_k_idxs == medoid_idxs[k])]
+                np.argmax(cluster_k_idxs == medoid_idxs[k])
+            ]
 
             # Adopt a new medoid if its distance is smaller then the current
             if min_cost < curr_cost:
@@ -244,7 +266,7 @@ def transform(self, X):
         X_new : {array-like, sparse matrix}, shape=(n_samples, n_clusters)
             X transformed in the new space of distances to cluster centers.
         """
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(X, accept_sparse=["csr", "csc"])
 
         if self.metric == "precomputed":
             check_is_fitted(self, "medoid_indices_")
@@ -253,8 +275,7 @@ def transform(self, X):
             check_is_fitted(self, "cluster_centers_")
 
             Y = self.cluster_centers_
-            return pairwise_distances(X, Y=Y,
-                                      metric=self.metric)
+            return pairwise_distances(X, Y=Y, metric=self.metric)
 
     def predict(self, X):
         """Predict the closest cluster for each sample in X.
@@ -270,7 +291,7 @@ def predict(self, X):
         labels : array, shape = (n_samples,)
             Index of the cluster each sample belongs to.
         """
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(X, accept_sparse=["csr", "csc"])
 
         if self.metric == "precomputed":
             check_is_fitted(self, "medoid_indices_")
@@ -280,8 +301,9 @@ def predict(self, X):
 
             # Return data points to clusters based on which cluster assignment
             # yields the smallest distance
-            return pairwise_distances_argmin(X, Y=self.cluster_centers_,
-                                             metric=self.metric)
+            return pairwise_distances_argmin(
+                X, Y=self.cluster_centers_, metric=self.metric
+            )
 
     def _compute_inertia(self, distances):
         """Compute inertia of new samples. Inertia is defined as the sum of the
@@ -306,19 +328,21 @@ def _compute_inertia(self, distances):
     def _initialize_medoids(self, D, n_clusters, random_state_):
         """Select initial mediods when beginning clustering."""
 
-        if self.init == 'random':  # Random initialization
+        if self.init == "random":  # Random initialization
             # Pick random k medoids as the initial ones.
             medoids = random_state_.choice(len(D), n_clusters)
-        elif self.init == 'k-medoids++':
+        elif self.init == "k-medoids++":
             medoids = self._kpp_init(D, n_clusters, random_state_)
         elif self.init == "heuristic":  # Initialization by heuristic
             # Pick K first data points that have the smallest sum distance
             # to every other point. These are the initial medoids.
-            medoids = np.argpartition(np.sum(D, axis=1),
-                                      n_clusters-1)[:n_clusters]
+            medoids = np.argpartition(np.sum(D, axis=1), n_clusters - 1)[
+                :n_clusters
+            ]
         else:
-            raise ValueError("init value '{init}' not recognized"
-                             .format(init=self.init))
+            raise ValueError(
+                "init value '{init}' not recognized".format(init=self.init)
+            )
 
         return medoids
 
@@ -370,18 +394,20 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
         centers[0] = center_id
 
         # Initialize list of closest distances and calculate current potential
-        closest_dist_sq = D[centers[0], :]**2
+        closest_dist_sq = D[centers[0], :] ** 2
         current_pot = closest_dist_sq.sum()
 
         # pick the remaining n_clusters-1 points
         for cluster_index in range(1, n_clusters):
-            rand_vals = (random_state_.random_sample(n_local_trials)
-                         * current_pot)
-            candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
-                                            rand_vals)
+            rand_vals = (
+                random_state_.random_sample(n_local_trials) * current_pot
+            )
+            candidate_ids = np.searchsorted(
+                stable_cumsum(closest_dist_sq), rand_vals
+            )
 
             # Compute distances to center candidates
-            distance_to_candidates = D[candidate_ids, :]**2
+            distance_to_candidates = D[candidate_ids, :] ** 2
 
             # Decide which candidate is the best
             best_candidate = None
@@ -389,8 +415,9 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
             best_dist_sq = None
             for trial in range(n_local_trials):
                 # Compute potential when including center candidate
-                new_dist_sq = np.minimum(closest_dist_sq,
-                                         distance_to_candidates[trial])
+                new_dist_sq = np.minimum(
+                    closest_dist_sq, distance_to_candidates[trial]
+                )
                 new_pot = new_dist_sq.sum()
 
                 # Store result if it is the best local trial so far
diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py
index 0a802525..0b125f36 100644
--- a/sklearn_extra/cluster/tests/test_k_medoids.py
+++ b/sklearn_extra/cluster/tests/test_k_medoids.py
@@ -21,31 +21,51 @@
 def test_kmedoids_input_validation_and_fit_check():
     rng = np.random.RandomState(seed)
     # Invalid parameters
-    assert_raise_message(ValueError, "n_clusters should be a nonnegative "
-                                     "integer. 0 was given",
-                         KMedoids(n_clusters=0).fit, X)
+    assert_raise_message(
+        ValueError,
+        "n_clusters should be a nonnegative " "integer. 0 was given",
+        KMedoids(n_clusters=0).fit,
+        X,
+    )
 
-    assert_raise_message(ValueError, "n_clusters should be a nonnegative "
-                                     "integer. None was given",
-                         KMedoids(n_clusters=None).fit, X)
+    assert_raise_message(
+        ValueError,
+        "n_clusters should be a nonnegative " "integer. None was given",
+        KMedoids(n_clusters=None).fit,
+        X,
+    )
 
-    assert_raise_message(ValueError, "max_iter should be a nonnegative "
-                                     "integer. 0 was given",
-                         KMedoids(n_clusters=1, max_iter=0).fit, X)
+    assert_raise_message(
+        ValueError,
+        "max_iter should be a nonnegative " "integer. 0 was given",
+        KMedoids(n_clusters=1, max_iter=0).fit,
+        X,
+    )
 
-    assert_raise_message(ValueError, "max_iter should be a nonnegative "
-                                     "integer. None was given",
-                         KMedoids(n_clusters=1, max_iter=None).fit, X)
+    assert_raise_message(
+        ValueError,
+        "max_iter should be a nonnegative " "integer. None was given",
+        KMedoids(n_clusters=1, max_iter=None).fit,
+        X,
+    )
 
-    assert_raise_message(ValueError, "init needs to be one of the following: "
-                                     "['random', 'heuristic', 'k-medoids++']",
-                         KMedoids(init=None).fit, X)
+    assert_raise_message(
+        ValueError,
+        "init needs to be one of the following: "
+        "['random', 'heuristic', 'k-medoids++']",
+        KMedoids(init=None).fit,
+        X,
+    )
 
     # Trying to fit 3 samples to 8 clusters
     Xsmall = rng.rand(5, 2)
-    assert_raise_message(ValueError, "The number of medoids (8) must be less "
-                                     "than the number of samples 5.",
-                         KMedoids(n_clusters=8).fit, Xsmall)
+    assert_raise_message(
+        ValueError,
+        "The number of medoids (8) must be less "
+        "than the number of samples 5.",
+        KMedoids(n_clusters=8).fit,
+        Xsmall,
+    )
 
 
 def test_random_deterministic():
@@ -55,9 +75,7 @@ def test_random_deterministic():
     X = load_iris()["data"]
     D = euclidean_distances(X)
 
-    medoids = KMedoids(
-        init="random",
-        )._initialize_medoids(D, 4, rng)
+    medoids = KMedoids(init="random")._initialize_medoids(D, 4, rng)
     assert_array_equal(medoids, [47, 117, 67, 103])
 
 
@@ -68,13 +86,9 @@ def test_heuristic_deterministic():
     X = load_iris()["data"]
     D = euclidean_distances(X)
 
-    medoids_1 = KMedoids(
-        init="heuristic",
-        )._initialize_medoids(D, 10, rng1)
+    medoids_1 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng1)
 
-    medoids_2 = KMedoids(
-        init="heuristic",
-        )._initialize_medoids(D, 10, rng2)
+    medoids_2 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng2)
 
     assert_array_equal(medoids_1, medoids_2)
 
@@ -102,20 +116,16 @@ def test_kmedoids_empty_clusters():
     assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X)
 
 
-@mock.patch.object(KMedoids, '_kpp_init', return_value=object())
+@mock.patch.object(KMedoids, "_kpp_init", return_value=object())
 def test_kpp_called(_kpp_init_mocked):
     """KMedoids._kpp_init method should be called by _initialize_medoids"""
     D = np.array([[0, 1], [1, 0]])
     n_clusters = 2
     rng = np.random.RandomState(seed)
     kmedoids = KMedoids()
-    kmedoids.init = 'k-medoids++'
+    kmedoids.init = "k-medoids++"
     # set _kpp_init_mocked.return_value to a singleton
-    initial_medoids = kmedoids._initialize_medoids(
-        D,
-        n_clusters,
-        rng,
-    )
+    initial_medoids = kmedoids._initialize_medoids(D, n_clusters, rng)
 
     # assert that _kpp_init was called and its result was returned.
     _kpp_init_mocked.assert_called_once_with(D, n_clusters, rng)
@@ -126,20 +136,19 @@ def test_kmedoids_pp():
     """Initial clusters should be well-separated for k-medoids++"""
     rng = np.random.RandomState(seed)
     kmedoids = KMedoids()
-    X = [[10, 0],
-         [11, 0],
-         [0, 10],
-         [0, 11],
-         [10, 10],
-         [11, 10],
-         [12, 10],
-         [10, 11],
-         ]
+    X = [
+        [10, 0],
+        [11, 0],
+        [0, 10],
+        [0, 11],
+        [10, 10],
+        [11, 10],
+        [12, 10],
+        [10, 11],
+    ]
     D = euclidean_distances(X)
 
-    centers = kmedoids._kpp_init(D,
-                                 n_clusters=3,
-                                 random_state_=rng)
+    centers = kmedoids._kpp_init(D, n_clusters=3, random_state_=rng)
 
     assert len(centers) == 3
 
@@ -150,23 +159,12 @@ def test_kmedoids_pp():
 def test_precomputed():
     """Test the 'precomputed' distance metric."""
     rng = np.random.RandomState(seed)
-    X_1 = [
-        [1.0, 0.0],
-        [1.1, 0.0],
-        [0.0, 1.0],
-        [0.0, 1.1]
-    ]
+    X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]]
     D_1 = euclidean_distances(X_1)
-    X_2 = [
-        [1.1, 0.0],
-        [0.0, 0.9]
-    ]
+    X_2 = [[1.1, 0.0], [0.0, 0.9]]
     D_2 = euclidean_distances(X_2, X_1)
 
-    kmedoids = KMedoids(metric="precomputed",
-                        n_clusters=2,
-                        random_state=rng,
-                        )
+    kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng)
     kmedoids.fit(D_1)
 
     assert_allclose(kmedoids.inertia_, 0.2)
@@ -184,17 +182,18 @@ def test_precomputed():
 
 def test_kmedoids_fit_naive():
     n_clusters = 3
-    metric = 'euclidean'
+    metric = "euclidean"
 
     model = KMedoids(n_clusters=n_clusters, metric=metric)
     Xnaive = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
 
     model.fit(Xnaive)
 
-    assert_array_equal(model.cluster_centers_,
-                       [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    assert_array_equal(
+        model.cluster_centers_, [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
+    )
     assert_array_equal(model.labels_, [0, 1, 2])
-    assert model.inertia_ == 0.
+    assert model.inertia_ == 0.0
 
     # diagonal must be zero, off-diagonals must be positive
     X_new = model.transform(Xnaive)
@@ -208,37 +207,35 @@ def test_kmedoids_fit_naive():
 def test_max_iter():
     """Test that warning message is thrown when max_iter is reached."""
     rng = np.random.RandomState(seed)
-    X_iris = load_iris()['data']
+    X_iris = load_iris()["data"]
 
-    model = KMedoids(n_clusters=10,
-                     init='random',
-                     random_state=rng,
-                     max_iter=1,
-                     )
-    assert_warns_message(UserWarning,
-                         "Maximum number of iteration reached before",
-                         model.fit,
-                         X_iris,
-                         )
+    model = KMedoids(
+        n_clusters=10, init="random", random_state=rng, max_iter=1
+    )
+    assert_warns_message(
+        UserWarning,
+        "Maximum number of iteration reached before",
+        model.fit,
+        X_iris,
+    )
 
 
 def test_kmedoids_iris():
     """Test kmedoids on the Iris dataset"""
     rng = np.random.RandomState(seed)
-    X_iris = load_iris()['data']
+    X_iris = load_iris()["data"]
 
     ref_model = KMeans(n_clusters=3).fit(X_iris)
 
-    avg_dist_to_closest_centroid = ref_model\
-        .transform(X_iris).min(axis=1).mean()
+    avg_dist_to_closest_centroid = (
+        ref_model.transform(X_iris).min(axis=1).mean()
+    )
 
-    for init in ['random', 'heuristic', 'k-medoids++']:
-        distance_metric = 'euclidean'
-        model = KMedoids(n_clusters=3,
-                         metric=distance_metric,
-                         init=init,
-                         random_state=rng,
-                         )
+    for init in ["random", "heuristic", "k-medoids++"]:
+        distance_metric = "euclidean"
+        model = KMedoids(
+            n_clusters=3, metric=distance_metric, init=init, random_state=rng
+        )
         model.fit(X_iris)
 
         # test convergence in reasonable number of steps
@@ -254,8 +251,9 @@ def test_kmedoids_iris():
         # we can compare its performance to
         # K-Means. We want the average distance to cluster centers
         # to be similar between K-Means and K-Medoids
-        assert_allclose(avg_dist_to_closest_medoid,
-                        avg_dist_to_closest_centroid, rtol=0.1)
+        assert_allclose(
+            avg_dist_to_closest_medoid, avg_dist_to_closest_centroid, rtol=0.1
+        )
 
 
 def test_kmedoids_fit_predict_transform():
@@ -293,8 +291,7 @@ def test_outlier_robustness():
     kmeans = KMeans(n_clusters=2, random_state=rng)
     kmedoids = KMedoids(n_clusters=2, random_state=rng)
 
-    X = [[-11, 0], [-10, 0], [-9, 0],
-         [0, 0], [1, 0], [2, 0], [1000, 0]]
+    X = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0], [1000, 0]]
 
     kmeans.fit(X)
     kmedoids.fit(X)
diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py
index f1b1e862..cfcbf9d0 100644
--- a/sklearn_extra/tests/test_common.py
+++ b/sklearn_extra/tests/test_common.py
@@ -5,9 +5,7 @@
 from sklearn_extra.kernel_approximation import Fastfood
 from sklearn_extra.cluster import KMedoids
 
-@pytest.mark.parametrize(
-    "Estimator",
-    [Fastfood, KMedoids]
-)
+
+@pytest.mark.parametrize("Estimator", [Fastfood, KMedoids])
 def test_all_estimators(Estimator, request):
     return check_estimator(Estimator)

From 182d505ed50e2acbd7ebd8e1d7adcd2cfae017e8 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 27 Jul 2019 16:43:30 -0400
Subject: [PATCH 19/24] Remove commented out math code

---
 doc/conf.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index 1cbdd442..c39936a0 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -45,15 +45,6 @@
 # see https://github.com/numpy/numpydoc/issues/69
 numpydoc_show_class_members = False
 
-# pngmath / imgmath compatibility layer for different sphinx versions
-import sphinx
-from distutils.version import LooseVersion
-
-# if LooseVersion(sphinx.__version__) < LooseVersion('1.4'):
-#     extensions.append('sphinx.ext.pngmath')
-# else:
-#     extensions.append('sphinx.ext.imgmath')
-
 autodoc_default_flags = ["members", "inherited-members"]
 
 # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set

From 88d9630329e2116dfe42550db8f1cb8398dbf699 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 27 Jul 2019 16:45:24 -0400
Subject: [PATCH 20/24] Remove unnecessary namedtuple from
 plot_kmedoids_digits.py

---
 examples/plot_kmedoids_digits.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py
index c74d9ab2..28c7659d 100644
--- a/examples/plot_kmedoids_digits.py
+++ b/examples/plot_kmedoids_digits.py
@@ -9,7 +9,6 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-from collections import namedtuple
 from sklearn.cluster import KMeans
 from sklearn_extra.cluster import KMedoids
 from sklearn.datasets import load_digits
@@ -47,21 +46,18 @@
     fontsize=14,
 )
 
-Algorithm = namedtuple("ClusterAlgorithm", ["model", "description"])
 
 selected_models = [
-    Algorithm(
+    (
         KMedoids(metric="manhattan", n_clusters=n_digits),
         "KMedoids (manhattan)",
     ),
-    Algorithm(
+    (
         KMedoids(metric="euclidean", n_clusters=n_digits),
         "KMedoids (euclidean)",
     ),
-    Algorithm(
-        KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)"
-    ),
-    Algorithm(KMeans(n_clusters=n_digits), "KMeans"),
+    (KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)"),
+    (KMeans(n_clusters=n_digits), "KMeans"),
 ]
 
 plot_rows = int(np.ceil(len(selected_models) / 2.0))

From 9405d98fdc34da6bfa58fde4f30c1ff5d03b366a Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 27 Jul 2019 17:39:13 -0400
Subject: [PATCH 21/24] Remove `x_squared_norms` from _kpp_init (copied over
 from kmeans)

---
 sklearn_extra/cluster/_k_medoids.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index def6560d..515d1602 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -357,9 +357,6 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
         n_clusters : integer
             The number of seeds to choose
 
-        x_squared_norms : array, shape (n_samples,)
-            Squared Euclidean norm of each data point.
-
         random_state : RandomState
             The generator used to initialize the centers.
 

From 0989f8853c4bee853b4c18634386b24483ab0128 Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 27 Jul 2019 17:44:26 -0400
Subject: [PATCH 22/24] Add comment for _kpp_init mentioning k_means_._k_init
 copypasta

---
 sklearn_extra/cluster/_k_medoids.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index 515d1602..ba160970 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -346,6 +346,7 @@ def _initialize_medoids(self, D, n_clusters, random_state_):
 
         return medoids
 
+    # Copied from sklearn.cluster.k_means_._k_init
     def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None):
         """Init n_clusters seeds with a method similar to k-means++
 

From d76d6b87059dd1476a1d58bf4a420690f8a621cf Mon Sep 17 00:00:00 2001
From: Zane Dufour <zane.dufour@gmail.com>
Date: Sat, 27 Jul 2019 21:28:41 -0400
Subject: [PATCH 23/24] update n_samples -> n_query, where appropriate

---
 sklearn_extra/cluster/_k_medoids.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py
index ba160970..298195d9 100644
--- a/sklearn_extra/cluster/_k_medoids.py
+++ b/sklearn_extra/cluster/_k_medoids.py
@@ -263,7 +263,7 @@ def transform(self, X):
 
         Returns
         -------
-        X_new : {array-like, sparse matrix}, shape=(n_samples, n_clusters)
+        X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters)
             X transformed in the new space of distances to cluster centers.
         """
         X = check_array(X, accept_sparse=["csr", "csc"])
@@ -288,7 +288,7 @@ def predict(self, X):
 
         Returns
         -------
-        labels : array, shape = (n_samples,)
+        labels : array, shape = (n_query,)
             Index of the cluster each sample belongs to.
         """
         X = check_array(X, accept_sparse=["csr", "csc"])

From c060b0e5c1fcbf80c0792ef4c4ce96fae5f36f6f Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@pm.me>
Date: Mon, 29 Jul 2019 14:12:11 +0200
Subject: [PATCH 24/24] Add sklearn_extra/cluster/tests/__init__.py

---
 sklearn_extra/cluster/tests/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 sklearn_extra/cluster/tests/__init__.py

diff --git a/sklearn_extra/cluster/tests/__init__.py b/sklearn_extra/cluster/tests/__init__.py
new file mode 100644
index 00000000..e69de29b