diff --git a/.gitignore b/.gitignore index 442f8c2a..7d0d0c2b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ __pycache__/ # C extensions *.so +# Text Editors +.vscode/ + # scikit-learn specific doc/_build/ doc/auto_examples/ @@ -17,6 +20,7 @@ doc/datasets/generated/ # Distribution / packaging .Python +venv/ env/ build/ develop-eggs/ diff --git a/benchmarks/bench_rbfsampler_fastfood.py b/benchmarks/bench_rbfsampler_fastfood.py index 42bea9b4..11f5df9b 100644 --- a/benchmarks/bench_rbfsampler_fastfood.py +++ b/benchmarks/bench_rbfsampler_fastfood.py @@ -15,9 +15,9 @@ Y /= Y.sum(axis=1)[:, np.newaxis] # calculate feature maps -gamma = 10. +gamma = 10.0 sigma = np.sqrt(1 / (2 * gamma)) -number_of_features_to_generate = 4096*4 +number_of_features_to_generate = 4096 * 4 exact_start = datetime.datetime.utcnow() # original rbf kernel method: @@ -27,23 +27,24 @@ exact_spent_time = exact_end - exact_start print("Timimg exact rbf: \t\t", exact_spent_time) -rbf_transform = Fastfood(sigma=sigma, - n_components=number_of_features_to_generate, - tradeoff_mem_accuracy='mem', - random_state=42) +rbf_transform = Fastfood( + sigma=sigma, + n_components=number_of_features_to_generate, + tradeoff_mem_accuracy="mem", + random_state=42, +) _ = rbf_transform.fit(X) fastfood_fast_vec_start = datetime.datetime.utcnow() # Fastfood: approximate kernel mapping _ = rbf_transform.transform(X) _ = rbf_transform.transform(Y) fastfood_fast_vec_end = datetime.datetime.utcnow() -fastfood_fast_vec_spent_time = fastfood_fast_vec_end - \ - fastfood_fast_vec_start +fastfood_fast_vec_spent_time = fastfood_fast_vec_end - fastfood_fast_vec_start print("Timimg fastfood fast vectorized: \t\t", fastfood_fast_vec_spent_time) -rks_rbf_transform = RBFSampler(gamma=gamma, - n_components=number_of_features_to_generate, - random_state=42) +rks_rbf_transform = RBFSampler( + gamma=gamma, n_components=number_of_features_to_generate, random_state=42 +) _ = rks_rbf_transform.fit(X) rks_start = datetime.datetime.utcnow() # Random Kitchens Sinks: approximate kernel mapping diff --git a/doc/api.rst b/doc/api.rst index e8de935e..fcb9b8a0 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -12,3 +12,13 @@ Kernel approximation :template: class.rst kernel_approximation.Fastfood + +Clustering +==================== + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + cluster.KMedoids + diff --git a/doc/conf.py b/doc/conf.py index eb7aadf6..c39936a0 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -21,61 +21,65 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', - 'numpydoc', - 'sphinx_gallery.gen_gallery', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "numpydoc", + "sphinx_gallery.gen_gallery", ] # this is needed for some reason... # see https://github.com/numpy/numpydoc/issues/69 numpydoc_show_class_members = False -# pngmath / imgmath compatibility layer for different sphinx versions -import sphinx -from distutils.version import LooseVersion -if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): - extensions.append('sphinx.ext.pngmath') -else: - extensions.append('sphinx.ext.imgmath') +autodoc_default_flags = ["members", "inherited-members"] -autodoc_default_flags = ['members', 'inherited-members'] +# For maths, use mathjax by default and svg if NO_MATHJAX env variable is set +# (useful for viewing the doc offline) +if os.environ.get("NO_MATHJAX"): + extensions.append("sphinx.ext.imgmath") + imgmath_image_format = "svg" +else: + extensions.append("sphinx.ext.mathjax") + mathjax_path = ( + "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/" + "MathJax.js?config=TeX-AMS_SVG" + ) # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # generate autosummary even if no references autosummary_generate = True # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # Generate the plots for the gallery plot_gallery = True # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'scikit-learn-extra' -copyright = u'2019, scikit-learn-extra developpers' +project = u"scikit-learn-extra" +copyright = u"2019, scikit-learn-extra developpers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -83,177 +87,181 @@ # # The short X.Y version. from sklearn_extra import __version__ + version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build', '_templates'] +exclude_patterns = ["_build", "_templates"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # Custom style -html_style = 'css/project-template.css' +html_style = "css/project-template.css" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'project-templatedoc' +htmlhelp_basename = "project-templatedoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'project-template.tex', u'project-template Documentation', - u'Vighnesh Birodkar', 'manual'), + ( + "index", + "project-template.tex", + u"project-template Documentation", + u"Vighnesh Birodkar", + "manual", + ) ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -261,12 +269,17 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'project-template', u'project-template Documentation', - [u'Vighnesh Birodkar'], 1) + ( + "index", + "project-template", + u"project-template Documentation", + [u"Vighnesh Birodkar"], + 1, + ) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -275,43 +288,51 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'project-template', u'project-template Documentation', - u'Vighnesh Birodkar', 'project-template', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "project-template", + u"project-template Documentation", + u"Vighnesh Birodkar", + "project-template", + "One line description of project.", + "Miscellaneous", + ) ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # Example configuration for intersphinx: refer to the Python standard library. # intersphinx configuration intersphinx_mapping = { - 'python': ('https://docs.python.org/{.major}'.format( - sys.version_info), None), - 'numpy': ('https://docs.scipy.org/doc/numpy/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), - 'matplotlib': ('https://matplotlib.org/', None), - 'sklearn': ('http://scikit-learn.org/stable', None) + "python": ( + "https://docs.python.org/{.major}".format(sys.version_info), + None, + ), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "matplotlib": ("https://matplotlib.org/", None), + "sklearn": ("http://scikit-learn.org/stable", None), } # sphinx-gallery configuration sphinx_gallery_conf = { - 'doc_module': 'sklearn_extra', - 'backreferences_dir': os.path.join('generated'), - 'reference_url': { - 'sklearn_extra': None} + "doc_module": "sklearn_extra", + "backreferences_dir": os.path.join("generated"), + "reference_url": {"sklearn_extra": None}, } + def setup(app): # a copy button to copy snippet of code from the documentation - app.add_javascript('js/copybutton.js') + app.add_javascript("js/copybutton.js") diff --git a/doc/user_guide.rst b/doc/user_guide.rst index a190e568..084e838b 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -6,4 +6,60 @@ User guide ========== -To add. +.. _k_medoids: + +K-Medoids +========= + +:class:`KMedoids` is related to the :class:`KMeans` algorithm. While +:class:`KMeans` tries to minimize the within cluster sum-of-squares, +:class:`KMedoids` tries to minimize the sum of distances between each point and +the medoid of its cluster. The medoid is a data point (unlike the centroid) +which has least total distance to the other members of its cluster. The use of +a data point to represent each cluster's center allows the use of any distance +metric for clustering. + +:class:`KMedoids` can be more robust to noise and outliers than :class:`KMeans` +as it will choose one of the cluster members as the medoid while +:class:`KMeans` will move the center of the cluster towards the outlier which +might in turn move other points away from the cluster centre. + +:class:`KMedoids` is also different from K-Medians, which is analogous to :class:`KMeans` +except that the Manhattan Median is used for each cluster center instead of +the centroid. K-Medians is robust to outliers, but it is limited to the +Manhattan Distance metric and, similar to :class:`KMeans`, it does not guarantee +that the center of each cluster will be a member of the original dataset. + +The complexity of K-Medoids is :math:`O(N^2 K T)` where :math:`N` is the number +of samples, :math:`T` is the number of iterations and :math:`K` is the number of +clusters. This makes it more suitable for smaller datasets in comparison to +:class:`KMeans` which is :math:`O(N K T)`. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_plot_kmedoids_digits.py`: Applying K-Medoids on digits + with various distance metrics. + + +**Algorithm description:** +There are several algorithms to compute K-Medoids, though :class:`KMedoids` +currently only supports Partitioning Around Medoids (PAM). The PAM algorithm +uses a greedy search, which may fail to find the global optimum. It consists of +two alternating steps commonly called the +Assignment and Update steps (BUILD and SWAP in Kaufmann and Rousseeuw, 1987). + +PAM works as follows: + +* Initialize: Select ``n_clusters`` from the dataset as the medoids using + a heuristic, random, or k-medoids++ approach (configurable using the ``init`` parameter). +* Assignment step: assign each element from the dataset to the closest medoid. +* Update step: Identify the new medoid of each cluster. +* Repeat the assignment and update step while the medoids keep changing or + maximum number of iterations ``max_iter`` is reached. + +.. topic:: References: + + * "Clustering by Means of Medoids'" + Kaufman, L. and Rousseeuw, P.J., + Statistical Data Analysis Based on the L1Norm and Related Methods, edited + by Y. Dodge, North-Holland, 405416. 1987 \ No newline at end of file diff --git a/examples/plot_kmedoids_digits.py b/examples/plot_kmedoids_digits.py new file mode 100644 index 00000000..28c7659d --- /dev/null +++ b/examples/plot_kmedoids_digits.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +============================================================= +A demo of K-Medoids clustering on the handwritten digits data +============================================================= +In this example we compare different pairwise distance +metrics for K-Medoids. +""" +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.cluster import KMeans +from sklearn_extra.cluster import KMedoids +from sklearn.datasets import load_digits +from sklearn.decomposition import PCA +from sklearn.preprocessing import scale + +print(__doc__) + +# Authors: Timo Erkkilä +# Antti Lehmussola +# Kornel Kiełczewski +# License: BSD 3 clause + +np.random.seed(42) + +digits = load_digits() +data = scale(digits.data) +n_digits = len(np.unique(digits.target)) + +reduced_data = PCA(n_components=2).fit_transform(data) + +# Step size of the mesh. Decrease to increase the quality of the VQ. +h = 0.02 # point in the mesh [x_min, m_max]x[y_min, y_max]. + +# Plot the decision boundary. For that, we will assign a color to each +x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1 +y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) + +plt.figure() +plt.clf() + +plt.suptitle( + "Comparing multiple K-Medoids metrics to K-Means and each other", + fontsize=14, +) + + +selected_models = [ + ( + KMedoids(metric="manhattan", n_clusters=n_digits), + "KMedoids (manhattan)", + ), + ( + KMedoids(metric="euclidean", n_clusters=n_digits), + "KMedoids (euclidean)", + ), + (KMedoids(metric="cosine", n_clusters=n_digits), "KMedoids (cosine)"), + (KMeans(n_clusters=n_digits), "KMeans"), +] + +plot_rows = int(np.ceil(len(selected_models) / 2.0)) +plot_cols = 2 + +for i, (model, description) in enumerate(selected_models): + + # Obtain labels for each point in mesh. Use last trained model. + model.fit(reduced_data) + Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + plt.subplot(plot_cols, plot_rows, i + 1) + plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + cmap=plt.cm.Paired, + aspect="auto", + origin="lower", + ) + + plt.plot( + reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2, alpha=0.3 + ) + # Plot the centroids as a white X + centroids = model.cluster_centers_ + plt.scatter( + centroids[:, 0], + centroids[:, 1], + marker="x", + s=169, + linewidths=3, + color="w", + zorder=10, + ) + plt.title(description) + plt.xlim(x_min, x_max) + plt.ylim(y_min, y_max) + plt.xticks(()) + plt.yticks(()) + +plt.show() diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py new file mode 100644 index 00000000..bbdaaf41 --- /dev/null +++ b/sklearn_extra/cluster/__init__.py @@ -0,0 +1,3 @@ +from ._k_medoids import KMedoids + +__all__ = ["KMedoids"] diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py new file mode 100644 index 00000000..298195d9 --- /dev/null +++ b/sklearn_extra/cluster/_k_medoids.py @@ -0,0 +1,431 @@ +# -*- coding: utf-8 -*- +"""K-medoids clustering""" + +# Authors: Timo Erkkilä +# Antti Lehmussola +# Kornel Kiełczewski +# Zane Dufour +# License: BSD 3 clause + +import warnings + +import numpy as np + +from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin +from sklearn.metrics.pairwise import ( + pairwise_distances, + pairwise_distances_argmin, +) +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import check_is_fitted +from sklearn.exceptions import ConvergenceWarning + + +class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): + """k-medoids clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of medoids to + generate. + + metric : string, or callable, optional, default: 'euclidean' + What distance metric to use. See :func:metrics.pairwise_distances + + init : {'random', 'heuristic', 'k-medoids++'}, optional, default: 'heuristic' + Specify medoid initialization method. 'random' selects n_clusters + elements from the dataset. 'heuristic' picks the n_clusters points + with the smallest sum distance to every other point. 'k-medoids++' + follows an approach based on k-means++_, and in general, gives initial + medoids which are more separated than those generated by the other methods. + + .. _k-means++: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf + + max_iter : int, optional, default : 300 + Specify the maximum number of iterations when fitting. + + random_state : int, RandomState instance or None, optional + Specify random state for the random number generator. Used to + initialise medoids when init='random'. + + Attributes + ---------- + cluster_centers_ : array, shape = (n_clusters, n_features) + or None if metric == 'precomputed' + Cluster centers, i.e. medoids (elements from the original dataset) + + medoid_indices_ : array, shape = (n_clusters,) + The indices of the medoid rows in X + + labels_ : array, shape = (n_samples,) + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + + Examples + -------- + >>> from sklearn_extra.cluster import KMedoids + >>> import numpy as np + + >>> X = np.asarray([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> kmedoids = KMedoids(n_clusters=2, random_state=0).fit(X) + >>> kmedoids.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> kmedoids.predict([[0,0], [4,4]]) + array([0, 1]) + >>> kmedoids.cluster_centers_ + array([[1, 2], + [4, 2]]) + >>> kmedoids.inertia_ + 8.0 + + See scikit-learn-extra/examples/plot_kmedoids_digits.py for examples + of KMedoids with various distance metrics. + + References + ---------- + Kaufman, L. and Rousseeuw, P.J., Statistical Data Analysis Based on + the L1–Norm and Related Methods, edited by Y. Dodge, North-Holland, + 405–416. 1987 + + See also + -------- + + KMeans + The KMeans algorithm minimizes the within-cluster sum-of-squares + criterion. It scales well to large number of samples. + + Notes + ----- + Since all pairwise distances are calculated and stored in memory for + the duration of fit, the space complexity is O(n_samples ** 2). + + """ + + def __init__( + self, + n_clusters=8, + metric="euclidean", + init="heuristic", + max_iter=300, + random_state=None, + ): + self.n_clusters = n_clusters + self.metric = metric + self.init = init + self.max_iter = max_iter + self.random_state = random_state + + def _check_nonnegative_int(self, value, desc): + """Validates if value is a valid integer > 0""" + + if ( + value is None + or value <= 0 + or not isinstance(value, (int, np.integer)) + ): + raise ValueError( + "%s should be a nonnegative integer. " + "%s was given" % (desc, value) + ) + + def _check_init_args(self): + """Validates the input arguments. """ + + # Check n_clusters and max_iter + self._check_nonnegative_int(self.n_clusters, "n_clusters") + self._check_nonnegative_int(self.max_iter, "max_iter") + + # Check init + init_methods = ["random", "heuristic", "k-medoids++"] + if self.init not in init_methods: + raise ValueError( + "init needs to be one of " + + "the following: " + + "%s" % init_methods + ) + + def fit(self, X, y=None): + """Fit K-Medoids to the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = (n_samples, n_features), \ + or (n_samples, n_samples) if metric == 'precomputed' + Dataset to cluster. + + y : Ignored + + Returns + ------- + self + """ + random_state_ = check_random_state(self.random_state) + + self._check_init_args() + X = check_array(X, accept_sparse=["csr", "csc"]) + if self.n_clusters > X.shape[0]: + raise ValueError( + "The number of medoids (%d) must be less " + "than the number of samples %d." + % (self.n_clusters, X.shape[0]) + ) + + D = pairwise_distances(X, metric=self.metric) + medoid_idxs = self._initialize_medoids( + D, self.n_clusters, random_state_ + ) + labels = None + + # Continue the algorithm as long as + # the medoids keep changing and the maximum number + # of iterations is not exceeded + for self.n_iter_ in range(0, self.max_iter): + old_medoid_idxs = np.copy(medoid_idxs) + labels = np.argmin(D[medoid_idxs, :], axis=0) + + # Update medoids with the new cluster indices + self._update_medoid_idxs_in_place(D, labels, medoid_idxs) + if np.all(old_medoid_idxs == medoid_idxs): + break + elif self.n_iter_ == self.max_iter - 1: + warnings.warn( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning, + ) + + # Set the resulting instance variables. + if self.metric == "precomputed": + self.cluster_centers_ = None + else: + self.cluster_centers_ = X[medoid_idxs] + + # Expose labels_ which are the assignments of + # the training data to clusters + self.labels_ = labels + self.medoid_indices_ = medoid_idxs + self.inertia_ = self._compute_inertia(self.transform(X)) + + # Return self to enable method chaining + return self + + def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): + """In-place update of the medoid indices""" + + # Update the medoids for each cluster + for k in range(self.n_clusters): + # Extract the distance matrix between the data points + # inside the cluster k + cluster_k_idxs = np.where(labels == k)[0] + + if len(cluster_k_idxs) == 0: + warnings.warn( + "Cluster {k} is empty! " + "self.labels_[self.medoid_indices_[{k}]] " + "may not be labeled with " + "its corresponding cluster ({k}).".format(k=k) + ) + continue + + in_cluster_distances = D[ + cluster_k_idxs, cluster_k_idxs[:, np.newaxis] + ] + + # Calculate all costs from each point to all others in the cluster + in_cluster_all_costs = np.sum(in_cluster_distances, axis=1) + + min_cost_idx = np.argmin(in_cluster_all_costs) + min_cost = in_cluster_all_costs[min_cost_idx] + curr_cost = in_cluster_all_costs[ + np.argmax(cluster_k_idxs == medoid_idxs[k]) + ] + + # Adopt a new medoid if its distance is smaller then the current + if min_cost < curr_cost: + medoid_idxs[k] = cluster_k_idxs[min_cost_idx] + + def transform(self, X): + """Transforms X to cluster-distance space. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Data to transform. + + Returns + ------- + X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) + X transformed in the new space of distances to cluster centers. + """ + X = check_array(X, accept_sparse=["csr", "csc"]) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return X[:, self.medoid_indices_] + else: + check_is_fitted(self, "cluster_centers_") + + Y = self.cluster_centers_ + return pairwise_distances(X, Y=Y, metric=self.metric) + + def predict(self, X): + """Predict the closest cluster for each sample in X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + New data to predict. + + Returns + ------- + labels : array, shape = (n_query,) + Index of the cluster each sample belongs to. + """ + X = check_array(X, accept_sparse=["csr", "csc"]) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return np.argmin(X[:, self.medoid_indices_], axis=1) + else: + check_is_fitted(self, "cluster_centers_") + + # Return data points to clusters based on which cluster assignment + # yields the smallest distance + return pairwise_distances_argmin( + X, Y=self.cluster_centers_, metric=self.metric + ) + + def _compute_inertia(self, distances): + """Compute inertia of new samples. Inertia is defined as the sum of the + sample distances to closest cluster centers. + + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + def _initialize_medoids(self, D, n_clusters, random_state_): + """Select initial mediods when beginning clustering.""" + + if self.init == "random": # Random initialization + # Pick random k medoids as the initial ones. + medoids = random_state_.choice(len(D), n_clusters) + elif self.init == "k-medoids++": + medoids = self._kpp_init(D, n_clusters, random_state_) + elif self.init == "heuristic": # Initialization by heuristic + # Pick K first data points that have the smallest sum distance + # to every other point. These are the initial medoids. + medoids = np.argpartition(np.sum(D, axis=1), n_clusters - 1)[ + :n_clusters + ] + else: + raise ValueError( + "init value '{init}' not recognized".format(init=self.init) + ) + + return medoids + + # Copied from sklearn.cluster.k_means_._k_init + def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): + """Init n_clusters seeds with a method similar to k-means++ + + Parameters + ----------- + D : array, shape (n_samples, n_samples) + The distance matrix we will use to select medoid indices. + + n_clusters : integer + The number of seeds to choose + + random_state : RandomState + The generator used to initialize the centers. + + n_local_trials : integer, optional + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Notes + ----- + Selects initial cluster centers for k-medoid clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, + which is the implementation used in the aforementioned paper. + """ + n_samples, _ = D.shape + + centers = np.empty(n_clusters, dtype=int) + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. + n_local_trials = 2 + int(np.log(n_clusters)) + + center_id = random_state_.randint(n_samples) + centers[0] = center_id + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = D[centers[0], :] ** 2 + current_pot = closest_dist_sq.sum() + + # pick the remaining n_clusters-1 points + for cluster_index in range(1, n_clusters): + rand_vals = ( + random_state_.random_sample(n_local_trials) * current_pot + ) + candidate_ids = np.searchsorted( + stable_cumsum(closest_dist_sq), rand_vals + ) + + # Compute distances to center candidates + distance_to_candidates = D[candidate_ids, :] ** 2 + + # Decide which candidate is the best + best_candidate = None + best_pot = None + best_dist_sq = None + for trial in range(n_local_trials): + # Compute potential when including center candidate + new_dist_sq = np.minimum( + closest_dist_sq, distance_to_candidates[trial] + ) + new_pot = new_dist_sq.sum() + + # Store result if it is the best local trial so far + if (best_candidate is None) or (new_pot < best_pot): + best_candidate = candidate_ids[trial] + best_pot = new_pot + best_dist_sq = new_dist_sq + + centers[cluster_index] = best_candidate + current_pot = best_pot + closest_dist_sq = best_dist_sq + + return centers diff --git a/sklearn_extra/cluster/tests/__init__.py b/sklearn_extra/cluster/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py new file mode 100644 index 00000000..0b125f36 --- /dev/null +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -0,0 +1,312 @@ +"""Testing for K-Medoids""" +import warnings +import numpy as np +from unittest import mock +from scipy.sparse import csc_matrix + +from sklearn.datasets import load_iris +from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils.testing import assert_array_equal, assert_equal +from sklearn.utils.testing import assert_raise_message, assert_warns_message +from sklearn.utils.testing import assert_allclose + +from sklearn_extra.cluster import KMedoids +from sklearn.cluster import KMeans + +seed = 0 +X = np.random.RandomState(seed).rand(100, 5) + + +def test_kmedoids_input_validation_and_fit_check(): + rng = np.random.RandomState(seed) + # Invalid parameters + assert_raise_message( + ValueError, + "n_clusters should be a nonnegative " "integer. 0 was given", + KMedoids(n_clusters=0).fit, + X, + ) + + assert_raise_message( + ValueError, + "n_clusters should be a nonnegative " "integer. None was given", + KMedoids(n_clusters=None).fit, + X, + ) + + assert_raise_message( + ValueError, + "max_iter should be a nonnegative " "integer. 0 was given", + KMedoids(n_clusters=1, max_iter=0).fit, + X, + ) + + assert_raise_message( + ValueError, + "max_iter should be a nonnegative " "integer. None was given", + KMedoids(n_clusters=1, max_iter=None).fit, + X, + ) + + assert_raise_message( + ValueError, + "init needs to be one of the following: " + "['random', 'heuristic', 'k-medoids++']", + KMedoids(init=None).fit, + X, + ) + + # Trying to fit 3 samples to 8 clusters + Xsmall = rng.rand(5, 2) + assert_raise_message( + ValueError, + "The number of medoids (8) must be less " + "than the number of samples 5.", + KMedoids(n_clusters=8).fit, + Xsmall, + ) + + +def test_random_deterministic(): + """Random_state should determine 'random' init output.""" + rng = np.random.RandomState(seed) + + X = load_iris()["data"] + D = euclidean_distances(X) + + medoids = KMedoids(init="random")._initialize_medoids(D, 4, rng) + assert_array_equal(medoids, [47, 117, 67, 103]) + + +def test_heuristic_deterministic(): + """Result of heuristic init method should not depend on rnadom state.""" + rng1 = np.random.RandomState(1) + rng2 = np.random.RandomState(2) + X = load_iris()["data"] + D = euclidean_distances(X) + + medoids_1 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng1) + + medoids_2 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng2) + + assert_array_equal(medoids_1, medoids_2) + + +def test_update_medoid_idxs_empty_cluster(): + """Label is unchanged for an empty cluster.""" + D = np.zeros((3, 3)) + labels = np.array([0, 0, 0]) + medoid_idxs = np.array([0, 1]) + kmedoids = KMedoids(n_clusters=2) + + # Swallow empty cluster warning + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + kmedoids._update_medoid_idxs_in_place(D, labels, medoid_idxs) + + assert_array_equal(medoid_idxs, [0, 1]) + + +def test_kmedoids_empty_clusters(): + """When a cluster is empty, it should throw a warning.""" + rng = np.random.RandomState(seed) + X = [[1], [1], [1]] + kmedoids = KMedoids(n_clusters=2, random_state=rng) + assert_warns_message(UserWarning, "Cluster 1 is empty!", kmedoids.fit, X) + + +@mock.patch.object(KMedoids, "_kpp_init", return_value=object()) +def test_kpp_called(_kpp_init_mocked): + """KMedoids._kpp_init method should be called by _initialize_medoids""" + D = np.array([[0, 1], [1, 0]]) + n_clusters = 2 + rng = np.random.RandomState(seed) + kmedoids = KMedoids() + kmedoids.init = "k-medoids++" + # set _kpp_init_mocked.return_value to a singleton + initial_medoids = kmedoids._initialize_medoids(D, n_clusters, rng) + + # assert that _kpp_init was called and its result was returned. + _kpp_init_mocked.assert_called_once_with(D, n_clusters, rng) + assert initial_medoids == _kpp_init_mocked.return_value + + +def test_kmedoids_pp(): + """Initial clusters should be well-separated for k-medoids++""" + rng = np.random.RandomState(seed) + kmedoids = KMedoids() + X = [ + [10, 0], + [11, 0], + [0, 10], + [0, 11], + [10, 10], + [11, 10], + [12, 10], + [10, 11], + ] + D = euclidean_distances(X) + + centers = kmedoids._kpp_init(D, n_clusters=3, random_state_=rng) + + assert len(centers) == 3 + + inter_medoid_distances = D[centers][:, centers] + assert np.all((inter_medoid_distances > 5) | (inter_medoid_distances == 0)) + + +def test_precomputed(): + """Test the 'precomputed' distance metric.""" + rng = np.random.RandomState(seed) + X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]] + D_1 = euclidean_distances(X_1) + X_2 = [[1.1, 0.0], [0.0, 0.9]] + D_2 = euclidean_distances(X_2, X_1) + + kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng) + kmedoids.fit(D_1) + + assert_allclose(kmedoids.inertia_, 0.2) + assert_array_equal(kmedoids.medoid_indices_, [2, 0]) + assert_array_equal(kmedoids.labels_, [1, 1, 0, 0]) + assert kmedoids.cluster_centers_ is None + + med_1, med_2 = tuple(kmedoids.medoid_indices_) + predictions = kmedoids.predict(D_2) + assert_array_equal(predictions, [med_1 // 2, med_2 // 2]) + + transformed = kmedoids.transform(D_2) + assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_]) + + +def test_kmedoids_fit_naive(): + n_clusters = 3 + metric = "euclidean" + + model = KMedoids(n_clusters=n_clusters, metric=metric) + Xnaive = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + model.fit(Xnaive) + + assert_array_equal( + model.cluster_centers_, [[1, 0, 0], [0, 1, 0], [0, 0, 1]] + ) + assert_array_equal(model.labels_, [0, 1, 2]) + assert model.inertia_ == 0.0 + + # diagonal must be zero, off-diagonals must be positive + X_new = model.transform(Xnaive) + for c in range(n_clusters): + assert X_new[c, c] == 0 + for c2 in range(n_clusters): + if c != c2: + assert X_new[c, c2] > 0 + + +def test_max_iter(): + """Test that warning message is thrown when max_iter is reached.""" + rng = np.random.RandomState(seed) + X_iris = load_iris()["data"] + + model = KMedoids( + n_clusters=10, init="random", random_state=rng, max_iter=1 + ) + assert_warns_message( + UserWarning, + "Maximum number of iteration reached before", + model.fit, + X_iris, + ) + + +def test_kmedoids_iris(): + """Test kmedoids on the Iris dataset""" + rng = np.random.RandomState(seed) + X_iris = load_iris()["data"] + + ref_model = KMeans(n_clusters=3).fit(X_iris) + + avg_dist_to_closest_centroid = ( + ref_model.transform(X_iris).min(axis=1).mean() + ) + + for init in ["random", "heuristic", "k-medoids++"]: + distance_metric = "euclidean" + model = KMedoids( + n_clusters=3, metric=distance_metric, init=init, random_state=rng + ) + model.fit(X_iris) + + # test convergence in reasonable number of steps + assert model.n_iter_ < (len(X_iris) // 10) + + distances = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](X_iris) + avg_dist_to_random_medoid = np.mean(distances.ravel()) + avg_dist_to_closest_medoid = model.inertia_ / X_iris.shape[0] + # We want distance-to-closest-medoid to be reduced from average + # distance by more than 50% + assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid + # When K-Medoids is using Euclidean distance, + # we can compare its performance to + # K-Means. We want the average distance to cluster centers + # to be similar between K-Means and K-Medoids + assert_allclose( + avg_dist_to_closest_medoid, avg_dist_to_closest_centroid, rtol=0.1 + ) + + +def test_kmedoids_fit_predict_transform(): + rng = np.random.RandomState(seed) + model = KMedoids(random_state=rng) + + labels1 = model.fit_predict(X) + assert_equal(len(labels1), 100) + assert_array_equal(labels1, model.labels_) + + labels2 = model.predict(X) + assert_array_equal(labels1, labels2) + + Xt1 = model.fit_transform(X) + assert_array_equal(Xt1.shape, (100, model.n_clusters)) + + Xt2 = model.transform(X) + assert_array_equal(Xt1, Xt2) + + +def test_callable_distance_metric(): + rng = np.random.RandomState(seed) + + def my_metric(a, b): + return np.sqrt(np.sum(np.power(a - b, 2))) + + model = KMedoids(random_state=rng, metric=my_metric) + labels1 = model.fit_predict(X) + assert_equal(len(labels1), 100) + assert_array_equal(labels1, model.labels_) + + +def test_outlier_robustness(): + rng = np.random.RandomState(seed) + kmeans = KMeans(n_clusters=2, random_state=rng) + kmedoids = KMedoids(n_clusters=2, random_state=rng) + + X = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0], [1000, 0]] + + kmeans.fit(X) + kmedoids.fit(X) + + assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1]) + assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1]) + + +def test_kmedoids_on_sparse_input(): + rng = np.random.RandomState(seed) + model = KMedoids(n_clusters=2, random_state=rng) + row = np.array([1, 0]) + col = np.array([0, 4]) + data = np.array([1, 1]) + X = csc_matrix((data, (row, col)), shape=(2, 5)) + labels = model.fit_predict(X) + assert_equal(len(labels), 2) + assert_array_equal(labels, model.labels_) diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index 3faa646e..cfcbf9d0 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -3,8 +3,9 @@ from sklearn.utils.estimator_checks import check_estimator from sklearn_extra.kernel_approximation import Fastfood +from sklearn_extra.cluster import KMedoids -@pytest.mark.parametrize("Estimator", [Fastfood]) +@pytest.mark.parametrize("Estimator", [Fastfood, KMedoids]) def test_all_estimators(Estimator, request): return check_estimator(Estimator)