From 12a1a552806c83a77cd803713b8b7ce8e8cb90b9 Mon Sep 17 00:00:00 2001 From: cozek Date: Sat, 28 Sep 2019 11:29:18 +0530 Subject: [PATCH 1/4] Added dbscan in two formats. A jupyter notebook file for the storytelling and a .py file for people that just want to look at the code. The code in both is essentially the same. With a few things different in the .py file for plotting the clusters. --- machine_learning/dbscan/dbscan.ipynb | 321 +++++++++++++++++++++++++++ machine_learning/dbscan/dbscan.py | 108 +++++++++ 2 files changed, 429 insertions(+) create mode 100644 machine_learning/dbscan/dbscan.ipynb create mode 100644 machine_learning/dbscan/dbscan.py diff --git a/machine_learning/dbscan/dbscan.ipynb b/machine_learning/dbscan/dbscan.ipynb new file mode 100644 index 000000000000..9f4e72720772 --- /dev/null +++ b/machine_learning/dbscan/dbscan.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DBSCAN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Requirements:__\n", + "\n", + "Sadly, we have some unavoidable requirements.\n", + "1. Matplotlib for visualization\n", + "2. Scikit-learn for grabbing some standard datasets to test on\n", + "3. Numpy\n", + "\n", + "`pip install matplotlib scikit-learn numpy`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from matplotlib.pyplot import cm\n", + "import numpy as np\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## First lets grab a dataset\n", + "We will take the moons dataset which is pretty good at showing the power of DBSCAN. 
\n", + "\n", + "Lets generate 200 random points in the shape of two moons" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_moons\n", + "\n", + "X, label = make_moons(n_samples=200, noise=0.1, random_state=19)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize the dataset using matplotlib\n", + "You will observe that the points are the the shape of two crescent moons. \n", + "\n", + "The challenge here is to cluster the two moons. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(X[:,0], X[:,1],'ro')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The implementation is inspired from the original DBScan algorithm as given in \n", + "DBSCAN Wikipedia\n", + "\n", + "### Abstract Algorithm\n", + "The DBSCAN algorithm can be abstracted into the following steps:\n", + "\n", + "- Find the points in the $ε$ (eps) neighborhood of every point, and identify the core points with more than min_pts neighbors.\n", + "- Find the connected components of core points on the neighbor graph, ignoring all non-core points.\n", + "- Assign each non-core point to a nearby cluster if the cluster is an $ε$ (eps) neighbor, otherwise assign it to noise.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the points\n", + "Initially we label all the points in the dataset as __undefined__ .\n", + "\n", + "__points__ is our database of all points in the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "points = { (point[0],point[1]):{'label':'undefined'} for point in X }\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def distFunc(Q, P):\n", + " '''\n", + " Calculates the Euclidean distance\n", + " between pointd P and Q\n", + " '''\n", + " a = pow((Q[0] - P[0]),2)\n", + " b = pow((Q[1] - P[1]),2)\n", + " return pow((a+b),0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def rangeQuery(DB,Q,eps):\n", + " '''\n", + " Finds all points in the DB that\n", + " are within a distance of eps from Q\n", + " '''\n", + " Neighbors = []\n", + " for P in DB:\n", + " if distFunc(Q,P) <= eps:\n", + " Neighbors.append(P)\n", + " return Neighbors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_cluster(DB,clusters):\n", + " '''\n", + " Extracts all the points in the DB and puts them together\n", + " as seperate clusters and finally plots them\n", + " '''\n", + " temp = []\n", + " noise=[]\n", + " for i in clusters:\n", + " stack = []\n", + " for k,v in DB.items():\n", + " if v['label'] == i:\n", + " stack.append(k)\n", + " elif v['label'] == 'noise':\n", + " noise.append(k)\n", + " temp.append(stack)\n", + " \n", + " my_clus = {}\n", + " \n", + " color=iter(cm.rainbow(np.linspace(0,1,len(clusters))))\n", + " for i in range(0,len(temp)):\n", + " c=next(color)\n", + " x = [ l[0] for l in temp[i]]\n", + " y = [ l[1] for l in temp[i]]\n", + " plt.plot(x,y,'ro',c = c)\n", + "\n", + "\n", + " x = [ l[0] for l in noise ]\n", + " y = [ l[1] for l in noise ]\n", + " plt.plot(x,y,'ro',c = '0')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 
Implementation of DBSCAN" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def DBSCAN(DB,eps,min_pts):\n", + " '''\n", + " Implementation of the DBScan algorithm\n", + " '''\n", + " clusters = []\n", + " C=0\n", + " for P in DB:\n", + " if DB[P]['label'] != 'undefined':\n", + " continue\n", + " neighbors = rangeQuery(DB,P,eps)\n", + " if len(neighbors) < min_pts:\n", + " DB[P]['label']='noise'\n", + " continue\n", + " C += 1\n", + " clusters.append(C)\n", + " DB[P]['label'] = C\n", + " neighbors.remove(P)\n", + " seed_set = neighbors.copy()\n", + " while seed_set!=[]:\n", + " Q = seed_set.pop(0)\n", + " if DB[Q]['label'] == 'noise':\n", + " DB[Q]['label'] = C\n", + " if DB[Q]['label'] != 'undefined':\n", + " continue\n", + " DB[Q]['label'] = C\n", + " neighbors_n = rangeQuery(DB,Q,eps)\n", + " if len(neighbors_n) >= min_pts:\n", + " seed_set = seed_set + neighbors_n \n", + " return DB,clusters\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lets run it!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD6CAYAAACs/ECRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO2de7Ac1X3nPz9dSVhabIyuiEMAXeGEPJwQv7Q4cVIpHGwHUxtkJ94KzjWRHbvuIsW7SaV2N6RUlcRkVYtx1RpSsSAqwhpbd40fix05MSEY2+s/EhxECpCBxcgEYalILCQvsRbWQuK3f3QP6ju3n9Pv6e+nampmTp/uPtPTfX7n/F7H3B0hhBDDZUXbDRBCCNEuEgRCCDFwJAiEEGLgSBAIIcTAkSAQQoiBI0EghBADpxJBYGa3mNl3zOwbCdvnzexBM9tnZn9rZq+ObHsiLL/fzPZW0R4hhBD5sSriCMzsF4BjwMfd/aditr8ReMTdv2tmbwP+yN3fEG57Atjk7k/nPd/69et948aNpdsthBBD4r777nva3c8aL19ZxcHd/WtmtjFl+99Gvt4DnFvmfBs3bmTvXk0ehBCiCGZ2IK68DRvB+4A7It8d+Bszu8/MFlpojxBCDJpKZgR5MbM3EQiCn48U/7y7HzKzHwDuMrP/7e5fi9l3AVgA2LBhQyPtFUKIIdDYjMDMfhq4Gdjs7kdG5e5+KHz/DvA54KK4/d19l7tvcvdNZ521TMUlhBBiQhoRBGa2AbgduNLdvxkp/1dm9tLRZ+CtQKznkRBCiHqoRDVkZp8ELgbWm9lB4A+BVQDufhPwB8AssNPMAE64+ybgFcDnwrKVwP9w97+uok1CCCHyUZXX0Lsytr8feH9M+ePAq5fvIUT17FuEu7fDM0/CGRvgkh1w4XzbrRKifRo1FgvRFvsW4QsL8PyzwfdnDgTfQcJACKWYEIPg7u2nhMCI558NyoUYOhIEYhA882SxciGGhASBqJ19i3D9RvjgiuB932LzbTgjIfQkqVyIISFBIGplpJt/5gDgp3TzRYVBWWFyyQ5YtXZp2aq1QbkQQ0eCQNRKFbr5KoTJhfPwy7vgjDnAgvdf3iVDsRAgryFRM1Xo5tOESZGO/MJ5dfxCxKEZgaiVKnTzMvQKUS8SBKJWqtDNy9ArRL1IEIhaqUI3L0OvEPUiG4EA6k2/UFY3P9q3S+khlK5CTBOVLFXZNJs2bXKtUFYd4+kXIBhxd9Wrpu1OuG/XS4gRZnZfmPBzCVINiV6lX4hzJb393XDd+noC1eLiF/p0vYTIg1RDoldeOXGdMMBzR6pPIpeUqC7u/NDN6yVEHjQjEL3yyknrbKselSeN/G0mvn4Xr5cQeZAgEJV65dSdVyirs61yVJ50LD8pLyYxXUgQiMrSL0yaCqKI8IgTWlHWrCvW5jQSZ0pzSlchpgt5DYnSvOjFcyB++xlz8DtPJO9b1ANn3yJ87jfAX1i+bc0s/Oenc7Q1h8eRvIPEtFGr15CZ3WJm3zGz2IXnLeBPzGy/mT1oZq+LbNtiZo+Fry1VtEc0x5JZQAJp6ppJPHAunIek8ctzR3O2NceMRYnqxFCoymvoY8CfAh9P2P424ILw9QbgRuANZraOYKH7TYAD95nZHnf/bkXtEjWT5MUTJU2vn+axlDZ6P2NDvPBJO9ckyeuUqE4MgUpmBO7+NSBlLMZm4OMecA/wcjM7G/gl4C53Pxp2/ncBl1bRJtEMWcbZLCNqUse9Zl366H0SA3ef3GSFaJKmjMXnAN+OfD8YliWVi56QNgLPo0pJ6tAhXWU0idomsa3e3sppQnSB3gSUmdkCsACwYYMctrvCJTvKGVST8gj
dfmV8/ejovajaJq6tLx73QPUBaUL0haYEwSHgvMj3c8OyQ8DFY+VfjTuAu+8CdkHgNVRHI0VxqkgIF9ehJ3khlQnaWtLWmGM//2wggG5/d/B9zSy87QYJBjH9NKUa2gP8Rug99DPAM+7+FHAn8FYzO9PMzgTeGpYNki4s8j4JF84H7qF/+ELwXkXHWVfq6VFbsYQKkSHGc0fgL36zP/+DEJNSyYzAzD5JMLJfb2YHCTyBVgG4+03AF4HLgP3As8B7w21HzeyPgXvDQ13j7mlG56klKa8NDHNEWnfq6SSvo3FOHi++JKYQfUMBZR3h+o0JqpCUYKy+0Xb66PG2pCWQW4IFsx0h+k5SQFlvjMXTzrS7NnZlxhMVRmvWwco1QRCarQhyCMWhZHJi2lGuoY7Qpwygk9CFHP7jkcXPHYETz8Gmq+AlL4/fZ2Z1MbtEX+08YthIEHSEJtbljeuk0jquKju1Lsx4koTR3hsDoTDOmlnYfEv+GcukSfeEaBuphjpCUeNoUX17nGrm8+8Fs8AgOiobqWugWlXOJCkhqqaI0JnENjNJCgshuoAEQYfIGyA1ib49rpN64fnl9aLqmio7taTAsyZz+Of1FILJZipdmPUIMQlSDfWQSfTtRTqjZ56svlPrQibPIkJnkpnKtNt5xPQiQdBDJumki3RGZ2xI79QmtR3UEXhWhAvnA71/FpPOVJqw8whRBxIEPWSSkWdcJ7ViVeAVE2XUcSV1ahdc1m+D6NtuWP67ZlaHAqLkTKULsx4hJkE2gh4yib49yRgdVxbtuMa39d0gWnfEstYvEH1EgqCHTNqZjXdScZ5HEEY5Pxm//u80GETVWQuxFKWYGChxKRZmVgdLQMZ5E0Ew61i5Jt7nfppSYdRNl1JtiGFR65rFIh9dijqNU/GcPJ4sBOBUfRlEJ2eSoLMu3TdiOpEgaIgmo07zdByTqnKeOzosg2jV0dhFXX//aluwRkJfjfOiH0g11BBNZReNU/nErRiW1J4shqQCyqs+G11fyL72H1zBkjUPXiQmw+m+xXCltpj6Q/ofRHVINdQyTRlZk0acn9uydBQZ5x46szpwKU1iaCqgvOqz0Yg+z2i/iOvv3duJFxr0yzgvuo8EQUM0FXWa1EH4yaUqhTif9823wNv/+6myNbPV+NdH6ZO+u45o7CJBZ1UFCAqRhdxHG6KpXDtp+XTG/f2T3ChHZSPvlucqWjOuK2sS5KVIbqJRx5yVWK+I62/a+Y8fC4SpvI5EFWhG0BBNRZ3GjTij5B3l1mHc7sKaBEXIqz7LisaeVNgn/ZcrVoYuvDIei4qoas3iS4EbgBngZne/dmz7R4A3hV/XAj/g7i8Pt50E9oXbnnT3y6toUxdpIpBpdPzPbYlfcSuvSqGOCOK+BaNVFY09HsSXNCuK2/eXdy0tO35seRxHnyK7RTcp7TVkZjPAN4G3AAcJFqJ/l7s/nFD/3wOvdfffDL8fc/fTi5yzj15DVZInICmv91ASRbxb8jKEdZmzSLoGa2aD1dKy/q86/hcxHOr0GroI2O/uj7v7ceA2YHNK/XcBn6zgvJ2nDsNoXpVNWVVUHcZtZedMnv08dySf2kyprkUdVCEIzgG+Hfl+MCxbhpnNAecDX44Uv8TM9prZPWb29qSTmNlCWG/v4cOHK2h2vdQVQFZEz14m7XMdnXbT2Tm76KFUtMMeFxwSpqIOmvYaugL4rPsS7fWcux8ys1cCXzazfe7+rfEd3X0XsAsC1VAzzS1GVGVjK5br6KvQ5TalZ68rS2dTCd+66qGU5D2WmMNpTHDUnT1VDJMqBMEh4LzI93PDsjiuAH4rWuDuh8L3x83sq8BrgWWCoOuMdzxxhloo32E3ufZvn7N0djVddpoBOq97cZ//F9FNqhAE9wIXmNn5BALgCuDXxyuZ2Y8DZwJ/Fyk7E3jW3b9vZuuBnwOuq6BNjRPX8cRRtsPuwtq/faDLHkppHblG+qINSgsCdz9hZh8A7iRwH73
F3R8ys2uAve6+J6x6BXCbL3VT+gngz8zsBQJ7xbVJ3kZdJ08HU0WHLdVAPpqcOcWRN9V0FSmpldZalEVJ5yoiyS3QZsBfSHfzrOshHnIHUdZ9tolzV9HGNn+n6B9KOlczSd4c77g12Wsny7OojNdLk2mvu0ib6wfn9eyqItI6zzG66D0luoVyDVXEJCqbrIe4jNdLV42lTdKWUTWvfaIKO0bWMbrqPSW6hQRBhRTteNIe4rIdeZeNpdNOXvtEFXaMrGNoQCDyINVQS+xbDGIN4jhjQ/mOXBGo7ZE36KuK4LCsY2hAIPIgQVABRXWwo+l6XKzB6CEu25ErArU98tonqrBjZB1DAwKRB3kNlWQSr400D6N33BrsV5VHyVC9hkSAvIpElCSvIdkISlJUB7tvMXmxEX9h6aIxo+NP2pErAnX6yRL2ijsReZAgKEkRHexodJZEXF4ZPbAiibweQbqPRBayEZSk6GLkSWkopL8XRclyP1b8gMiLBEFJqlqMXDpbEUdaZ542Gx16QKEohgRBSYp4fiTOHuYkBF5kcRE2boQVK4L3xeH2XFmdedpstG/rQ4t2kSCogLwLwMilM4PFRVhYgAMHwD14X1gYrDDI6sxjF7c3uOAyxQ+IYkgQNEib+W96wfbt8OxYz/fss0H5AMnqzC+ch1dvASyy0eGBW2HNuvh9i8QPyMbQMC3OhuU11DDy4EjhyYSeL6l8ysmTguKxL7JsMfvnnw1WPFu1dvJ1K5SjqGFGs+HRQGg0GwaYr/+Ca0ZQExpN5WB8BLQuYRi7YZhhsHlUiUmzhueOlpt9ysbQMBmz4W3btrFy5UrMjJUrV7Jt27ZKTy9BUAPy2MhBnD3ge9+DVauW1lu7FnbsWL7vAAzKeVSJaQbjvLarOGRjaJiU2fC2bdu48cYbOXkyyElz8uRJbrzxxkqFgVJMlCQusvPu7QlT+rnggRQEHfiBmIs0Owunnx48GBs2BEIgOjUen0JDICx27WpkCt016kohkZQGRfdwTSQ9D3NzrDx48EUhEGVmZoYTJ04UOk2tC9OY2aVm9qiZ7Tezq2O2v8fMDpvZ/eHr/ZFtW8zssfC1pYr2NEXSyD8phYRGUxGSRkBHj8ITT8ALLwTv4527DMpLqMsBQR5uDbNjRzCgiRLOhuOEAJBYPgmljcVmNgN8FHgLcBC418z2xKw9/Cl3/8DYvuuAPwQ2EZi87gv3/W7ZdjVBkh7VZuIziyrjY4QNG+JHQFn2ABmUl1HWASEtX5FyFDXEaMCzffuy2fDMli2JM4KqqGJGcBGw390fd/fjwG3A5pz7/hJwl7sfDTv/u4BLK2hTIySN8P2kRlOZpIyAUkkSFAM1KJclzZ5VxMYg54gKmJ+PnQ0vLMQnKEsqn4QqBME5wLcj3w+GZeP8qpk9aGafNbPzCu7bSdIihRUvkMH8fKDXn5sDs+A9j55/UgEiYqnCO0jOEfWyc+dOtm7d+uIMYGZmhq1bt7Jz587KztGU19AXgI3u/tMEo/5bix7AzBbMbK+Z7T18+HDlDRxRZGSTpkcdjaZ+5RNB+e1XaqS0jIQRUOY+kwgQEUsV3kFyNa2fnTt3cuLECdydEydOVCoEoBpBcAg4L/L93LDsRdz9iLt/P/x6M/D6vPtGjrHL3Te5+6azzjqrgmYvp+jIJstQp5FSTUwiQEQsVaxgJlfT/lOFILgXuMDMzjez1cAVwJ5oBTM7O/L1cuCR8POdwFvN7EwzOxN4a1jWClWPbIoeb7B61oHEBXSRKryDtBxmTjp8n5f2GnL3E2b2AYIOfAa4xd0fMrNrgL3uvgf4D2Z2OXACOAq8J9z3qJn9MYEwAbjG3Y+WbdOkFB3ZZIXhT7JozeBC+lsOre87ZZcjrcI76JId8bEMco6I0PH7XAFlEYoG0WTVL3K8wQbwpATS8MQTTbemV3RpPWKtj51BR+7zWgPKpoWi0+SsEX8Vi9ZMvZ5VcQET0yUjbZl0FoOg4/e5BEGEolG
aWal+K1m0Ztr1rIoLmJjBDh76SMfvc6WhHiNvlOa+RTj+veXlK1YtHfHnPd5g9aw7dsTnDlJcQCZ50lSLjtDx+1wzggySPHnu3g4njy+vf9rLJpsWD3bRGsUFTIzyAfWIjt/nMhankGaMu/1Kli0IAoAFelIhmkBGWlEEGYsnIM0YN1idvugUZY20k8SuKN6le3EAZZEgSCHNGKdpueg7k0S+DzZaPm4hpYWFqREGEgQpZK3+NEidfleZ4tFaXUziftoll9VGmfJ1MCQIUsga9ct3ugYm6dCnfLRWBXHqnEncTwfrslo2DqDjAxUJghQ06m+YSTv0KR+tleWvtgXODVF1zu1XLh/kjEizcw3WNjZpHMDiIqxfD+9+d6cHKhIEGWjU3yCTdugdj9psk32LsPcmlnu4OTz/f4O4lyhZdq64WTLA8WNTbifIsw7G+Kh/27agwz9yZPnxOjZQkSAQ3SFvhz7+wK1LCPHuSNRmm9y9nXg355DTXlZsxjuaJa+ZXVr+3JEpNxpnxQHEzWZvumn5wCZKhwYqiiMQzbG4GLsm64skJeY6/XSYnQ32W7cO/uVf4PnnT21ftSp4OI9HIvzWru1UwE5bfHAFqYJg0riXwSZJTCLp3k2jhcSKiiOYkMH6TFdNHv3/jh2wevXyfY8dO7XfkSNLhQAE31/60s5GbbZJlu5+Ut3+YI3GSRQd3XcovQRIEKQyWJ/pOsij/5+fDzr0STh6VKuWxZCk04dycS+DNRonkaSGNFteNjvbuYGKBEEKg/WZroO8+v+jE65LJHtALEs83wAL1j9/0R4Ak814FVA5RpIx+aqrls5Ud++Gp5/ulBAAZR9NRdPfCtmwIV6HOt6BJ9VLo2PT7K6RlAG3zKp4VaxsNlWMOvY0G1iHkbE4BRnEKmR8qT6IN+jG1Rtn9epAhXT0aO8euC6h+3t41GosNrNLzexRM9tvZlfHbP9dM3vYzB40s7vNbC6y7aSZ3R++9ozv2yaa/lZI3jS8cfW2bl36/ZZbgum17AGl0Ix3QjoeJTwJpWcEZjYDfBN4C3CQYCH6d7n7w5E6bwK+7u7PmtlW4GJ3/7Vw2zF3P73IOZt0H1WaXzGtaEYwAXlnth2lzhnBRcB+d3/c3Y8DtwGboxXc/SvuPrpy9wDnVnDeRlBksZhWNOOdgKLR7z2ZPVQhCM4Bvh35fjAsS+J9wB2R7y8xs71mdo+Zvb2C9nQSxSOIrqFcWhNQJJ1Jj5IhNuo+ambvBjYBH44Uz4VTlV8HrjezH07YdyEUGHsPHz5cWxvr6LAVj1AxPRll9QHNeAuwuBjcc3HEuS/3KBliFYLgEHBe5Pu5YdkSzOzNwHbgcnf//qjc3Q+F748DXwVeG3cSd9/l7pvcfdNZZ51VQbOXE9dhf/69cN36coJB8QgVkmeUJUEhqmZ03508uXzbyH15/L5LcoPuUI6hEVUIgnuBC8zsfDNbDVwBLPH+MbPXAn9GIAS+Eyk/08xOCz+vB34OeJiWiOuwX3g+SKhVZiQv74wKyRpl9Wg63hZSU05A3H0HMDMTGIph+X0XF1UMnQx+LC0I3P0E8AHgTuAR4NPu/pCZXWNml4fVPgycDnxmzE30J4C9ZvYA8BXg2qi3UdPk6ZjjRvJZD5bC8SskS0fbo+l4G0hNOSFJ990LLwTeQnH3XZxH5urVnQx+rMRG4O5fdPcfdfcfdvcdYdkfuPue8POb3f0V7v6a8HV5WP637n6hu786fP/zKtozKXk75qjAyPNgyTujQrIWCEl6YA8c0KwAqSknZtL7bpyOBvAq11CEtARdUaICI8+DJe+MCslaICRt2i0VkdSUk7C4GGTAHSfvfRfl+ec7OTuVIIgw3mGvmYWZmKzI0dWY8j5Y8s6oiKwI5ThBMUIqIqkpizKyOY2vMjaeQTTtvhung8Zi5RpKYd8i3PHbobF4jFVrA6Fx93ZFZ3aOxcVgjdg4zAK97gCIi4qHpYnm4NS9rMFJDEneP3G
LyowvvHTsWPwylS0sSDNCC9MUZKT7jxMCcEr9I/1/B5mfDx62ODrosVEHSbYrkJqyEEUCyObnl66JccMN2escdwQJggTidP/jjGYCerA6SJ7FxqeYJNvV57bA7VcG33/lE1JTZpJlJE4jb6LFDqD1CBLIazz7wkLQ8UsN1DF6nh++LEn3r4fxUEXWHhg0O3bEJ5nLO6CYn+/FPacZQQJ5jWdyvesw41P1HjyQVZHn/tW9m4MejerLIEGQQF5XUpDrnegeee9f3bs5GMCAQoIggTjf/zWz8XXXrGu0aUJkMrp/R2sUJyG3UQESBKmM+/6/7QZYsWp5vePfU4i+6B4XzoOneMrKu02MkCAowIXzcNrLlpefPC5dq+gmSSN+m5F3mziFBEFBnjsaXy5dq+giSXEu77hVQkCcQoKgIArR7yhagyAW5blqmW3bYOXKwONo5crgewdRiomCjCI2FaLfIeIWFIcgH8wNN0yll4foAdu2wY03Li/fuhV27my+PSSnmJAgmIC4HC4SAi2SthrU2rVT6fddB7qvK2blyvgVzWZm4MSJ5tuDBIFu8mlmxYr0PO8tJvnqC5rp1kDSCmXQ2roEg046p1WZppysvC8dTPvbNbRgTQ3MJARxJJW3yCAEgW7yKSDNGJyVC34gGUfLoAVramBhoVh5i1QiCMzsUjN71Mz2m9nVMdtPM7NPhdu/bmYbI9t+Pyx/1Mx+qYr2jKObvOdkLUg/ygczGxP6PaCMo2WQN1wN7NwZGIZHM4CZmVYNxWmUFgRmNgN8FHgb8CrgXWb2qrFq7wO+6+4/AnwE+FC476uAK4CfBC4FdobHqxTd5D0nz4L08/Pw9NOwe/fUJwirA62rURM7dwaGYffgvYNCAKqZEVwE7Hf3x939OHAbsHmszmbg1vDzZ4FLzMzC8tvc/fvu/o/A/vB4laKbvOeUWRxEQiAXReIN9i3C9RvhgyuCd9naIvQ0nqWK9QjOAb4d+X4QeENSHXc/YWbPALNh+T1j+54TdxIzWwAWADYU1PmObmZ5DfWUDRvi3UOl+6+UC+ezn4lx7yKtaxBhPJ5lpMKEzg9IemMsdvdd7r7J3TedddZZhffX4vE9ZtLVxno6OquSqkfvcrwg+b7Ko8LsKFXMCA4B50W+nxuWxdU5aGYrgTOAIzn3bQ3FHnSESVYb6/HorCrqGL0P3vEi7b4qosLsGFXMCO4FLjCz881sNYHxd89YnT3AlvDzO4EvexDJtge4IvQqOh+4APj7CtpUGsUedIyo7n/HjkAopI30ezw6q4o6Ru+DcbyYZNRfZn3jliktCNz9BPAB4E7gEeDT7v6QmV1jZpeH1f4cmDWz/cDvAleH+z4EfBp4GPhr4LfcPSYmu3k0Be4oWa6kI3o8OquKOkbvg3C8SLvH0u6rSVWYHWAwKSaK8sEVQNylscDOIFoiKa/QeBqJvPWmmOs3hjPaMc6YC+xkkzLVKtPFRdiyJT5H0Nxc8J52Xy0uFlNhNsygU0xMwmCmwH0j70i/x6Ozqqhr9D61jhejmUCcEIB8o/6q3JcbdnSQIEhgEFPgPpJXDzuKNh5wcJnWIihInP4/yoYN2fdVFR14XvVnlbh7716vf/3rvQke3O3+kTn3P7Lg/cHdjZxWpLF7t/vate7BIxK81q4NytP2mZtzNwve0+qK4WK29L6Kvkbb0u6fSe7NOObm4tswN1fyB7oDez2mT229U5/k1ZQgEB1l1LGD+8xM+gNa1cMppp+kDnj8lXT/VNWBJwkks9I/MUkQSDUk+sf8/Cld7UifmzR9LuJGqgC0YTL63w8cWL6GQNyaAkn3T1Weai24oUoQiH6S1sFHO/SklcvGH8429LKifaL/OwT//ajzn5tLXkAmrnMv2oEnDTzacHSImyZ0/SXV0MCI0/Gn6XPHVUF5pus16mVFh8n634vcF0XUkFl1a7JrIRuB6CVJD8zsbPwDOrIZFNXx1qiXFR0m638vamPK24G3NPBIEgR
SDYluk6QCgvjpc5IPOKS7kfY4PYAoQdb/XtQNOW8cQcci3yUIRLdJejCOHo1/QEfRn+PMzaU/nApAGyZ5/vc61rjo2MBDgkB0m7QHJu4BnbRDn58PUgtElxXcsmVQAWiDJM+Ivw5vsq4NPOL0RV1/yUYwIJoKIFO8gYijzvuihUBHZCwWvaWJB0ZeQ92iK9HgU3ZfJAkCqYZE92liHeI0450CzZqlSzEdHTPq1oUEgRCQbItYt647ndJQaGNRoSRh3zGjbl1IEAgBycY7GPxKZ43T9Cg8bQbSNaNuTUgQCAHJ3iNHj8bX76tqoG41VxXHb3oUnjYDGUo68zjDQd4XsA64C3gsfD8zps5rgL8DHgIeBH4tsu1jwD8C94ev1+Q5r4zFIpOqjI3TZCys2zOqquM37cE1oKhy6vAaAq4Drg4/Xw18KKbOjwIXhJ9/CHgKeLmfEgTvLHpeCQKRSpUdyTS5ldYt1Ko8fpNeQ9Mk7DOoSxA8Cpwdfj4beDTHPg9EBIMEgaieqh/srrgylqXukW9fR9Z15RPqIEmCoKyN4BXu/lT4+Z+AV6RVNrOLgNXAtyLFO8zsQTP7iJmdVrI9QlRvbGzCfbUJ6ta999XDpogdoEuurRWSKQjM7Etm9o2Y1+ZovVDaeMpxzgY+AbzX3V8Ii38f+HHgXxPYG34vZf8FM9trZnsPHz6c/cvEcGmyQ+pTjEHdHjB99rDJK+zbcG1tgrhpQt4XOVVDwMuAfyBFDQRcDPxlnvNKNSRSaUqv30f7Qd1qjR6rTXLRV/VXCDXZCD7MUmPxdTF1VgN3A78Ts20kRAy4Hrg2z3klCEQmbaalGNkjpq0TbJKuCpSeG5brEgSzYSf/GPAlYF1Yvgm4Ofz8buB5TrmIvugmCnwZ2Ad8A9gNnJ7nvBIEohOkrZLWh9lBV+nyTKvLbctBLYKgrZcEgchNnSPLtBlBnSPFro6Wq6Lro+4eX38JAjE82gigqlt33PMRaS6a0sP3uEOflCRBYMG2frFp0ybfu3dv280QXWfjxsC9b5y5ucAzpAoWF4MFbJKWyKzyXNDMb2qbpv63hYWlHkBr105n+ogIZnafu28aL1euITG9NJG8bH4+cDlMomrXySGkRa7CDTXLrXda3UAnRIJATC9NxRMkHW92drLRZVon1tegrSKUTfSWJ+hrCAK1CHH6oq6/ZCMQuehaPEEenXTWsabFRtCGET9qbO66QbomkLFYDJKmDIJZ52kH/WcAAArwSURBVMnbgefpoNo2cpY9fx3CLNqmPIb7aRGoBZEgEKJN8o5A0zqyLni4VNGB1pEUMMt7K+744wJt69ZuXOMaSRIE8hoSoglWrAi6o3HMlhqbkzxmzJbu35aHSxUePXmvRdk2Rcm6XgPxIpLXkBDjNJkwLsmYu2LF0vPGecyMCwFoz8MlyZia1RFHqdrgnWbgzWtsHrgXkQSBGCZ5PEuqFBRxHTwE8QfR88Z5zCTN2tvwcEnqrM3Sr0/0Wh47BqtWLd1eJktpUpvm5pZmE037P4fuRRSnL+r6SzYCUZosPXUZXXiSMXX3bveZmeL68S55uOzenWzHSGpP3LVcvdp9drYafXye/yqrTpeucY0gY7EQEbLSGEzaMWR1OJOkT2jbw2VcsKV55cQJwSY62TRPpjwCuO1r3BASBEJEyeqcJs13k3XcMgKmiEdLVS6mcR1k0rWZnY3vTNMER91keRSNu5QO1Guo9U59kpcEgShNXaqCLAHSxMizynMkXYfx37l2bSAI8tStU+0y3pkntanONnQYCQIhxslSJ0zSmXYhIKwqVczu3dmdaPQ3ZK3PEH2tWtVMhHfaawpVP1lIEAhRlEk67C7omqtQxWR1qnFCJc/6DFE1UtUUOf/MzOCEgHuyIJD7qBBJ5F3QfHyfMgnTyrK4GJw3jiJ++nF+9SOSXD2TXGTjOHo0f1vyktfVc+1auPXWqQoUK4sEgRBVM4kASaNIPMP27cGYdxyzYn76aZ3
qmjVw5ZXL2xInBGdn449RR7bUtCywbQnmvhA3Tcj7AtYBdxGsWXwXcGZCvZOcWq94T6T8fODrwH7gU8DqPOeVakgMhqKqpjQ9/eh4edRdRYzEaSqWumMIss41QDtAGtS0eP11wNXh56uBDyXUO5ZQ/mngivDzTcDWPOeVIBC9ooxxuKjhN61+kY6yqNto3t8/OxsYiuvqrAfgAlqGugTBo8DZ4eezgUcT6i0TBIABTwMrw+8/C9yZ57wSBKI3lB2lFo1nSDtfUaGSN5AM8v+egUTwdpUkQVAq+6iZ/R93f3n42YDvjr6P1TsRqoVOANe6++fNbD1wj7v/SFjnPOAOd/+prPMq+6joDWWzdU6y/+JiYCt48slAb75jR6ATL5v1My3LZ97fU3XmUVGIibOPmtmXzOwbMa/N0XqhtEmSKnPhyX8duN7MfniCH7BgZnvNbO/hw4eL7i5EO5RNZlZ0/d4kIQDls36mGZvz/p4hLLXZR+KmCXlf5FQNje3zMeCdSDUkhkAVqpBxHXuSobWJZS6TInXz/p6mIqtlJ4iFmmwEH2apsfi6mDpnAqeFn9cTeBi9Kvz+GZYai7flOa8EgegNVXZ8VaTF6MIyk1F7xSgZXFUdtjyHUqlLEMwCd4ed+5eAdWH5JuDm8PMbgX3AA+H7+yL7vxL4ewL30c+MBEbWS4JA9IoinW9a3boS5dX5e9KOUUeHLWN0KkmCQEtVCtEVtm2Dm25aakyNLpeYZWitYhnJpqirrTJGp6KlKoXoMouLy4UALF0uMcvQWtSw3CZ1rQgmY/RESBAI0QWSUkPAqc4xq6NvO89REerqsPskDDuEBIEQXSBtJDzqHPN09FXnOaqLujrsPgnDDiFBIEQXSFsUPto59qWjjxKXNK/ODruP16hlVrbdACEEQWe/sLA09bMZXHVVvzuyxcWlv+vAgeA7BL+rz79titCMQIguEDdC/sQnYOfOtltWjrh1DaIGcNEJ5D4qhKgPuXN2CrmPCiGaR+6cvUCCQAhRH3Ln7AUSBEKI+pA7Zy+QIBCiCoqsKzw05M7ZeeQ+KkRZslwkheg4mhEIURa5SIqeI0EgRFnqSqAmRENIEAhRFrlIip4jQSBEWeQiKXqOBIEQZZGLpOg58hoSogqUQE30mFIzAjNbZ2Z3mdlj4fuZMXXeZGb3R17/z8zeHm77mJn9Y2Tba8q0RwghRHHKqoauBu529wsIFrG/eryCu3/F3V/j7q8BfhF4FvibSJX/NNru7veXbI8QQoiClBUEm4Fbw8+3Am/PqP9O4A53fzajnhBCiIYoKwhe4e5PhZ//CXhFRv0rgE+Ole0wswfN7CNmdlrJ9gghhChIprHYzL4E/GDMpiVhk+7uZpa4uIGZnQ1cCNwZKf59AgGyGtgF/B5wTcL+C8ACwAb5ZwshRGWUWpjGzB4FLnb3p8KO/qvu/mMJdX8b+El3X0jYfjHwH9393+Q472HgwMQNL8d64OmWzl2GvrYb+tt2tbt5+tr2pto95+5njReWdR/dA2wBrg3f/yKl7rsIZgAvYmZnh0LECOwL38hz0rgf0hRmtjduhZ+u09d2Q3/brnY3T1/b3na7y9oIrgXeYmaPAW8Ov2Nmm8zs5lElM9sInAf8r7H9F81sH7CPQCL+l5LtEUIIUZBSMwJ3PwJcElO+F3h/5PsTwDkx9X6xzPmFEEKURykmirOr7QZMSF/bDf1tu9rdPH1te6vtLmUsFkII0X80IxBCiIEjQZCBmf1bM3vIzF4ws0SrvpldamaPmtl+M1uWaqNp8uSBCuudjOR62tN0O8faknoNzew0M/tUuP3roRNC6+Ro93vM7HDkOr8/7jhNY2a3mNl3zCzWW88C/iT8XQ+a2euabmMcOdp9sZk9E7nef9B0G+Mws/PM7Ctm9nDYp/x2TJ12rrm765XyAn4C+DHgq8CmhDozwLeAVxIExz0AvKrldl8HXB1+vhr4UEK9Y21f47zXENgG3BR+vgL4VE/
a/R7gT9tua0zbfwF4HfCNhO2XAXcABvwM8PW225yz3RcDf9l2O2PadTbwuvDzS4FvxtwrrVxzzQgycPdH3P3RjGoXAfvd/XF3Pw7cRpCHqU2K5oFqmzzXMPqbPgtcEsagtEkX//tcuPvXgKMpVTYDH/eAe4CXh4GjrZKj3Z3E3Z9y938IP38PeITl3pStXHMJgmo4B/h25PtBYtxlGyZvHqiXmNleM7tnlB68JfJcwxfruPsJ4BlgtpHWJZP3v//VcKr/WTM7r5mmlaaL93VeftbMHjCzO8zsJ9tuzDihWvO1wNfHNrVyzbUwDen5lNw9LVq6VSrKAzXn7ofM7JXAl81sn7t/q+q2DpwvAJ909++b2b8jmNUohqY+/oHgvj5mZpcBnwcuaLlNL2JmpwP/E/gdd/+XttsDEgQAuPubSx7iEEHk9Ihzw7JaSWu3mf1zJIXH2cB3Eo5xKHx/3My+SjBKaUMQ5LmGozoHzWwlcAZwpJnmJZLZbg8CL0fcTGC/6QOt3NdliXau7v5FM9tpZuvdvfUcRGa2ikAILLr77TFVWrnmUg1Vw73ABWZ2vpmtJjBktuqBw6k8UJCQB8rMzhyl/jaz9cDPAQ831sKl5LmG0d/0TuDLHlrYWiSz3WM63ssJdMN9YA/wG6Eny88Az0TUjZ3FzH5wZDsys4sI+rm2BwyEbfpz4BF3/28J1dq55m1b0rv+At5BoKf7PvDPwJ1h+Q8BX4zUu4zAC+BbBCqltts9S7Bq3GPAl4B1Yfkm4Obw8xsJ8jw9EL6/r+U2L7uGBGnJLw8/vwT4DLAf+HvglW1f55zt/q/AQ+F1/grw4223OWzXJ4GngOfDe/x9wFXAVeF2Az4a/q59JHjNdbDdH4hc73uAN7bd5rBdPw848CBwf/i6rAvXXJHFQggxcKQaEkKIgSNBIIQQA0eCQAghBo4EgRBCDBwJAiGEGDgSBEIIMXAkCIQQYuBIEAghxMD5/1SENh4utwCVAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "eps = 0.25\n", + "min_pts = 12\n", + "\n", + "DB,clusters = DBSCAN(points,eps,min_pts)\n", + "\n", + "plot_cluster(DB,clusters)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### I encourage you to try with different datasets and playing with the values of eps and min_pts" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/machine_learning/dbscan/dbscan.py b/machine_learning/dbscan/dbscan.py new file mode 100644 index 000000000000..8b5251f3bd74 --- /dev/null +++ b/machine_learning/dbscan/dbscan.py @@ -0,0 +1,108 @@ +import matplotlib.pyplot as plt +from matplotlib.pyplot import cm +import numpy as np +from sklearn.datasets import make_moons + + +def distFunc(Q, P): + """ + Calculates the Euclidean distance + between pointd P and Q + """ + a = pow((Q[0] - P[0]), 2) + b = pow((Q[1] - P[1]), 2) + return pow((a + b), 0.5) + + +def rangeQuery(DB, Q, eps): + """ + Finds all points in the DB that + are within a distance of eps from Q + """ + Neighbors = [] + for P in DB: + if distFunc(Q, P) <= eps: + Neighbors.append(P) + return Neighbors + + +def plot_cluster(DB, clusters, ax): + """ + Extracts all the points in the DB and puts them together + as seperate clusters and finally plots them + """ + temp = [] + noise = [] + for i in clusters: + stack = [] + for k, v in DB.items(): + if v["label"] == i: + stack.append(k) + elif v["label"] == "noise": + noise.append(k) + temp.append(stack) + + my_clus = {} + + color = iter(cm.rainbow(np.linspace(0, 
1, len(clusters)))) + for i in range(0, len(temp)): + c = next(color) + x = [l[0] for l in temp[i]] + y = [l[1] for l in temp[i]] + ax.plot(x, y, "ro", c=c) + + x = [l[0] for l in noise] + y = [l[1] for l in noise] + ax.plot(x, y, "ro", c="0") + + +def DBSCAN(DB, eps, min_pts): + """ + Implementation of the DBSCAN algorithm + """ + clusters = [] + C = 0 + for P in DB: + if DB[P]["label"] != "undefined": + continue + neighbors = rangeQuery(DB, P, eps) + if len(neighbors) < min_pts: + DB[P]["label"] = "noise" + continue + C += 1 + clusters.append(C) + DB[P]["label"] = C + neighbors.remove(P) # remove itself + seed_set = neighbors.copy() + while seed_set != []: + Q = seed_set.pop(0) + if DB[Q]["label"] == "noise": + DB[Q]["label"] = C + if DB[Q]["label"] != "undefined": + continue + DB[Q]["label"] = C + neighbors_n = rangeQuery(DB, Q, eps) + if len(neighbors_n) >= min_pts: + seed_set = seed_set + neighbors_n ## seed_set U neighbors_n + return DB, clusters + + +if __name__ == "__main__": + + fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + + X, label = make_moons(n_samples=200, noise=0.1, random_state=19) + + axes[0].plot(X[:, 0], X[:, 1], "ro") + + points = {(point[0], point[1]): {"label": "undefined"} for point in X} + + eps = 0.25 + + min_pts = 12 + + DB, clusters = DBSCAN(points, eps, min_pts) + + plot_cluster(DB, clusters, axes[1]) + + plt.show() From 55d72147e22ac88aa7be3de36358f49a5b9ea452 Mon Sep 17 00:00:00 2001 From: cozek Date: Sat, 28 Sep 2019 12:36:13 +0530 Subject: [PATCH 2/4] fixed LGTM problems --- machine_learning/dbscan/dbscan.ipynb | 5 +---- machine_learning/dbscan/dbscan.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/machine_learning/dbscan/dbscan.ipynb b/machine_learning/dbscan/dbscan.ipynb index 9f4e72720772..b7b6d5e98c7e 100644 --- a/machine_learning/dbscan/dbscan.ipynb +++ b/machine_learning/dbscan/dbscan.ipynb @@ -28,7 +28,6 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "from 
matplotlib.pyplot import cm\n", "import numpy as np\n", "\n", "%matplotlib inline" @@ -196,9 +195,7 @@ " noise.append(k)\n", " temp.append(stack)\n", " \n", - " my_clus = {}\n", - " \n", - " color=iter(cm.rainbow(np.linspace(0,1,len(clusters))))\n", + " color=iter(plt.cm.rainbow(np.linspace(0,1,len(clusters))))\n", " for i in range(0,len(temp)):\n", " c=next(color)\n", " x = [ l[0] for l in temp[i]]\n", diff --git a/machine_learning/dbscan/dbscan.py b/machine_learning/dbscan/dbscan.py index 8b5251f3bd74..dda5600402e9 100644 --- a/machine_learning/dbscan/dbscan.py +++ b/machine_learning/dbscan/dbscan.py @@ -1,5 +1,4 @@ import matplotlib.pyplot as plt -from matplotlib.pyplot import cm import numpy as np from sklearn.datasets import make_moons @@ -42,9 +41,7 @@ def plot_cluster(DB, clusters, ax): noise.append(k) temp.append(stack) - my_clus = {} - - color = iter(cm.rainbow(np.linspace(0, 1, len(clusters)))) + color = iter(plt.cm.rainbow(np.linspace(0, 1, len(clusters)))) for i in range(0, len(temp)): c = next(color) x = [l[0] for l in temp[i]] From b5130d95156314ec23ead3232f08e1be3c0aad8a Mon Sep 17 00:00:00 2001 From: cozek Date: Sat, 28 Sep 2019 21:47:36 +0530 Subject: [PATCH 3/4] Some requested changes implemented. 
Still need to do docstring --- machine_learning/dbscan/dbscan.ipynb | 137 ++++++++++++++++----------- machine_learning/dbscan/dbscan.py | 24 ++--- 2 files changed, 94 insertions(+), 67 deletions(-) diff --git a/machine_learning/dbscan/dbscan.ipynb b/machine_learning/dbscan/dbscan.ipynb index b7b6d5e98c7e..8de3a67280ea 100644 --- a/machine_learning/dbscan/dbscan.ipynb +++ b/machine_learning/dbscan/dbscan.ipynb @@ -4,21 +4,37 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## DBSCAN" + "## DBSCAN\n", + "This implementation and notebook are inspired by the original DBSCAN algorithm and article as given in \n", + "[DBSCAN Wikipedia](https://en.wikipedia.org/wiki/DBSCAN).\n", + "\n", + "DBSCAN stands for __Density-based spatial clustering of applications with noise__. \n", + "\n", + "DBSCAN is a clustering algorithm that works by finding regions that are densely packed together, i.e., the points that have many close neighbours. It tries to capture the intuition that if two points belong to the same cluster they should be close to one another, as you will soon see.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "__Requirements:__\n", + "### When to use ?\n", "\n", - "Sadly, we have some unavoidable requirements.\n", - "1. Matplotlib for visualization\n", - "2. Scikit-learn for grabbing some standard datasets to test on\n", - "3. Numpy\n", + "1. You need a robust clustering algorithm.\n", + "2. You don't know how many clusters there are in the dataset\n", + "3. You find it difficult to guess the number of clusters there are just by eyeballing the dataset.\n", + "4. The clusters are of arbitrary shapes.\n", + "5. You want to detect outliers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Why DBSCAN ? 
\n", + "\n", + "This algorithm is way better than other clustering algorithms such as [k-means](https://en.wikipedia.org/wiki/K-means_clustering) whose only job is to make circular blobs. It is smart enough to figure out the number of clusters in the dataset on its own, unlike k-means where you need to specify 'k'. It can also find clusters of arbitrary shapes, not just circular blobs. It's too robust to be affected by outliers (the noise points) and isn't fooled by them, unlike k-means where the entire centroid gets pulled thanks to pesky outliers. Plus, you can fine-tune its parameters depending on what you are clustering.\n", + "\n", + "#### Have a look at these [neat animations](https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/) of DBSCAN to see for yourself." ] }, { @@ -59,7 +75,7 @@ "metadata": {}, "source": [ "## Visualize the dataset using matplotlib\n", - "You will observe that the points are the the shape of two crescent moons. \n", + "You will observe that the points are in the shape of two crescent moons. \n", "\n", "The challenge here is to cluster the two moons. 
" ] @@ -74,7 +90,7 @@ { "data": { "text/plain": [ - "[]" + "[]" ] }, "execution_count": 3, @@ -102,10 +118,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The implementation is inspired from the original DBScan algorithm as given in \n", - "DBSCAN Wikipedia\n", - "\n", - "### Abstract Algorithm\n", + "### Abstract of the Algorithm\n", "The DBSCAN algorithm can be abstracted into the following steps:\n", "\n", "- Find the points in the $ε$ (eps) neighborhood of every point, and identify the core points with more than min_pts neighbors.\n", @@ -129,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "points = { (point[0],point[1]):{'label':'undefined'} for point in X }\n" + "points = { (point[0],point[1]):{'label':'undefined'} for point in X }" ] }, { @@ -145,14 +158,14 @@ "metadata": {}, "outputs": [], "source": [ - "def distFunc(Q, P):\n", - " '''\n", + "def euclidean_distance(Q, P):\n", + " \"\"\"\n", " Calculates the Euclidean distance\n", - " between pointd P and Q\n", - " '''\n", - " a = pow((Q[0] - P[0]),2)\n", - " b = pow((Q[1] - P[1]),2)\n", - " return pow((a+b),0.5)" + " between points P and Q\n", + " \"\"\"\n", + " a = pow((Q[0] - P[0]), 2)\n", + " b = pow((Q[1] - P[1]), 2)\n", + " return pow((a + b), 0.5)" ] }, { @@ -161,16 +174,12 @@ "metadata": {}, "outputs": [], "source": [ - "def rangeQuery(DB,Q,eps):\n", - " '''\n", + "def find_neighbors(DB, Q, eps):\n", + " \"\"\"\n", " Finds all points in the DB that\n", " are within a distance of eps from Q\n", - " '''\n", - " Neighbors = []\n", - " for P in DB:\n", - " if distFunc(Q,P) <= eps:\n", - " Neighbors.append(P)\n", - " return Neighbors\n" + " \"\"\"\n", + " return [P for P in DB if euclidean_distance(Q, P) <= eps]" ] }, { @@ -179,40 +188,40 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_cluster(DB,clusters):\n", - " '''\n", + "def plot_cluster(DB, clusters):\n", + " \"\"\"\n", " Extracts all the points in the DB and puts them together\n", " as seperate clusters and finally 
plots them\n", - " '''\n", + " \"\"\"\n", " temp = []\n", - " noise=[]\n", + " noise = []\n", " for i in clusters:\n", " stack = []\n", - " for k,v in DB.items():\n", - " if v['label'] == i:\n", + " for k, v in DB.items():\n", + " if v[\"label\"] == i:\n", " stack.append(k)\n", - " elif v['label'] == 'noise':\n", + " elif v[\"label\"] == \"noise\":\n", " noise.append(k)\n", " temp.append(stack)\n", - " \n", - " color=iter(plt.cm.rainbow(np.linspace(0,1,len(clusters))))\n", - " for i in range(0,len(temp)):\n", - " c=next(color)\n", - " x = [ l[0] for l in temp[i]]\n", - " y = [ l[1] for l in temp[i]]\n", - " plt.plot(x,y,'ro',c = c)\n", "\n", + " color = iter(plt.cm.rainbow(np.linspace(0, 1, len(clusters))))\n", + " for i in range(0, len(temp)):\n", + " c = next(color)\n", + " x = [l[0] for l in temp[i]]\n", + " y = [l[1] for l in temp[i]]\n", + " plt.plot(x, y, \"ro\", c=c)\n", "\n", - " x = [ l[0] for l in noise ]\n", - " y = [ l[1] for l in noise ]\n", - " plt.plot(x,y,'ro',c = '0')" + " x = [l[0] for l in noise]\n", + " y = [l[1] for l in noise]\n", + " plt.plot(x, y, \"ro\", c=\"0\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Implementation of DBSCAN" + "# Implementation of DBSCAN\n", + "todo: Explain the code" ] }, { @@ -221,16 +230,16 @@ "metadata": {}, "outputs": [], "source": [ - "def DBSCAN(DB,eps,min_pts):\n", + "def dbscan(DB,eps,min_pts):\n", " '''\n", - " Implementation of the DBScan algorithm\n", + " Implementation of the DBSCAN algorithm\n", " '''\n", " clusters = []\n", " C=0\n", " for P in DB:\n", " if DB[P]['label'] != 'undefined':\n", " continue\n", - " neighbors = rangeQuery(DB,P,eps)\n", + " neighbors = find_neighbors(DB,P,eps)\n", " if len(neighbors) < min_pts:\n", " DB[P]['label']='noise'\n", " continue\n", @@ -246,7 +255,7 @@ " if DB[Q]['label'] != 'undefined':\n", " continue\n", " DB[Q]['label'] = C\n", - " neighbors_n = rangeQuery(DB,Q,eps)\n", + " neighbors_n = find_neighbors(DB,Q,eps)\n", " if len(neighbors_n) >= 
min_pts:\n", " seed_set = seed_set + neighbors_n \n", " return DB,clusters\n" @@ -281,7 +290,7 @@ "eps = 0.25\n", "min_pts = 12\n", "\n", - "DB,clusters = DBSCAN(points,eps,min_pts)\n", + "DB,clusters = dbscan(points,eps,min_pts)\n", "\n", "plot_cluster(DB,clusters)" ] @@ -290,8 +299,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### I encourage you to try with different datasets and playing with the values of eps and min_pts" + "I encourage you to try with different datasets and playing with the values of eps and min_pts.\n", + "\n", + "Also, try kmeans on this dataset and see how it compares to dbscan. " ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I hope by now you are convinced about about how cool dbscan is. But it has its pitfalls.\n", + "### When NOT to use ?\n", + "\n", + "1. You have a high dimentional dataset. Euclidean distance will fail thanks to '[curse of dimentionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality#Distance_functions)'.\n", + "2. We have used a dict to store the points. So we can't do anything about the order in which the points will be processed. So its not entirely deterministic.\n", + "3. Won't work well if there are large differences in density. Finding the min_pts and $ε$ combination is gonna be a big pain in the neck.\n", + "4. Choosing the $ε$ without understanding the data and its scale, might result is poor clustering performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/machine_learning/dbscan/dbscan.py b/machine_learning/dbscan/dbscan.py index dda5600402e9..1c7d8f7bd259 100644 --- a/machine_learning/dbscan/dbscan.py +++ b/machine_learning/dbscan/dbscan.py @@ -3,26 +3,22 @@ from sklearn.datasets import make_moons -def distFunc(Q, P): +def euclidean_distance(Q, P): """ Calculates the Euclidean distance - between pointd P and Q + between points P and Q """ a = pow((Q[0] - P[0]), 2) b = pow((Q[1] - P[1]), 2) return pow((a + b), 0.5) -def rangeQuery(DB, Q, eps): +def find_neighbors(DB, Q, eps): """ Finds all points in the DB that are within a distance of eps from Q """ - Neighbors = [] - for P in DB: - if distFunc(Q, P) <= eps: - Neighbors.append(P) - return Neighbors + return [P for P in DB if euclidean_distance(Q, P) <= eps] def plot_cluster(DB, clusters, ax): @@ -53,7 +49,7 @@ def plot_cluster(DB, clusters, ax): ax.plot(x, y, "ro", c="0") -def DBSCAN(DB, eps, min_pts): +def dbscan(DB, eps, min_pts): """ Implementation of the DBSCAN algorithm """ @@ -62,14 +58,14 @@ def DBSCAN(DB, eps, min_pts): for P in DB: if DB[P]["label"] != "undefined": continue - neighbors = rangeQuery(DB, P, eps) + neighbors = find_neighbors(DB, P, eps) if len(neighbors) < min_pts: DB[P]["label"] = "noise" continue C += 1 clusters.append(C) DB[P]["label"] = C - neighbors.remove(P) # remove itself + neighbors.remove(P) seed_set = neighbors.copy() while seed_set != []: Q = seed_set.pop(0) @@ -78,9 +74,9 @@ def DBSCAN(DB, eps, min_pts): if DB[Q]["label"] != "undefined": continue DB[Q]["label"] = C - neighbors_n = rangeQuery(DB, Q, eps) + neighbors_n = find_neighbors(DB, Q, eps) if len(neighbors_n) >= min_pts: - seed_set = seed_set + neighbors_n ## seed_set U neighbors_n + seed_set = seed_set + neighbors_n return DB, clusters @@ -98,7 +94,7 @@ def DBSCAN(DB, eps, min_pts): min_pts = 12 - DB, clusters = DBSCAN(points, 
eps, min_pts) + DB, clusters = dbscan(points, eps, min_pts) plot_cluster(DB, clusters, axes[1]) From 878ce881e557d8bdf44be53e65cc91a037a3e1c5 Mon Sep 17 00:00:00 2001 From: cozek Date: Sun, 29 Sep 2019 14:03:56 +0530 Subject: [PATCH 4/4] implememted all changes as requested --- machine_learning/dbscan/dbscan.ipynb | 103 +++++++----- machine_learning/dbscan/dbscan.py | 234 +++++++++++++++++++++++---- 2 files changed, 267 insertions(+), 70 deletions(-) diff --git a/machine_learning/dbscan/dbscan.ipynb b/machine_learning/dbscan/dbscan.ipynb index 8de3a67280ea..603a4cd405b9 100644 --- a/machine_learning/dbscan/dbscan.ipynb +++ b/machine_learning/dbscan/dbscan.ipynb @@ -10,7 +10,7 @@ "\n", "Stands for __Density-based spatial clustering of applications with noise__ . \n", "\n", - "DBSCAN is clustering algorithm works by finding regions that are densely packed together, i.e, the points that have many close neighbours. It tries to captures the intuition that if two points belong to the same cluster they should be close to one another, as you will soon see.\n" + "DBSCAN is a clustering algorithm that tries to capture the intuition that if two points belong to the same cluster they should be close to one another. It does so by finding regions that are densely packed together, i.e., the points that have many close neighbours.\n" ] }, { @@ -23,7 +23,7 @@ "2. You don't know how many clusters there are in the dataset\n", "3. You find it difficult to guess the number of clusters there are just by eyeballing the dataset.\n", "4. The clusters are of arbitrary shapes.\n", - "5. You want to detect outliers." + "5. You want to detect outliers/noise." ] }, { @@ -32,7 +32,7 @@ "source": [ "### Why DBSCAN ? \n", "\n", - "This algorithm is way better than other clustering algorithms such as [k-means](https://en.wikipedia.org/wiki/K-means_clustering) whose only job is to make circular blobs. 
It is smart enough to figure out the number of clusters in the dataset on its own, unlike k-means where you need to specify 'k'. It can also find clusters of arbitrary shapes, not just circular blobs. Its too robust to be affected by outliers (the noise points) and isn't fooled by them, unlike k-means where the entire centroid get pulled thanks to pesky outliers. Plus, you can fine-tune its parameters depending on what you are clustering.\n", + "This algorithm is way better than other clustering algorithms such as [k-means](https://en.wikipedia.org/wiki/K-means_clustering) whose only job is to find circular blobs. It is smart enough to figure out the number of clusters in the dataset on its own, unlike k-means where you need to specify 'k'. It can also find clusters of arbitrary shapes, not just circular blobs. It's too robust to be affected by outliers (the noise points) and isn't fooled by them, unlike k-means where the entire centroid gets pulled thanks to pesky outliers. Plus, you can fine-tune its parameters depending on what you are clustering.\n", "\n", "#### Have a look at these [neat animations](https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/) of DBSCAN to see for yourself." 
] @@ -67,7 +67,7 @@ "source": [ "from sklearn.datasets import make_moons\n", "\n", - "X, label = make_moons(n_samples=200, noise=0.1, random_state=19)" + "x, label = make_moons(n_samples=200, noise=0.1, random_state=19)" ] }, { @@ -90,7 +90,7 @@ { "data": { "text/plain": [ - "[]" + "[]" ] }, "execution_count": 3, @@ -111,7 +111,7 @@ } ], "source": [ - "plt.plot(X[:,0], X[:,1],'ro')" + "plt.plot(x[:,0], x[:,1],'ro')" ] }, { @@ -142,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "points = { (point[0],point[1]):{'label':'undefined'} for point in X }" + "points = { (point[0],point[1]):{'label':'undefined'} for point in x }" ] }, { @@ -158,13 +158,13 @@ "metadata": {}, "outputs": [], "source": [ - "def euclidean_distance(Q, P):\n", + "def euclidean_distance(q, p):\n", " \"\"\"\n", " Calculates the Euclidean distance\n", " between points P and Q\n", " \"\"\"\n", - " a = pow((Q[0] - P[0]), 2)\n", - " b = pow((Q[1] - P[1]), 2)\n", + " a = pow((q[0] - p[0]), 2)\n", + " b = pow((q[1] - p[1]), 2)\n", " return pow((a + b), 0.5)" ] }, @@ -174,12 +174,12 @@ "metadata": {}, "outputs": [], "source": [ - "def find_neighbors(DB, Q, eps):\n", + "def find_neighbors(db, q, eps):\n", " \"\"\"\n", " Finds all points in the DB that\n", " are within a distance of eps from Q\n", " \"\"\"\n", - " return [P for P in DB if euclidean_distance(Q, P) <= eps]" + " return [p for p in db if euclidean_distance(q, p) <= eps]" ] }, { @@ -188,7 +188,7 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_cluster(DB, clusters):\n", + "def plot_cluster(db, clusters):\n", " \"\"\"\n", " Extracts all the points in the DB and puts them together\n", " as seperate clusters and finally plots them\n", @@ -197,7 +197,7 @@ " noise = []\n", " for i in clusters:\n", " stack = []\n", - " for k, v in DB.items():\n", + " for k, v in db.items():\n", " if v[\"label\"] == i:\n", " stack.append(k)\n", " elif v[\"label\"] == \"noise\":\n", @@ -221,7 +221,34 @@ "metadata": {}, "source": [ "# Implementation of 
DBSCAN\n", - "todo: Explain the code" + "\n", + "Initialize an empty list, clusters = $[ ]$ and cluster identifier, c = 0\n", + "\n", + "1. For each point p in our database/dict db :\n", + "\n", + " 1.1. Check if p is already labelled. If it's already labelled (meaning it has already been associated with a cluster), continue to the next point, i.e., go to step 1\n", + " \n", + " 1.2. Find the list of neighbors of p, i.e., points that are within a distance of eps from p\n", + " \n", + " 1.3. If p does not have at least min_pts neighbours, we label it as noise and go back to step 1\n", + " \n", + " 1.4. Initialize the cluster, by incrementing c by 1\n", + " \n", + " 1.5. Append the cluster identifier c to clusters\n", + " \n", + " 1.6. Label p with the cluster identifier c\n", + " \n", + " 1.7. Remove p from the list of neighbors (p will be detected as its own neighbor because it is within eps of itself)\n", + " \n", + " 1.8. Initialize the seed_set as a copy of neighbors\n", + " \n", + " 1.9. While the seed_set is not empty:\n", + " 1.9.1. Remove the first point from seed_set and call it q\n", + " 1.9.2. If its label is noise, label it with c\n", + " 1.9.3. If q is already labelled, go back to step 1.9\n", + " 1.9.4. Label q with c\n", + " 1.9.5. Find the neighbours of q \n", + " 1.9.6. 
If there are at least min_pts neighbors, append them to the seed_set" ] }, { @@ -230,35 +257,35 @@ "metadata": {}, "outputs": [], "source": [ - "def dbscan(DB,eps,min_pts):\n", + "def dbscan(db,eps,min_pts):\n", " '''\n", " Implementation of the DBSCAN algorithm\n", " '''\n", " clusters = []\n", - " C=0\n", - " for P in DB:\n", - " if DB[P]['label'] != 'undefined':\n", + " c = 0\n", + " for p in db:\n", + " if db[p][\"label\"] != \"undefined\":\n", " continue\n", - " neighbors = find_neighbors(DB,P,eps)\n", + " neighbors = find_neighbors(db, p, eps)\n", " if len(neighbors) < min_pts:\n", - " DB[P]['label']='noise'\n", + " db[p][\"label\"] = \"noise\"\n", " continue\n", - " C += 1\n", - " clusters.append(C)\n", - " DB[P]['label'] = C\n", - " neighbors.remove(P)\n", + " c += 1\n", + " clusters.append(c)\n", + " db[p][\"label\"] = c\n", + " neighbors.remove(p)\n", " seed_set = neighbors.copy()\n", - " while seed_set!=[]:\n", - " Q = seed_set.pop(0)\n", - " if DB[Q]['label'] == 'noise':\n", - " DB[Q]['label'] = C\n", - " if DB[Q]['label'] != 'undefined':\n", + " while seed_set != []:\n", + " q = seed_set.pop(0)\n", + " if db[q][\"label\"] == \"noise\":\n", + " db[q][\"label\"] = c\n", + " if db[q][\"label\"] != \"undefined\":\n", " continue\n", - " DB[Q]['label'] = C\n", - " neighbors_n = find_neighbors(DB,Q,eps)\n", + " db[q][\"label\"] = c\n", + " neighbors_n = find_neighbors(db, q, eps)\n", " if len(neighbors_n) >= min_pts:\n", - " seed_set = seed_set + neighbors_n \n", - " return DB,clusters\n" + " seed_set = seed_set + neighbors_n\n", + " return db, clusters\n" ] }, { @@ -290,9 +317,9 @@ "eps = 0.25\n", "min_pts = 12\n", "\n", - "DB,clusters = dbscan(points,eps,min_pts)\n", + "db,clusters = dbscan(points,eps,min_pts)\n", "\n", - "plot_cluster(DB,clusters)" + "plot_cluster(db,clusters)" ] }, { @@ -312,8 +339,8 @@ "### When NOT to use ?\n", "\n", "1. You have a high dimentional dataset. 
Euclidean distance will fail thanks to '[curse of dimentionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality#Distance_functions)'.\n", - "2. We have used a dict to store the points. So we can't do anything about the order in which the points will be processed. So its not entirely deterministic.\n", - "3. Won't work well if there are large differences in density. Finding the min_pts and $ε$ combination is gonna be a big pain in the neck.\n", + "2. We have used a dict to store the points. So we can't do anything about the order in which the points will be processed. So it's not entirely deterministic.\n", + "3. Won't work well if there are large differences in density. Finding the min_pts and $ε$ combination will be difficult.\n", "4. Choosing the $ε$ without understanding the data and its scale, might result is poor clustering performance." ] }, diff --git a/machine_learning/dbscan/dbscan.py b/machine_learning/dbscan/dbscan.py index 1c7d8f7bd259..04fb5f0186e1 100644 --- a/machine_learning/dbscan/dbscan.py +++ b/machine_learning/dbscan/dbscan.py @@ -1,36 +1,146 @@ import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import make_moons +import warnings -def euclidean_distance(Q, P): +def euclidean_distance(q, p): """ Calculates the Euclidean distance - between points P and Q + between points q and p + + Distance can only be calculated between numeric values + >>> euclidean_distance([1,'a'],[1,2]) + Traceback (most recent call last): + ... + ValueError: Non-numeric input detected + + The dimentions of both the points must be the same + >>> euclidean_distance([1,1,1],[1,2]) + Traceback (most recent call last): + ... + ValueError: expected dimensions to be 2-d, instead got p:3 and q:2 + + Supports only two dimentional points + >>> euclidean_distance([1,1,1],[1,2]) + Traceback (most recent call last): + ... 
+ ValueError: expected dimensions to be 2-d, instead got p:3 and q:2 + + Input should be in the format [x,y] or (x,y) + >>> euclidean_distance(1,2) + Traceback (most recent call last): + ... + TypeError: inputs must be iterable, either list [x,y] or tuple (x,y) """ - a = pow((Q[0] - P[0]), 2) - b = pow((Q[1] - P[1]), 2) + if not hasattr(q, "__iter__") or not hasattr(p, "__iter__"): + raise TypeError("inputs must be iterable, either list [x,y] or tuple (x,y)") + + if isinstance(q, str) or isinstance(p, str): + raise TypeError("inputs cannot be str") + + if len(q) != 2 or len(p) != 2: + raise ValueError( + "expected dimensions to be 2-d, instead got p:{} and q:{}".format( + len(q), len(p) + ) + ) + + for num in q + p: + try: + num = int(num) + except: + raise ValueError("Non-numeric input detected") + + a = pow((q[0] - p[0]), 2) + b = pow((q[1] - p[1]), 2) return pow((a + b), 0.5) -def find_neighbors(DB, Q, eps): +def find_neighbors(db, q, eps): """ - Finds all points in the DB that + Finds all points in the db that are within a distance of eps from Q + + eps value should be a number + >>> find_neighbors({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}}, (2,5),'a') + Traceback (most recent call last): + ... + ValueError: eps should be either int or float + + Q must be a 2-d point as list or tuple + >>> find_neighbors({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}}, 2, 0.5) + Traceback (most recent call last): + ... + TypeError: Q must a 2-dimentional point in the format (x,y) or [x,y] + + Points must be in correct format + >>> find_neighbors([], (2,2) ,0.4) + Traceback (most recent call last): + ... 
+ TypeError: db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}} """ - return [P for P in DB if euclidean_distance(Q, P) <= eps] + if not isinstance(eps, (int, float)): + raise ValueError("eps should be either int or float") + + if not hasattr(q, "__iter__"): + raise TypeError("Q must a 2-dimentional point in the format (x,y) or [x,y]") + + if not isinstance(db, dict): + raise TypeError( + "db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}}" + ) + + return [p for p in db if euclidean_distance(q, p) <= eps] -def plot_cluster(DB, clusters, ax): + +def plot_cluster(db, clusters, ax): """ - Extracts all the points in the DB and puts them together + Extracts all the points in the db and puts them together as seperate clusters and finally plots them + + db cannot be empty + >>> fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + >>> plot_cluster({},[1,2], axes[1] ) + Traceback (most recent call last): + ... + Exception: db is empty. No points to cluster + + clusters cannot be empty + >>> fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + >>> plot_cluster({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},[],axes[1] ) + Traceback (most recent call last): + ... + Exception: nothing to cluster. Empty clusters + + clusters cannot be empty + >>> fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) + >>> plot_cluster({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},[],axes[1] ) + Traceback (most recent call last): + ... + Exception: nothing to cluster. Empty clusters + + ax must be a plotable + >>> plot_cluster({ (1,2):{'label':'1'}, (2,3):{'label':'2'}},[1,2], [] ) + Traceback (most recent call last): + ... + TypeError: ax must be an slot in a matplotlib figure """ + if len(db) == 0: + raise Exception("db is empty. No points to cluster") + + if len(clusters) == 0: + raise Exception("nothing to cluster. 
Empty clusters") + + if not hasattr(ax, "plot"): + raise TypeError("ax must be an slot in a matplotlib figure") + temp = [] noise = [] for i in clusters: stack = [] - for k, v in DB.items(): + for k, v in db.items(): if v["label"] == i: stack.append(k) elif v["label"] == "noise": @@ -49,53 +159,113 @@ def plot_cluster(DB, clusters, ax): ax.plot(x, y, "ro", c="0") -def dbscan(DB, eps, min_pts): +def dbscan(db, eps, min_pts): """ Implementation of the DBSCAN algorithm + + Points must be in correct format + >>> dbscan([], (2,2) ,0.4) + Traceback (most recent call last): + ... + TypeError: db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}} + + eps value should be a number + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},'a',20 ) + Traceback (most recent call last): + ... + ValueError: eps should be either int or float + + min_pts value should be an integer + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},0.4,20.0 ) + Traceback (most recent call last): + ... + ValueError: min_pts should be int + + db cannot be empty + >>> dbscan({},0.4,20.0 ) + Traceback (most recent call last): + ... + Exception: db is empty, nothing to cluster + + min_pts cannot be negative + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}}, 0.4, -20) + Traceback (most recent call last): + ... + ValueError: min_pts or eps cannot be negative + + eps cannot be negative + >>> dbscan({ (1,2):{'label':'undefined'}, (2,3):{'label':'undefined'}},-0.4, 20) + Traceback (most recent call last): + ... 
+ ValueError: min_pts or eps cannot be negative + """ + if not isinstance(db, dict): + raise TypeError( + "db must be a dict of points in the format {(x,y):{'label':'boolean/undefined'}}" + ) + + if len(db) == 0: + raise Exception("db is empty, nothing to cluster") + + if not isinstance(eps, (int, float)): + raise ValueError("eps should be either int or float") + + if not isinstance(min_pts, int): + raise ValueError("min_pts should be int") + + if min_pts < 0 or eps < 0: + raise ValueError("min_pts or eps cannot be negative") + + if min_pts == 0: + warnings.warn("min_pts is 0. Are you sure you want this ?") + + if eps == 0: + warnings.warn("eps is 0. Are you sure you want this ?") + clusters = [] - C = 0 - for P in DB: - if DB[P]["label"] != "undefined": + c = 0 + for p in db: + if db[p]["label"] != "undefined": continue - neighbors = find_neighbors(DB, P, eps) + neighbors = find_neighbors(db, p, eps) if len(neighbors) < min_pts: - DB[P]["label"] = "noise" + db[p]["label"] = "noise" continue - C += 1 - clusters.append(C) - DB[P]["label"] = C - neighbors.remove(P) + c += 1 + clusters.append(c) + db[p]["label"] = c + neighbors.remove(p) seed_set = neighbors.copy() while seed_set != []: - Q = seed_set.pop(0) - if DB[Q]["label"] == "noise": - DB[Q]["label"] = C - if DB[Q]["label"] != "undefined": + q = seed_set.pop(0) + if db[q]["label"] == "noise": + db[q]["label"] = c + if db[q]["label"] != "undefined": continue - DB[Q]["label"] = C - neighbors_n = find_neighbors(DB, Q, eps) + db[q]["label"] = c + neighbors_n = find_neighbors(db, q, eps) if len(neighbors_n) >= min_pts: seed_set = seed_set + neighbors_n - return DB, clusters + return db, clusters if __name__ == "__main__": fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5)) - X, label = make_moons(n_samples=200, noise=0.1, random_state=19) + x, label = make_moons(n_samples=200, noise=0.1, random_state=19) - axes[0].plot(X[:, 0], X[:, 1], "ro") + axes[0].plot(x[:, 0], x[:, 1], "ro") - points = {(point[0], 
point[1]): {"label": "undefined"} for point in X} + points = {(point[0], point[1]): {"label": "undefined"} for point in x} eps = 0.25 min_pts = 12 - DB, clusters = dbscan(points, eps, min_pts) + db, clusters = dbscan(points, eps, min_pts) - plot_cluster(DB, clusters, axes[1]) + plot_cluster(db, clusters, axes[1]) plt.show()