From b75036a54e3e0696a51a5d7b4a54b8a773315574 Mon Sep 17 00:00:00 2001 From: zhqu1148980644 Date: Tue, 23 Oct 2018 19:15:48 +0800 Subject: [PATCH 1/8] Implementation of sequential minimal optimization algorithm --- machine_learning/smo.py | 486 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 486 insertions(+) create mode 100644 machine_learning/smo.py diff --git a/machine_learning/smo.py b/machine_learning/smo.py new file mode 100644 index 000000000000..8f571f2bf10d --- /dev/null +++ b/machine_learning/smo.py @@ -0,0 +1,486 @@ +# coding: utf-8 +""" + Implementation of sequential minimal optimization(SMO) for support vector machines(SVM). + + Sequential minimal optimization (SMO) is an algorithm for solving the quadratic programming (QP) problem + that arises during the training of support vector machines. + It was invented by John Platt in 1998. +Input: + 0: pandas dataframe + 1: first column of df must be tags of samples,should be 1 or -1. + 2: rows of df represent samples + +Usage: + 0: download https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/ + 1: choose your kernel function. + 2: call SmoSVM class to get your SmoSVM object. + 3: call SmoSVM object's fit() function. + 4: call SmoSVM object's predict() function. + +Reference: + https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/smo-book.pdf + https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf + http://web.cs.iastate.edu/~honavar/smo-svm.pdf +""" +from __future__ import division + +import numpy as np +import pandas as pd + + +class SmoSVM(object): + def __init__(self, train, kernel_func, alpha_list=None, cost=1.0, b=0.0, tolerance=0.0, auto_norm=True): + self._init = True + self._auto_norm = auto_norm + self._cost = np.float64(cost) + self._b = np.float64(b) + self._tol = np.float64(tolerance) if tolerance > 0.0001 else np.float64(0.001) + + self.tags = train[:, 0] + self.samples = self._norm(train[:, 1:]) if self._auto_norm else train[:, 1:] + self.alphas = alpha_list if alpha_list is not None else np.zeros(train.shape[0]) + self.Kernel = kernel_func + + self._eps = 0.001 + self._all_samples = list(range(self.length)) + self._K_matrix = self._calculate_k_matrix() + self._error = np.zeros(self.length) + self._unbound = [] + + self.choose_alpha = self._choose_alpha() + + # Calculate alphas using SMO algorithsm + def fit(self): + K = self._k + state = None + while True: + + # 1: Find alpha1, alpha2 + try: + i1, i2 = self.choose_alpha.send(state) + state = None + # show non-obey-kkt samples' number + # from collections import Counter + # result = [] + # for i in self._all_samples: + # result.append(self._check_obey_KKT(i)) + # print(Counter(result).get(True)) + except StopIteration: + print("Optimization done!\r\nEvery sample satisfy the KKT condition!") + # for i in self._all_samples: + # if self._check_obey_KKT(i): + # raise ValueError('some sample not fit KKT condition') + break + + # 2: calculate new alpha2 and new alpha1 + y1, y2 = self.tags[i1], self.tags[i2] + a1, a2 = self.alphas[i1].copy(), self.alphas[i2].copy() + e1, e2 = self._e(i1), self._e(i2) + args = (i1, i2, a1, a2, e1, e2, y1, y2) + a1_new, a2_new = self._get_new_alpha(*args) + + if not a1_new and not a2_new: + state = False + continue + self.alphas[i1], self.alphas[i2] = a1_new, a2_new + + # 3: update threshold(b) + b1_new = np.float64(-e1 - y1 * K(i1, i1) * (a1_new - a1) - y2 * K(i2, i1) * (a2_new - a2) + self.b) + b2_new = np.float64(-e2 - y2 * K(i2, i2) * (a2_new - a2) - y1 * K(i1, i2) * (a1_new - a1) + 
self.b) + + if 0.0 < a1_new < self.c: + b = b1_new + if 0.0 < a2_new < self.c: + b = b2_new + if not (np.float64(0) < a2_new < self.c) and not (np.float64(0) < a1_new < self.c): + b = (b1_new + b2_new) / 2.0 + b_old = self.b + self._b = b + + # 4: update error value,here we only calculate those non-bound samples' error + self._unbound = [i for i in self._all_samples if self._is_unbound(i)] + for s in self.unbound: + if s == i1 or s == i2: + continue + self._error[s] += y1 * (a1_new - a1) * K(i1, s) + y2 * (a2_new - a2) * K(i2, s) + (self.b - b_old) + + # if i1 or i2 is non-bound,update there error value to zero + if self._is_unbound(i1): + self._error[i1] = 0 + + if self._is_unbound(i2): + self._error[i2] = 0 + + # Predict test samles + def predict(self, test_samples): + def k(index, sample): + return self.Kernel(self.samples[index], sample) + + if test_samples.shape[1] > self.samples.shape[1]: + raise ValueError("Test samples' feature length not equal to train samples") + + if self._auto_norm: + test_samples = self._norm(test_samples) + + results = [] + for test_sample in test_samples: + tag = self.tags + alphas = self.alphas + b = self.b + result = np.sum([alphas[j] * tag[j] * k(j, test_sample) for j in self._all_samples]) + b + if result > 0: + results.append(1) + else: + results.append(-1) + + return results + + # Check if alpha violate KKT condition + def _check_obey_kkt(self, index): + alphas = self.alphas + tol = self.tol + r = self._e(index) * self.tags[index] + c = self.c + return (r < -tol and alphas[index] < c) or (r > tol and alphas[index] > 0.0) + + # Get value calculated from kernel function + def _k(self, i1, i2): + return self._K_matrix[i1, i2] + + # Calculate Kernel matrix of all possible i1,i2 ,saving time + def _calculate_k_matrix(self): + k_matrix = np.zeros([self.length, self.length]) + for i in self._all_samples: + for j in self._all_samples: + k_matrix[i, j] = np.float64(self.Kernel(self.samples[i, :], self.samples[j, :])) + return k_matrix + + # Get sample's error + def _e(self, index): + """ + Two cases: + 1:Sample[index] is non-bound,Fetch error from list (_error) + 2:sample[index] is bound,Use predicted value deduct true value (g(xi) - yi) + + """ + # get from error data + if self._is_unbound(index): + return self._error[index] + # get by g(xi) - yi + else: + return self._predict(index) - self.tags[index] + + # Equal to g(xi) + def _predict(self, index): + return np.dot(self.alphas * self.tags, self._K_matrix[:, index]) + self.b + + # Get L and H which bound the new alpha2 + def _get_LH(self, a1, a2, s): + if s == -1: + l, h = max(0.0, a2 - a1), min(self.c, self.c + a2 - a1) + elif s == 1: + l, h = max(0.0, a2 + a1 - self.c), min(self.c, a2 + a1) + else: + raise ValueError('s is not -1 or 1,s={}'.format(s)) + return l, h + + # Get K11 + K22 - 2*K12 + def _get_eta(self, i1, i2): + K = self._k + k11 = K(i1, i1) + k22 = K(i2, i2) + k12 = K(i1, i2) + return k11 + k22 - 2.0 * k12 + + # Get the new alpha2 and new alpha1 + def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2): + # Get K11 + K22 - 2*K12 + def get_eta(k_func, i1, i2): + k11 = k_func(i1, i1) + k22 = k_func(i2, i2) + k12 = k_func(i1, i2) + return k11 + k22 - 2.0 * k12 + + if i1 == i2: + return None, None + + s = y1 * y2 + L, H = self._get_LH(a1, a2, s) + if L == H: + return None, None + + K = self._k + eta = get_eta(K, i1, i2) + + if eta > 0.0: + a2_new_unc = a2 + (y2 * (e1 - e2)) / eta + + # a2_new has a boundry + if a2_new_unc >= H: + a2_new = H + elif a2_new_unc <= L: + a2_new = L + else: + a2_new = 
a2_new_unc
+
+        # select the new alpha2 which could get the minimal objective
+        else:
+            b = self.b
+            K = self._k
+            l1 = a1 + s * (a2 - L)
+            h1 = a1 + s * (a2 - H)
+
+            # way 1
+            f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2)
+            f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2)
+            ol = l1 * f1 + L * f2 + 1 / 2 * l1 ** 2 * K(i1, i1) + 1 / 2 * L ** 2 * K(i2, i2) + s * L * l1 * K(i1, i2)
+            oh = h1 * f1 + H * f2 + 1 / 2 * h1 ** 2 * K(i1, i1) + 1 / 2 * H ** 2 * K(i2, i2) + s * H * h1 * K(i1, i2)
+
+            # way 2
+            # tmp_alphas = self.alphas.copy()
+            # tmp_alphas[i1], tmp_alphas[i2] = l1, L
+            # ol = self._get_objective(tmp_alphas)
+            # tmp_alphas[i1], tmp_alphas[i2] = h1, H
+            # oh = self._get_objective(tmp_alphas)
+
+            if ol < (oh - self._eps):
+                a2_new = L
+            elif ol > oh + self._eps:
+                a2_new = H
+            else:
+                a2_new = a2
+
+        # a1_new has a boundry
+        a1_new = a1 + s * (a2 - a2_new)
+        if a1_new < 0:
+            a2_new += s * a1_new
+            a1_new = 0
+        if a1_new > self.c:
+            a2_new += s * (a1_new - self.c)
+            a1_new = self.c
+
+        return a1_new, a2_new
+
+    # Choose alpha1 and alpha2
+
+    def _choose_alpha(self):
+        locis = yield from self._choose_a1()
+        if not locis:
+            return
+        return locis
+
+    def _choose_a1(self):
+        """
+        Choose the first alpha; steps:
+        1: First, loop over all samples.
+        2: Second, loop over all non-bound samples repeatedly until no non-bound sample violates the KKT condition.
+        3: Repeat these two passes until, after a full pass over all samples, none violates the KKT condition.
+        """
+        while True:
+            all_not_obey = True
+            # all samples
+            print('scanning all samples!')
+            for i1 in [i for i in self._all_samples if self._check_obey_kkt(i)]:
+                all_not_obey = False
+                yield from self._choose_a2(i1)
+
+            # non-bound samples
+            print('scanning non-bound samples!')
+            while True:
+                not_obey = True
+                for i1 in [i for i in self._all_samples if self._check_obey_kkt(i) and self._is_unbound(i)]:
+                    not_obey = False
+                    yield from self._choose_a2(i1)
+                if not_obey:
+                    print('all non-bound samples fit the KKT condition!')
+                    break
+            if all_not_obey:
+                print('all samples fit the KKT condition! Optimization done!')
+                break
+        return False
+
+    def _choose_a2(self, i1):
+        """
+        Choose the second alpha using a heuristic algorithm; steps:
+        1: Choose the alpha2 that gives the maximum step size (|E1 - E2|).
+        2: Starting from a random point, loop over all non-bound samples until alpha1 and alpha2 are optimized.
+        3: Starting from a random point, loop over all samples until alpha1 and alpha2 are optimized.
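+
+        When fit() sends a non-None value back into the generator (signalling a failed update), the
+        search falls through to the next choice strategy above; on success the generator returns so
+        that a new alpha1 can be scanned.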
+ """ + self._unbound = [i for i in self._all_samples if self._is_unbound(i)] + + if len(self.unbound) > 0: + tmp_error = self._error.copy().tolist() + tmp_error_dict = {index: value for index, value in enumerate(tmp_error) if self._is_unbound(index)} + + if self._e(i1) >= 0: + i2 = min(tmp_error_dict, key=lambda index: tmp_error_dict[index]) + else: + i2 = max(tmp_error_dict, key=lambda index: tmp_error_dict[index]) + + cmd = yield i1, i2 + if cmd is None: + return + + for i2 in np.roll(self.unbound, np.random.choice(self.length)): + cmd = yield i1, i2 + if cmd is None: + return + + for i2 in np.roll(self._all_samples, np.random.choice(self.length)): + cmd = yield i1, i2 + if cmd is None: + return + + # Normalise data using min_max way + def _norm(self, data): + # Use sklearn's normerlizer + # from sklearn.preprocessing import MinMaxScaler + # if self.init: + # self.normer = MinMaxScaler() + # return self.normer.fit_transform(data) + # else: + # return self.normer.transform(data) + + if self._init: + self._min = np.min(data, axis=0) + self._max = np.max(data, axis=0) + self._init = False + + return (data - self._min) / (self._max - self._min) + else: + + return (data - self._min) / (self._max - self._min) + + def _is_unbound(self, index): + if 0.0 < self.alphas[index] < self.c: + return True + else: + return False + + def _is_support(self, index): + if self.alphas[index] > 0: + return True + else: + return False + + @property + def unbound(self): + return self._unbound + + @property + def support(self): + return [i for i in range(self.length) if self._is_support(i)] + + @property + def b(self): + return self._b + + @b.setter + def b(self, value): + self._b = value + + @property + def c(self): + return self._cost + + @property + def tol(self): + return self._tol + + @property + def length(self): + return self.samples.shape[0] + + +class Kernel(object): + def __init__(self, kernel, degree=1.0, coef0=0.0, gamma=1.0): + self.degree = np.float64(degree) + self.coef0 = np.float64(coef0) + self.gamma = np.float64(gamma) + self._kernel_name = kernel + self._kernel = self._get_kernel(kernel_name=kernel) + self._check() + + def _polynomial(self, v1, v2): + return (self.gamma * np.inner(v1, v2) + self.coef0) ** self.degree + + def _linear(self, v1, v2): + return np.inner(v1, v2) + self.coef0 + + def _rbf(self, v1, v2): + return np.exp(-1 * (self.gamma * np.linalg.norm(v1 - v2) ** 2)) + + def _check(self): + if self._kernel == self._rbf: + if self.gamma < 0: + raise ValueError('gamma value must greater than 0') + + def _get_kernel(self, kernel_name): + + maps = { + 'linear': self._linear, + 'poly': self._polynomial, + 'rbf': self._rbf + } + + return maps[kernel_name] + + def __call__(self, v1, v2): + return self._kernel(v1, v2) + + def __repr__(self): + return self._kernel_name + + +def count_time(title='Process'): + def count(func): + def call_func(*args, **kwargs): + import time + start_time = time.time() + func(*args, **kwargs) + end_time = time.time() + print('\r\n{} cost {} seconds'.format(title, end_time - start_time)) + + return call_func + + return count + + +@count_time(title='SMO algorithm') +def test(): + print('Hello!\r\nStart test svm by smo algorithm!') + # 0: change the dataset path + data = pd.read_csv(r'C:/Users/dell/Downloads/breast-cancer-wisconsin-data/data.csv') + + # 1: pre-processing data + del data[data.columns.tolist()[-1]] + del data['id'] + data = data.dropna(axis=0) + data = data.replace({'M': np.float64(1), 'B': np.float64(-1)}) + samples = np.array(data)[:, :] + + # 2: 
deviding data into train_data data and test_data data + train_data, test_data = samples[:328, :], samples[328:, :] + test_tags, test_samples = test_data[:, 0], test_data[:, 1:] + + # 3: choose kernel function,and set alphas to zero + mykernel = Kernel(kernel='rbf', degree=3, coef0=1, gamma=0.5) + al = np.zeros(train_data.shape[0]) + + # 4: calculating best alphas using SMO algorithm and predict test_data samples + mysvm = SmoSVM(train=train_data, kernel_func=mykernel, alpha_list=al, cost=0.4, b=0.0, tolerance=0.001) + mysvm.fit() + predict = mysvm.predict(test_samples) + + # 5: check accuracy + score = 0 + test_num = test_tags.shape[0] + for i in range(test_tags.shape[0]): + if test_tags[i] == predict[i]: + score += 1 + print('\r\nall: {}\r\nright: {}\r\nfalse: {}'.format(test_num, score, test_num - score)) + print("Rough Accuracy: {}".format(score / test_tags.shape[0])) + + +if __name__ == '__main__': + test() From 9a2fe0472843cf5256de1549f14b733d1a2bf885 Mon Sep 17 00:00:00 2001 From: BAKEZQ Date: Tue, 23 Oct 2018 19:33:43 +0800 Subject: [PATCH 2/8] Update smo.py --- machine_learning/smo.py | 51 +++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/machine_learning/smo.py b/machine_learning/smo.py index 8f571f2bf10d..71ae34c30b65 100644 --- a/machine_learning/smo.py +++ b/machine_learning/smo.py @@ -59,17 +59,22 @@ def fit(self): try: i1, i2 = self.choose_alpha.send(state) state = None - # show non-obey-kkt samples' number - # from collections import Counter - # result = [] - # for i in self._all_samples: - # result.append(self._check_obey_KKT(i)) - # print(Counter(result).get(True)) + ''' + #show non-obey-kkt samples' number + from collections import Counter + result = [] + for i in self._all_samples: + result.append(self._check_obey_KKT(i)) + print(Counter(result).get(True)) + ''' except StopIteration: print("Optimization done!\r\nEvery sample satisfy the KKT condition!") - # for i in self._all_samples: - # if self._check_obey_KKT(i): - # raise ValueError('some sample not fit KKT condition') + ''' + # may cost time + for i in self._all_samples: + if self._check_obey_KKT(i): + raise ValueError('some sample not fit KKT condition') + ''' break # 2: calculate new alpha2 and new alpha1 @@ -235,13 +240,14 @@ def get_eta(k_func, i1, i2): f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2) ol = l1 * f1 + L * f2 + 1 / 2 * l1 ** 2 * K(i1, i1) + 1 / 2 * L ** 2 * K(i2, i2) + s * L * l1 * K(i1, i2) oh = h1 * f1 + H * f2 + 1 / 2 * h1 ** 2 * K(i1, i1) + 1 / 2 * H ** 2 * K(i2, i2) + s * H * h1 * K(i1, i2) - + ''' # way 2 - # tmp_alphas = self.alphas.copy() - # tmp_alphas[i1], tmp_alphas[i2] = l1, L - # ol = self._get_objective(tmp_alphas) - # tmp_alphas[i1], tmp_alphas[i2] = h1, H - # oh = self._get_objective(tmp_alphas) + tmp_alphas = self.alphas.copy() + tmp_alphas[i1], tmp_alphas[i2] = l1, L + ol = self._get_objective(tmp_alphas) + tmp_alphas[i1], tmp_alphas[i2] = h1, H + oh = self._get_objective(tmp_alphas) + ''' if ol < (oh - self._eps): a2_new = L @@ -333,14 +339,15 @@ def _choose_a2(self, i1): # Normalise data using min_max way def _norm(self, data): + ''' # Use sklearn's normerlizer - # from sklearn.preprocessing import MinMaxScaler - # if self.init: - # self.normer = MinMaxScaler() - # return self.normer.fit_transform(data) - # else: - # return self.normer.transform(data) - + from sklearn.preprocessing import MinMaxScaler + if self.init: + self.normer = MinMaxScaler() + return self.normer.fit_transform(data) + else: + return 
self.normer.transform(data) + ''' if self._init: self._min = np.min(data, axis=0) self._max = np.max(data, axis=0) From a5bf83081d1ae0d2cc03de93bf91d78b0795d300 Mon Sep 17 00:00:00 2001 From: BAKEZQ Date: Fri, 26 Oct 2018 23:51:12 +0800 Subject: [PATCH 3/8] Add demonstration of svm partition boundary 1:Use matplotlib show svm's partition boundary 2:Automatically download test dataset --- machine_learning/smo.py | 443 ++++++++++++++++++++++------------------ 1 file changed, 239 insertions(+), 204 deletions(-) diff --git a/machine_learning/smo.py b/machine_learning/smo.py index 71ae34c30b65..f98647128075 100644 --- a/machine_learning/smo.py +++ b/machine_learning/smo.py @@ -5,17 +5,23 @@ Sequential minimal optimization (SMO) is an algorithm for solving the quadratic programming (QP) problem that arises during the training of support vector machines. It was invented by John Platt in 1998. + Input: 0: pandas dataframe - 1: first column of df must be tags of samples,should be 1 or -1. - 2: rows of df represent samples + 1: first column of dataframe must be tags of samples,should be 1 or -1. + 2: rows of dataframe represent samples Usage: - 0: download https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/ - 1: choose your kernel function. - 2: call SmoSVM class to get your SmoSVM object. - 3: call SmoSVM object's fit() function. - 4: call SmoSVM object's predict() function. + Command: + python3 smo.py + Code: + from smo import SmoSVM, Kernel + + kernel = Kernel(kernel='poly', degree=3., coef0=1., gamma=0.5) + init_alphas = np.zeros(train.shape[0]) + SVM = SmoSVM(train=train, alpha_list=init_alphas, kernel_func=kernel, cost=0.4, b=0.0, tolerance=0.001) + SVM.fit() + predict = SVM.predict(test_samples) Reference: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/smo-book.pdf @@ -24,15 +30,23 @@ """ from __future__ import division +import os +import urllib.request + +import matplotlib.pyplot as plt import numpy as np import pandas as pd +from sklearn.datasets import make_blobs, make_circles +from sklearn.preprocessing import StandardScaler + +CANCER_DATASET_URL = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data' class SmoSVM(object): - def __init__(self, train, kernel_func, alpha_list=None, cost=1.0, b=0.0, tolerance=0.0, auto_norm=True): + def __init__(self, train, kernel_func, alpha_list=None, cost=0.4, b=0.0, tolerance=0.001, auto_norm=True): self._init = True self._auto_norm = auto_norm - self._cost = np.float64(cost) + self._c = np.float64(cost) self._b = np.float64(b) self._tol = np.float64(tolerance) if tolerance > 0.0001 else np.float64(0.001) @@ -47,7 +61,7 @@ def __init__(self, train, kernel_func, alpha_list=None, cost=1.0, b=0.0, toleran self._error = np.zeros(self.length) self._unbound = [] - self.choose_alpha = self._choose_alpha() + self.choose_alpha = self._choose_alphas() # Calculate alphas using SMO algorithsm def fit(self): @@ -59,22 +73,8 @@ def fit(self): try: i1, i2 = self.choose_alpha.send(state) state = None - ''' - #show non-obey-kkt samples' number - from collections import Counter - result = [] - for i in self._all_samples: - result.append(self._check_obey_KKT(i)) - print(Counter(result).get(True)) - ''' except StopIteration: print("Optimization done!\r\nEvery sample satisfy the KKT condition!") - ''' - # may cost time - for i in self._all_samples: - if self._check_obey_KKT(i): - raise ValueError('some sample not fit KKT condition') - ''' break # 2: calculate new alpha2 and new alpha1 @@ -83,23 +83,21 @@ def 
fit(self): e1, e2 = self._e(i1), self._e(i2) args = (i1, i2, a1, a2, e1, e2, y1, y2) a1_new, a2_new = self._get_new_alpha(*args) - if not a1_new and not a2_new: state = False continue self.alphas[i1], self.alphas[i2] = a1_new, a2_new # 3: update threshold(b) - b1_new = np.float64(-e1 - y1 * K(i1, i1) * (a1_new - a1) - y2 * K(i2, i1) * (a2_new - a2) + self.b) - b2_new = np.float64(-e2 - y2 * K(i2, i2) * (a2_new - a2) - y1 * K(i1, i2) * (a1_new - a1) + self.b) - - if 0.0 < a1_new < self.c: + b1_new = np.float64(-e1 - y1 * K(i1, i1) * (a1_new - a1) - y2 * K(i2, i1) * (a2_new - a2) + self._b) + b2_new = np.float64(-e2 - y2 * K(i2, i2) * (a2_new - a2) - y1 * K(i1, i2) * (a1_new - a1) + self._b) + if 0.0 < a1_new < self._c: b = b1_new - if 0.0 < a2_new < self.c: + if 0.0 < a2_new < self._c: b = b2_new - if not (np.float64(0) < a2_new < self.c) and not (np.float64(0) < a1_new < self.c): + if not (np.float64(0) < a2_new < self._c) and not (np.float64(0) < a1_new < self._c): b = (b1_new + b2_new) / 2.0 - b_old = self.b + b_old = self._b self._b = b # 4: update error value,here we only calculate those non-bound samples' error @@ -107,19 +105,16 @@ def fit(self): for s in self.unbound: if s == i1 or s == i2: continue - self._error[s] += y1 * (a1_new - a1) * K(i1, s) + y2 * (a2_new - a2) * K(i2, s) + (self.b - b_old) + self._error[s] += y1 * (a1_new - a1) * K(i1, s) + y2 * (a2_new - a2) * K(i2, s) + (self._b - b_old) # if i1 or i2 is non-bound,update there error value to zero if self._is_unbound(i1): self._error[i1] = 0 - if self._is_unbound(i2): self._error[i2] = 0 # Predict test samles - def predict(self, test_samples): - def k(index, sample): - return self.Kernel(self.samples[index], sample) + def predict(self, test_samples, classify=True): if test_samples.shape[1] > self.samples.shape[1]: raise ValueError("Test samples' feature length not equal to train samples") @@ -129,43 +124,40 @@ def k(index, sample): results = [] for test_sample in test_samples: - tag = self.tags - alphas = self.alphas - b = self.b - result = np.sum([alphas[j] * tag[j] * k(j, test_sample) for j in self._all_samples]) + b - if result > 0: - results.append(1) + result = self._predict(test_sample) + if classify: + if result > 0: + results.append(1) + else: + results.append(-1) else: - results.append(-1) - - return results + results.append(result) + return np.array(results) # Check if alpha violate KKT condition def _check_obey_kkt(self, index): alphas = self.alphas - tol = self.tol + tol = self._tol r = self._e(index) * self.tags[index] - c = self.c + c = self._c + return (r < -tol and alphas[index] < c) or (r > tol and alphas[index] > 0.0) # Get value calculated from kernel function def _k(self, i1, i2): - return self._K_matrix[i1, i2] - - # Calculate Kernel matrix of all possible i1,i2 ,saving time - def _calculate_k_matrix(self): - k_matrix = np.zeros([self.length, self.length]) - for i in self._all_samples: - for j in self._all_samples: - k_matrix[i, j] = np.float64(self.Kernel(self.samples[i, :], self.samples[j, :])) - return k_matrix + # for test samples,use Kernel function + if isinstance(i2, np.ndarray): + return self.Kernel(self.samples[i1], i2) + # for train samples,Kernel values have been saved in matrix + else: + return self._K_matrix[i1, i2] # Get sample's error def _e(self, index): """ Two cases: - 1:Sample[index] is non-bound,Fetch error from list (_error) - 2:sample[index] is bound,Use predicted value deduct true value (g(xi) - yi) + 1:Sample[index] is non-bound,Fetch error from list: _error + 2:sample[index] is 
bound,Use predicted value deduct true value: g(xi) - yi """ # get from error data @@ -173,103 +165,27 @@ def _e(self, index): return self._error[index] # get by g(xi) - yi else: - return self._predict(index) - self.tags[index] - - # Equal to g(xi) - def _predict(self, index): - return np.dot(self.alphas * self.tags, self._K_matrix[:, index]) + self.b - - # Get L and H which bound the new alpha2 - def _get_LH(self, a1, a2, s): - if s == -1: - l, h = max(0.0, a2 - a1), min(self.c, self.c + a2 - a1) - elif s == 1: - l, h = max(0.0, a2 + a1 - self.c), min(self.c, a2 + a1) - else: - raise ValueError('s is not -1 or 1,s={}'.format(s)) - return l, h - - # Get K11 + K22 - 2*K12 - def _get_eta(self, i1, i2): - K = self._k - k11 = K(i1, i1) - k22 = K(i2, i2) - k12 = K(i1, i2) - return k11 + k22 - 2.0 * k12 - - # Get the new alpha2 and new alpha1 - def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2): - # Get K11 + K22 - 2*K12 - def get_eta(k_func, i1, i2): - k11 = k_func(i1, i1) - k22 = k_func(i2, i2) - k12 = k_func(i1, i2) - return k11 + k22 - 2.0 * k12 - - if i1 == i2: - return None, None - - s = y1 * y2 - L, H = self._get_LH(a1, a2, s) - if L == H: - return None, None - - K = self._k - eta = get_eta(K, i1, i2) - - if eta > 0.0: - a2_new_unc = a2 + (y2 * (e1 - e2)) / eta - - # a2_new has a boundry - if a2_new_unc >= H: - a2_new = H - elif a2_new_unc <= L: - a2_new = L - else: - a2_new = a2_new_unc - - # select the new alpha2 which could get the minimal objective - else: - b = self.b - K = self._k - l1 = a1 + s * (a2 - L) - h1 = a1 + s * (a2 - H) + gx = np.dot(self.alphas * self.tags, self._K_matrix[:, index]) + self._b + yi = self.tags[index] + return gx - yi - # way 1 - f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2) - f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2) - ol = l1 * f1 + L * f2 + 1 / 2 * l1 ** 2 * K(i1, i1) + 1 / 2 * L ** 2 * K(i2, i2) + s * L * l1 * K(i1, i2) - oh = h1 * f1 + H * f2 + 1 / 2 * h1 ** 2 * K(i1, i1) + 1 / 2 * H ** 2 * K(i2, i2) + s * H * h1 * K(i1, i2) - ''' - # way 2 - tmp_alphas = self.alphas.copy() - tmp_alphas[i1], tmp_alphas[i2] = l1, L - ol = self._get_objective(tmp_alphas) - tmp_alphas[i1], tmp_alphas[i2] = h1, H - oh = self._get_objective(tmp_alphas) - ''' - - if ol < (oh - self._eps): - a2_new = L - elif ol > oh + self._eps: - a2_new = H - else: - a2_new = a2 + # Calculate Kernel matrix of all possible i1,i2 ,saving time + def _calculate_k_matrix(self): + k_matrix = np.zeros([self.length, self.length]) + for i in self._all_samples: + for j in self._all_samples: + k_matrix[i, j] = np.float64(self.Kernel(self.samples[i, :], self.samples[j, :])) + return k_matrix - # a1_new has a boundry - a1_new = a1 + s * (a2 - a2_new) - if a1_new < 0: - a2_new += s * a1_new - a1_new = 0 - if a1_new > self.c: - a2_new += s * (a1_new - self.c) - a1_new = self.c - - return a1_new, a2_new + # Predict test sample's tag + def _predict(self, sample): + k = self._k + predicted_value = np.sum( + [self.alphas[i1] * self.tags[i1] * k(i1, sample) for i1 in self._all_samples]) + self._b + return predicted_value # Choose alpha1 and alpha2 - - def _choose_alpha(self): + def _choose_alphas(self): locis = yield from self._choose_a1() if not locis: return @@ -317,12 +233,10 @@ def _choose_a2(self, i1): if len(self.unbound) > 0: tmp_error = self._error.copy().tolist() tmp_error_dict = {index: value for index, value in enumerate(tmp_error) if self._is_unbound(index)} - if self._e(i1) >= 0: i2 = min(tmp_error_dict, key=lambda index: tmp_error_dict[index]) else: i2 = 
max(tmp_error_dict, key=lambda index: tmp_error_dict[index]) - cmd = yield i1, i2 if cmd is None: return @@ -337,29 +251,85 @@ def _choose_a2(self, i1): if cmd is None: return + # Get the new alpha2 and new alpha1 + def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2): + K = self._k + if i1 == i2: + return None, None + + # calculate L and H which bound the new alpha2 + s = y1 * y2 + if s == -1: + L, H = max(0.0, a2 - a1), min(self._c, self._c + a2 - a1) + else: + L, H = max(0.0, a2 + a1 - self._c), min(self._c, a2 + a1) + if L == H: + return None, None + + # calculate eta + k11 = K(i1, i1) + k22 = K(i2, i2) + k12 = K(i1, i2) + eta = k11 + k22 - 2.0 * k12 + + # select the new alpha2 which could get the minimal objectives + if eta > 0.0: + a2_new_unc = a2 + (y2 * (e1 - e2)) / eta + # a2_new has a boundry + if a2_new_unc >= H: + a2_new = H + elif a2_new_unc <= L: + a2_new = L + else: + a2_new = a2_new_unc + else: + b = self._b + l1 = a1 + s * (a2 - L) + h1 = a1 + s * (a2 - H) + + # way 1 + f1 = y1 * (e1 + b) - a1 * K(i1, i1) - s * a2 * K(i1, i2) + f2 = y2 * (e2 + b) - a2 * K(i2, i2) - s * a1 * K(i1, i2) + ol = l1 * f1 + L * f2 + 1 / 2 * l1 ** 2 * K(i1, i1) + 1 / 2 * L ** 2 * K(i2, i2) + s * L * l1 * K(i1, i2) + oh = h1 * f1 + H * f2 + 1 / 2 * h1 ** 2 * K(i1, i1) + 1 / 2 * H ** 2 * K(i2, i2) + s * H * h1 * K(i1, i2) + """ + # way 2 + tmp_alphas = self.alphas.copy() + tmp_alphas[i1], tmp_alphas[i2] = l1, L + ol = self._get_objectives(tmp_alphas) + tmp_alphas[i1], tmp_alphas[i2] = h1, H + oh = self._get_objectives(tmp_alphas) + """ + if ol < (oh - self._eps): + a2_new = L + elif ol > oh + self._eps: + a2_new = H + else: + a2_new = a2 + + # a1_new has a boundry too + a1_new = a1 + s * (a2 - a2_new) + if a1_new < 0: + a2_new += s * a1_new + a1_new = 0 + if a1_new > self._c: + a2_new += s * (a1_new - self._c) + a1_new = self._c + + return a1_new, a2_new + # Normalise data using min_max way def _norm(self, data): - ''' - # Use sklearn's normerlizer - from sklearn.preprocessing import MinMaxScaler - if self.init: - self.normer = MinMaxScaler() - return self.normer.fit_transform(data) - else: - return self.normer.transform(data) - ''' if self._init: self._min = np.min(data, axis=0) self._max = np.max(data, axis=0) self._init = False - return (data - self._min) / (self._max - self._min) else: - return (data - self._min) / (self._max - self._min) def _is_unbound(self, index): - if 0.0 < self.alphas[index] < self.c: + if 0.0 < self.alphas[index] < self._c: return True else: return False @@ -378,22 +348,6 @@ def unbound(self): def support(self): return [i for i in range(self.length) if self._is_support(i)] - @property - def b(self): - return self._b - - @b.setter - def b(self, value): - self._b = value - - @property - def c(self): - return self._cost - - @property - def tol(self): - return self._tol - @property def length(self): return self.samples.shape[0] @@ -423,13 +377,11 @@ def _check(self): raise ValueError('gamma value must greater than 0') def _get_kernel(self, kernel_name): - maps = { 'linear': self._linear, 'poly': self._polynomial, 'rbf': self._rbf } - return maps[kernel_name] def __call__(self, v1, v2): @@ -439,29 +391,35 @@ def __repr__(self): return self._kernel_name -def count_time(title='Process'): - def count(func): - def call_func(*args, **kwargs): - import time - start_time = time.time() - func(*args, **kwargs) - end_time = time.time() - print('\r\n{} cost {} seconds'.format(title, end_time - start_time)) - - return call_func +def count_time(func): + def call_func(*args, **kwargs): 
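+        # time the wrapped function call and report the elapsed wall-clock seconds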
+ import time + start_time = time.time() + func(*args, **kwargs) + end_time = time.time() + print('smo algorithm cost {} seconds'.format(end_time - start_time)) - return count + return call_func -@count_time(title='SMO algorithm') -def test(): +@count_time +def test_cancel_data(): print('Hello!\r\nStart test svm by smo algorithm!') - # 0: change the dataset path - data = pd.read_csv(r'C:/Users/dell/Downloads/breast-cancer-wisconsin-data/data.csv') + # 0: download dataset and load into pandas' dataframe + if not os.path.exists(r'cancel_data.csv'): + request = urllib.request.Request( + CANCER_DATASET_URL, + headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'} + ) + response = urllib.request.urlopen(request) + content = response.read().decode('utf-8') + with open(r'cancel_data.csv', 'w') as f: + f.write(content) + + data = pd.read_csv(r'cancel_data.csv', header=None) # 1: pre-processing data - del data[data.columns.tolist()[-1]] - del data['id'] + del data[data.columns.tolist()[0]] data = data.dropna(axis=0) data = data.replace({'M': np.float64(1), 'B': np.float64(-1)}) samples = np.array(data)[:, :] @@ -470,8 +428,8 @@ def test(): train_data, test_data = samples[:328, :], samples[328:, :] test_tags, test_samples = test_data[:, 0], test_data[:, 1:] - # 3: choose kernel function,and set alphas to zero - mykernel = Kernel(kernel='rbf', degree=3, coef0=1, gamma=0.5) + # 3: choose kernel function,and set initial alphas to zero(optional) + mykernel = Kernel(kernel='rbf', degree=5, coef0=1, gamma=0.5) al = np.zeros(train_data.shape[0]) # 4: calculating best alphas using SMO algorithm and predict test_data samples @@ -489,5 +447,82 @@ def test(): print("Rough Accuracy: {}".format(score / test_tags.shape[0])) +def test_demonstration(): + # change stdout + import os + import sys + sys.stdout = open(os.devnull, 'w') + + ax1 = plt.subplot2grid((2, 2), (0, 0)) + ax2 = plt.subplot2grid((2, 2), (0, 1)) + ax3 = plt.subplot2grid((2, 2), (1, 0)) + ax4 = plt.subplot2grid((2, 2), (1, 1)) + ax1.set_title("linear svm,cost:0.1") + test_linear_kernel(ax1, cost=0.1) + ax2.set_title("linear svm,cost:500") + test_linear_kernel(ax2, cost=500) + ax3.set_title("rbf kernel svm,cost:0.1") + test_rbf_kernel(ax3, cost=0.1) + ax4.set_title("rbf kernel svm,cost:500") + test_rbf_kernel(ax4, cost=500) + + +def test_linear_kernel(ax, cost): + train_x, train_y = make_blobs(n_samples=1000, centers=2, + n_features=2, random_state=1) + train_y[train_y == 0] = -1 + scaler = StandardScaler() + train_x_scaled = scaler.fit_transform(train_x, train_y) + train_data = np.hstack((train_y.reshape(1000, 1), train_x_scaled)) + mykernel = Kernel(kernel='linear', degree=5, coef0=1, gamma=0.5) + mysvm = SmoSVM(train=train_data, kernel_func=mykernel, cost=cost, tolerance=0.001, auto_norm=False) + mysvm.fit() + plot_partition_boundary(mysvm, train_data, ax=ax) + + +def test_rbf_kernel(ax, cost): + train_x, train_y = make_circles(n_samples=500, noise=0.1, factor=0.1, random_state=1) + train_y[train_y == 0] = -1 + scaler = StandardScaler() + train_x_scaled = scaler.fit_transform(train_x, train_y) + train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled)) + mykernel = Kernel(kernel='rbf', degree=5, coef0=1, gamma=0.5) + mysvm = SmoSVM(train=train_data, kernel_func=mykernel, cost=cost, tolerance=0.001, auto_norm=False) + mysvm.fit() + plot_partition_boundary(mysvm, train_data, ax=ax) + + +def plot_partition_boundary(model, train_data, ax, resolution=100, colors=('b', 'k', 'r')): + """ + We can not get the optimum w of our 
kernel svm model which is different from linear svm. + For this reason, we generate randomly destributed points with high desity and prediced values of these points are + calculated by using our tained model. Then we could use this prediced values to draw contour map. + And this contour map can represent svm's partition boundary. + + """ + train_data_x = train_data[:, 1] + train_data_y = train_data[:, 2] + train_data_tags = train_data[:, 0] + xrange = np.linspace(train_data_x.min(), train_data_x.max(), resolution) + yrange = np.linspace(train_data_y.min(), train_data_y.max(), resolution) + test_samples = np.array([(x, y) for x in xrange for y in yrange]).reshape(resolution * resolution, 2) + + test_tags = model.predict(test_samples, classify=False) + grid = test_tags.reshape((len(xrange), len(yrange))) + + # Plot contour map which represents the partition boundary + ax.contour(xrange, yrange, np.mat(grid).T, levels=(-1, 0, 1), linestyles=('--', '-', '--'), + linewidths=(1, 1, 1), + colors=colors) + # Plot all train samples + ax.scatter(train_data_x, train_data_y, c=train_data_tags, cmap=plt.cm.Dark2, lw=0, alpha=0.5) + + # Plot support vectors + support = model.support + ax.scatter(train_data_x[support], train_data_y[support], c=train_data_tags[support], cmap=plt.cm.Dark2) + + if __name__ == '__main__': - test() + test_cancel_data() + test_demonstration() + plt.show() From e09a43ada7ea1e240233405e5c2f8888db58b3f0 Mon Sep 17 00:00:00 2001 From: BAKEZQ Date: Sat, 27 Oct 2018 18:14:51 +0800 Subject: [PATCH 4/8] Update smo.py --- machine_learning/smo.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/machine_learning/smo.py b/machine_learning/smo.py index f98647128075..87b8edda74bc 100644 --- a/machine_learning/smo.py +++ b/machine_learning/smo.py @@ -31,6 +31,7 @@ from __future__ import division import os +import sys import urllib.request import matplotlib.pyplot as plt @@ -294,11 +295,8 @@ def _get_new_alpha(self, i1, i2, a1, a2, e1, e2, y1, y2): oh = h1 * f1 + H * f2 + 1 / 2 * h1 ** 2 * K(i1, i1) + 1 / 2 * H ** 2 * K(i2, i2) + s * H * h1 * K(i1, i2) """ # way 2 - tmp_alphas = self.alphas.copy() - tmp_alphas[i1], tmp_alphas[i2] = l1, L - ol = self._get_objectives(tmp_alphas) - tmp_alphas[i1], tmp_alphas[i2] = h1, H - oh = self._get_objectives(tmp_alphas) + Use objective function check which alpha2 new could get the minimal objectives + """ if ol < (oh - self._eps): a2_new = L @@ -449,8 +447,7 @@ def test_cancel_data(): def test_demonstration(): # change stdout - import os - import sys + print('\r\nStart plot,please wait!!!') sys.stdout = open(os.devnull, 'w') ax1 = plt.subplot2grid((2, 2), (0, 0)) @@ -466,14 +463,16 @@ def test_demonstration(): ax4.set_title("rbf kernel svm,cost:500") test_rbf_kernel(ax4, cost=500) + sys.stdout = sys.__stdout__ + print("Plot done!!!") def test_linear_kernel(ax, cost): - train_x, train_y = make_blobs(n_samples=1000, centers=2, + train_x, train_y = make_blobs(n_samples=500, centers=2, n_features=2, random_state=1) train_y[train_y == 0] = -1 scaler = StandardScaler() train_x_scaled = scaler.fit_transform(train_x, train_y) - train_data = np.hstack((train_y.reshape(1000, 1), train_x_scaled)) + train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled)) mykernel = Kernel(kernel='linear', degree=5, coef0=1, gamma=0.5) mysvm = SmoSVM(train=train_data, kernel_func=mykernel, cost=cost, tolerance=0.001, auto_norm=False) mysvm.fit() @@ -526,3 +525,4 @@ def plot_partition_boundary(model, train_data, ax, resolution=100, 
colors=('b', test_cancel_data() test_demonstration() plt.show() + From 415aad85565aa496187308a8b4e4a9a848d08e13 Mon Sep 17 00:00:00 2001 From: John Law Date: Mon, 16 Sep 2019 23:08:35 +0800 Subject: [PATCH 5/8] Update smo.py --- machine_learning/smo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/smo.py b/machine_learning/smo.py index 87b8edda74bc..7b853804d2df 100644 --- a/machine_learning/smo.py +++ b/machine_learning/smo.py @@ -28,6 +28,7 @@ https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf http://web.cs.iastate.edu/~honavar/smo-svm.pdf """ + from __future__ import division import os From 3c3273019fa136d259a294b203b767ad04668d8d Mon Sep 17 00:00:00 2001 From: BAKEZQ Date: Mon, 16 Sep 2019 23:52:53 +0800 Subject: [PATCH 6/8] Rename smo.py to sequential_minimum_optimization.py --- machine_learning/{smo.py => sequential_minimum_optimization.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename machine_learning/{smo.py => sequential_minimum_optimization.py} (100%) diff --git a/machine_learning/smo.py b/machine_learning/sequential_minimum_optimization.py similarity index 100% rename from machine_learning/smo.py rename to machine_learning/sequential_minimum_optimization.py From fe2832d3e42e2fa88f1460b78473ae94d82f3205 Mon Sep 17 00:00:00 2001 From: BAKEZQ Date: Tue, 17 Sep 2019 07:32:52 +0800 Subject: [PATCH 7/8] Update doc and simplify the code. Fix filename typo error in doc. Use ternary conditional operator in predict() --- machine_learning/sequential_minimum_optimization.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/machine_learning/sequential_minimum_optimization.py b/machine_learning/sequential_minimum_optimization.py index 7b853804d2df..926f88564f56 100644 --- a/machine_learning/sequential_minimum_optimization.py +++ b/machine_learning/sequential_minimum_optimization.py @@ -13,9 +13,9 @@ Usage: Command: - python3 smo.py + python3 sequential_minimum_optimization.py Code: - from smo import SmoSVM, Kernel + from sequential_minimum_optimization import SmoSVM, Kernel kernel = Kernel(kernel='poly', degree=3., coef0=1., gamma=0.5) init_alphas = np.zeros(train.shape[0]) @@ -119,7 +119,7 @@ def fit(self): def predict(self, test_samples, classify=True): if test_samples.shape[1] > self.samples.shape[1]: - raise ValueError("Test samples' feature length not equal to train samples") + raise ValueError("Test samples' feature length does not equal to that of train samples") if self._auto_norm: test_samples = self._norm(test_samples) @@ -128,10 +128,7 @@ def predict(self, test_samples, classify=True): for test_sample in test_samples: result = self._predict(test_sample) if classify: - if result > 0: - results.append(1) - else: - results.append(-1) + results.append(1 if result > 0 else -1) else: results.append(result) return np.array(results) From ac844613ed85cb924d584ac20aeb1938d7be889e Mon Sep 17 00:00:00 2001 From: BAKEZQ Date: Tue, 17 Sep 2019 07:43:59 +0800 Subject: [PATCH 8/8] Update doc. --- machine_learning/sequential_minimum_optimization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/sequential_minimum_optimization.py b/machine_learning/sequential_minimum_optimization.py index 926f88564f56..0b5d788e92e1 100644 --- a/machine_learning/sequential_minimum_optimization.py +++ b/machine_learning/sequential_minimum_optimization.py @@ -7,9 +7,9 @@ It was invented by John Platt in 1998. 
 Input:
-    0: pandas dataframe
-    1: first column of dataframe must be tags of samples,should be 1 or -1.
-    2: rows of dataframe represent samples
+    0: type: numpy.ndarray.
+    1: the first column of the ndarray must be the sample tags, either 1 or -1.
+    2: each row of the ndarray represents a sample.
 
 Usage:
     Command:
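For reference, here is a short end-to-end sketch of how the final module can be used once all eight patches are applied. It is illustrative only: the blob data and parameter values are assumptions, and it presumes the interpreter is run from the machine_learning directory; the SmoSVM and Kernel signatures follow the code added in the patches above.

import numpy as np
from sklearn.datasets import make_blobs

from sequential_minimum_optimization import Kernel, SmoSVM

# Build a small training set: column 0 holds the tag (1 or -1), the remaining columns are features.
x, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
y[y == 0] = -1
train = np.hstack((y.reshape(-1, 1), x))

# An rbf kernel with the same parameters the demonstration tests above use.
kernel = Kernel(kernel='rbf', degree=5, coef0=1, gamma=0.5)
svm = SmoSVM(train=train, kernel_func=kernel, cost=0.4, b=0.0, tolerance=0.001)
svm.fit()

predictions = svm.predict(train[:, 1:])                  # array of 1 / -1 tags
raw_scores = svm.predict(train[:, 1:], classify=False)   # signed decision values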