|
| 1 | +/** |
| 2 | + * \file |
| 3 | + * \brief [Adaptive Linear Neuron |
| 4 | + * (ADALINE)](https://en.wikipedia.org/wiki/ADALINE) implementation |
| 5 | + * |
| 6 | + * <img |
| 7 | + * src="https://upload.wikimedia.org/wikipedia/commons/b/be/Adaline_flow_chart.gif" |
| 8 | + * width="200px"> |
| 9 | + * [source](https://commons.wikimedia.org/wiki/File:Adaline_flow_chart.gif) |
 * ADALINE is one of the first and simplest single-layer artificial neural
 * networks. The algorithm essentially implements a linear function
| 12 | + * \f[ f\left(x_0,x_1,x_2,\ldots\right) = |
| 13 | + * \sum_j x_jw_j+\theta |
| 14 | + * \f] |
| 15 | + * where \f$x_j\f$ are the input features of a sample, \f$w_j\f$ are the |
| 16 | + * coefficients of the linear function and \f$\theta\f$ is a constant. If we |
| 17 | + * know the \f$w_j\f$, then for any given set of features, \f$y\f$ can be |
| 18 | + * computed. Computing the \f$w_j\f$ is a supervised learning algorithm wherein |
| 19 | + * a set of features and their corresponding outputs are given and weights are |
| 20 | + * computed using stochastic gradient descent method. |
| 21 | + */ |
| 22 | + |
#include <cassert>
#include <climits>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
| 30 | + |
constexpr int MAX_ITER = 500;  ///< Maximum number of iterations to learn
| 32 | + |
/**
 * \brief ADALINE model: a single artificial neuron whose weights are learned
 * with the stochastic gradient descent (delta) rule.
 */
class adaline {
 public:
    /**
     * Default constructor
     * \param[in] num_features number of features present
     * \param[in] eta learning rate (optional, default=0.01)
     * \param[in] accuracy convergence accuracy (optional,
     * default=\f$1\times10^{-5}\f$)
     */
    explicit adaline(int num_features, const double eta = 0.01,
                     const double accuracy = 1e-5)
        : eta(eta), accuracy(accuracy) {
        if (eta <= 0) {
            std::cerr << "learning rate should be positive and nonzero"
                      << std::endl;
            std::exit(EXIT_FAILURE);
        }

        weights = std::vector<double>(
            num_features +
            1);  // additional weight is for the constant bias term

        // initialize with random weights in the range [-50, 49]
        for (double &weight : weights)
            weight = static_cast<double>(std::rand() % 100) - 50;
    }

    /**
     * Operator to print the weights of the model
     */
    friend std::ostream &operator<<(std::ostream &out, const adaline &ada) {
        out << "<";
        for (std::size_t i = 0; i < ada.weights.size(); i++) {
            out << ada.weights[i];
            if (i < ada.weights.size() - 1)
                out << ", ";
        }
        out << ">";
        return out;
    }

    /**
     * predict the output of the model for given set of features
     * \param[in] x input vector
     * \returns +1 or -1 from the quantizer, or 0 if the size of \p x does
     * not match the model dimension
     */
    int predict(const std::vector<double> &x) const {
        if (!check_size_match(x))
            return 0;

        double y = weights.back();  // assign bias value

        for (std::size_t i = 0; i < x.size(); i++) y += x[i] * weights[i];

        return y >= 0 ? 1 : -1;  // quantizer: apply ADALINE threshold function
    }

    /**
     * Update the weights of the model using supervised learning for one
     * feature vector
     * \param[in] x feature vector
     * \param[in] y known output value
     * \returns correction factor applied to every weight (0 on size mismatch)
     */
    double fit(const std::vector<double> &x, const int &y) {
        if (!check_size_match(x))
            return 0;

        /* output of the model with current weights */
        int p = predict(x);
        int prediction_error = y - p;  // error in estimation
        double correction_factor = eta * prediction_error;

        /* update each weight; the last weight is the bias term */
        for (std::size_t i = 0; i < x.size(); i++) {
            weights[i] += correction_factor * x[i];
        }
        weights[x.size()] += correction_factor;  // update bias

        return correction_factor;
    }

    /**
     * Update the weights of the model using supervised learning for an array
     * of feature vectors.
     * \param[in] X array of feature vectors
     * \param[in] y known output value for each feature vector
     */
    template <int N>
    void fit(std::vector<double> const (&X)[N], const int *y) {
        // maximum number of learning epochs; keep in sync with the
        // file-level MAX_ITER setting
        constexpr int max_iter = 500;

        double avg_pred_error = 1.0;

        int iter;
        for (iter = 0; (iter < max_iter) && (avg_pred_error > accuracy);
             iter++) {
            avg_pred_error = 0.0;

            // perform fit for each sample
            for (int i = 0; i < N; i++) {
                double err = fit(X[i], y[i]);
                avg_pred_error += std::abs(err);
            }
            avg_pred_error /= N;

            // report training progress for every iteration
            std::cout << "\tIter " << iter << ": Training weights: " << *this
                      << "\tAvg error: " << avg_pred_error << std::endl;
        }

        if (iter < max_iter)
            std::cout << "Converged after " << iter << " iterations."
                      << std::endl;
        else
            std::cout << "Did not converge after " << iter << " iterations."
                      << std::endl;
    }

 private:
    /**
     * convenience function to check if the input feature vector size matches
     * the model weights size
     * \param[in] x feature vector to check
     * \returns `true` size matches
     * \returns `false` size does not match
     */
    bool check_size_match(const std::vector<double> &x) const {
        if (x.size() != (weights.size() - 1)) {
            std::cerr << __func__ << ": "
                      << "Number of features in x does not match the feature "
                         "dimension in model!"
                      << std::endl;
            return false;
        }
        return true;
    }

    const double eta;             ///< learning rate of the algorithm
    const double accuracy;        ///< model fit convergence accuracy
    std::vector<double> weights;  ///< weights of the neural network
};
| 174 | + |
| 175 | +/** |
| 176 | + * test function to predict points in a 2D coordinate system above the line |
| 177 | + * \f$x=y\f$ as +1 and others as -1. |
| 178 | + * Note that each point is defined by 2 values or 2 features. |
| 179 | + * \param[in] eta learning rate (optional, default=0.01) |
| 180 | + */ |
| 181 | +void test1(double eta = 0.01) { |
| 182 | + adaline ada(2, eta); // 2 features |
| 183 | + |
| 184 | + const int N = 10; // number of sample points |
| 185 | + |
| 186 | + std::vector<double> X[N] = {{0, 1}, {1, -2}, {2, 3}, {3, -1}, |
| 187 | + {4, 1}, {6, -5}, {-7, -3}, {-8, 5}, |
| 188 | + {-9, 2}, {-10, -15}}; |
| 189 | + int y[] = {1, -1, 1, -1, -1, -1, 1, 1, 1, -1}; // corresponding y-values |
| 190 | + |
| 191 | + std::cout << "------- Test 1 -------" << std::endl; |
| 192 | + std::cout << "Model before fit: " << ada << std::endl; |
| 193 | + |
| 194 | + ada.fit(X, y); |
| 195 | + std::cout << "Model after fit: " << ada << std::endl; |
| 196 | + |
| 197 | + int predict = ada.predict({5, -3}); |
| 198 | + std::cout << "Predict for x=(5,-3): " << predict; |
| 199 | + assert(predict == -1); |
| 200 | + std::cout << " ...passed" << std::endl; |
| 201 | + |
| 202 | + predict = ada.predict({5, 8}); |
| 203 | + std::cout << "Predict for x=(5,8): " << predict; |
| 204 | + assert(predict == 1); |
| 205 | + std::cout << " ...passed" << std::endl; |
| 206 | +} |
| 207 | + |
| 208 | +/** |
| 209 | + * test function to predict points in a 2D coordinate system above the line |
| 210 | + * \f$x+y=-1\f$ as +1 and others as -1. |
| 211 | + * Note that each point is defined by 2 values or 2 features. |
| 212 | + * The function will create random sample points for training and test purposes. |
| 213 | + * \param[in] eta learning rate (optional, default=0.01) |
| 214 | + */ |
| 215 | +void test2(double eta = 0.01) { |
| 216 | + adaline ada(2, eta); // 2 features |
| 217 | + |
| 218 | + const int N = 50; // number of sample points |
| 219 | + |
| 220 | + std::vector<double> X[N]; |
| 221 | + int Y[N]; // corresponding y-values |
| 222 | + |
| 223 | + int range = 500; // sample points range |
| 224 | + int range2 = range >> 1; |
| 225 | + for (int i = 0; i < N; i++) { |
| 226 | + double x0 = ((std::rand() % range) - range2) / 100.f; |
| 227 | + double x1 = ((std::rand() % range) - range2) / 100.f; |
| 228 | + X[i] = {x0, x1}; |
| 229 | + Y[i] = (x0 + x1) > -1 ? 1 : -1; |
| 230 | + } |
| 231 | + |
| 232 | + std::cout << "------- Test 1 -------" << std::endl; |
| 233 | + std::cout << "Model before fit: " << ada << std::endl; |
| 234 | + |
| 235 | + ada.fit(X, Y); |
| 236 | + std::cout << "Model after fit: " << ada << std::endl; |
| 237 | + |
| 238 | + int N_test_cases = 5; |
| 239 | + for (int i = 0; i < N_test_cases; i++) { |
| 240 | + double x0 = ((std::rand() % range) - range2) / 100.f; |
| 241 | + double x1 = ((std::rand() % range) - range2) / 100.f; |
| 242 | + |
| 243 | + int predict = ada.predict({x0, x1}); |
| 244 | + |
| 245 | + std::cout << "Predict for x=(" << x0 << "," << x1 << "): " << predict; |
| 246 | + |
| 247 | + int expected_val = (x0 + x1) > -1 ? 1 : -1; |
| 248 | + assert(predict == expected_val); |
| 249 | + std::cout << " ...passed" << std::endl; |
| 250 | + } |
| 251 | +} |
| 252 | + |
| 253 | +/** Main function */ |
| 254 | +int main(int argc, char **argv) { |
| 255 | + std::srand(std::time(nullptr)); // initialize random number generator |
| 256 | + |
| 257 | + double eta = 0.2; // default value of eta |
| 258 | + if (argc == 2) // read eta value from commandline argument if present |
| 259 | + eta = strtof(argv[1], nullptr); |
| 260 | + |
| 261 | + test1(eta); |
| 262 | + |
| 263 | + std::cout << "Press ENTER to continue..." << std::endl; |
| 264 | + std::cin.get(); |
| 265 | + |
| 266 | + test2(eta); |
| 267 | + |
| 268 | + return 0; |
| 269 | +} |
0 commit comments