diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66c6aa1..19049ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ target_sources(afml
   src/nn/Modules/Module.cpp
   src/nn/Modules/Dropout.cpp
   src/nn/Init.cpp
+  src/optim/Optimizers.cpp
   )
 
 target_include_directories(afml
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b1e2404..11e8d14 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,6 +13,6 @@ endfunction(build_example)
 # build_example(Activations.cpp)
 # build_example(FFNet.cpp)
 # build_example(Node.cpp)
-build_example(perceptron.cpp)
+build_example(xor.cpp)
 # build_example(Weights.cpp)
 build_example(autograd.cpp)
diff --git a/examples/perceptron.cpp b/examples/xor.cpp
similarity index 61%
rename from examples/perceptron.cpp
rename to examples/xor.cpp
index dd3e3d3..535bef1 100644
--- a/examples/perceptron.cpp
+++ b/examples/xor.cpp
@@ -9,16 +9,29 @@
 
 #include <af/autograd.h>
 #include <af/nn.h>
+#include <af/optim.h>
+
+#include <string>
+#include <memory>
 
 using namespace af;
 using namespace af::nn;
 using namespace af::autograd;
 
-int main()
+int main(int argc, const char **args)
 {
+    int optim_mode = 0;
+    std::string optimizer_arg = std::string(args[1]);
+    if (optimizer_arg == "--adam") {
+        optim_mode = 1;
+    } else if (optimizer_arg == "--rmsprop") {
+        optim_mode = 2;
+    }
+
     const int inputSize = 2;
     const int outputSize = 1;
-    const double lr = 0.1;
+    const double lr = 0.01;
+    const double mu = 0.1;
     const int numSamples = 4;
 
     float hInput[] = {1, 1,
@@ -34,24 +47,35 @@ int main()
     auto in = af::array(inputSize, numSamples, hInput);
     auto out = af::array(outputSize, numSamples, hOutput);
 
-    nn::Sequential perceptron;
+    nn::Sequential model;
 
-    perceptron.add(nn::Linear(inputSize, outputSize));
-    perceptron.add(nn::Sigmoid());
+    model.add(nn::Linear(inputSize, outputSize));
+    model.add(nn::Sigmoid());
 
     auto loss = nn::MeanSquaredError();
 
+    std::unique_ptr<optim::Optimizer> optim;
+
+    if (optimizer_arg == "--rmsprop") {
+        optim = std::unique_ptr<optim::Optimizer>(new optim::RMSPropOptimizer(model.parameters(), lr));
+    } else if (optimizer_arg == "--adam") {
+        optim = std::unique_ptr<optim::Optimizer>(new optim::AdamOptimizer(model.parameters(), lr));
+    } else {
+        optim = std::unique_ptr<optim::Optimizer>(new optim::SGDOptimizer(model.parameters(), lr, mu));
+    }
+
     Variable result, l;
     for (int i = 0; i < 1000; i++) {
         for (int j = 0; j < numSamples; j++) {
-            perceptron.train();
-            perceptron.zeroGrad();
+
+            model.train();
+            optim->zeroGrad();
 
             af::array in_j = in(af::span, j);
             af::array out_j = out(af::span, j);
 
             // Forward propagation
-            result = perceptron(nn::input(in_j));
+            result = model(nn::input(in_j));
 
             // Calculate loss
             l = loss(result, nn::noGrad(out_j));
@@ -60,18 +84,14 @@ int main()
             l.backward();
 
             // Update parameters
-            // TODO: Should use optimizer
-            for (auto &param : perceptron.parameters()) {
-                param.array() -= lr * param.grad().array();
-                param.array().eval();
-            }
+            optim->update();
         }
 
         if ((i + 1) % 100 == 0) {
-            perceptron.eval();
+            model.eval();
 
             // Forward propagation
-            result = perceptron(nn::input(in));
+            result = model(nn::input(in));
 
             // Calculate loss
             // TODO: Use loss function
diff --git a/include/af/autograd/Variable.hpp b/include/af/autograd/Variable.hpp
index db2894b..176512a 100644
--- a/include/af/autograd/Variable.hpp
+++ b/include/af/autograd/Variable.hpp
@@ -9,14 +9,14 @@
 
 #pragma once
 
+#include <arrayfire.h>
+
 #include <memory>
 #include <vector>
 #include <functional>
 #include <unordered_map>
 #include <string>
 
-#include <arrayfire.h>
-
 namespace af {
     namespace autograd {
         class Variable
@@ -62,6 +62,8 @@ namespace af {
 
             af::dim4 dims() const;
 
+            af::dtype type() const;
+
             void zeroGrad();
 
             void setCalcGrad(bool calc_grad);
diff --git a/include/af/nn/Modules/Container.hpp b/include/af/nn/Modules/Container.hpp
index 5ded60c..86f2eab 100644
--- a/include/af/nn/Modules/Container.hpp
+++ b/include/af/nn/Modules/Container.hpp
@@ -8,10 +8,11 @@
  ********************************************************/
 #pragma once
 
-#include <af/nn/Modules/Module.hpp>
 #include <memory>
 #include <vector>
 
+#include <af/nn/Modules/Module.hpp>
+
 namespace af
 {
     namespace nn
diff --git a/include/af/nn/Modules/Module.hpp b/include/af/nn/Modules/Module.hpp
index 9666ee3..0f0f973 100644
--- a/include/af/nn/Modules/Module.hpp
+++ b/include/af/nn/Modules/Module.hpp
@@ -8,11 +8,11 @@
  ********************************************************/
 #pragma once
 
+#include <af/autograd/Variable.hpp>
+
 #include <string>
 #include <vector>
 
-#include <af/autograd/Variable.hpp>
-
 namespace af
 {
     namespace nn
@@ -35,8 +35,6 @@ namespace af
 
             std::vector<autograd::Variable> parameters();
 
-            void zeroGrad();
-
             void train();
 
             void eval();
diff --git a/include/af/optim.h b/include/af/optim.h
new file mode 100644
index 0000000..9b75ed3
--- /dev/null
+++ b/include/af/optim.h
@@ -0,0 +1,9 @@
+/*******************************************************
+ * Copyright (c) 2017, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+#include <af/optim/Optimizers.hpp>
diff --git a/include/af/optim/Optimizers.hpp b/include/af/optim/Optimizers.hpp
new file mode 100644
index 0000000..af0289d
--- /dev/null
+++ b/include/af/optim/Optimizers.hpp
@@ -0,0 +1,90 @@
+/*******************************************************
+ * Copyright (c) 2017, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+
+#include <arrayfire.h>
+#include <af/autograd/Variable.hpp>
+
+#include <vector>
+
+namespace af
+{
+    namespace optim
+    {
+
+        class Optimizer
+        {
+        protected:
+            std::vector<autograd::Variable> m_parameters;
+        public:
+
+            Optimizer(const std::vector<autograd::Variable> &parameters);
+
+            virtual void update() = 0;
+
+            void zeroGrad();
+        };
+
+        class SGDOptimizer : public Optimizer
+        {
+            bool m_use_nesterov;
+            double m_lr;
+            double m_mu;
+            double m_wd;
+            std::vector<af::array> m_velocities;
+        public:
+            SGDOptimizer(const std::vector<autograd::Variable> &parameters,
+                         double learning_rate, double momentum = 0,
+                         double weight_decay = 0,
+                         bool use_nesterov = false);
+            void update();
+        };
+
+        class AdamOptimizer : public Optimizer
+        {
+            double m_lr;
+            double m_beta1;
+            double m_beta2;
+            double m_eps;
+            double m_wd;
+            int m_count;
+            std::vector<af::array> m_biased_first;
+            std::vector<af::array> m_biased_second;
+        public:
+            AdamOptimizer(const std::vector<autograd::Variable> &parameters,
+                          double learning_rate,
+                          double beta1 = 0.9,
+                          double beta2 = 0.999,
+                          double epsilon = 1E-8,
+                          double weight_decay = 0);
+            void update();
+        };
+
+        class RMSPropOptimizer : public Optimizer
+        {
+            bool m_use_first;
+            double m_lr;
+            double m_rho;
+            double m_eps;
+            double m_wd;
+            std::vector<af::array> m_first;
+            std::vector<af::array> m_second;
+        public:
+            RMSPropOptimizer(const std::vector<autograd::Variable> &parameters,
+                             double learning_rate,
+                             double rho = 0.99,
+                             double epsilon = 1E-8,
+                             double weight_decay = 0,
+                             bool use_first = false);
+            void update();
+        };
+
+    }
+}
diff --git a/src/autograd/Variable.cpp b/src/autograd/Variable.cpp
index 61e147b..abc109d 100644
--- a/src/autograd/Variable.cpp
+++ b/src/autograd/Variable.cpp
@@ -107,6 +107,11 @@ namespace af {
             return m_shared->m_data.dims();
         }
 
+        af::dtype Variable::type() const
+        {
+            return m_shared->m_data.type();
+        }
+
         void Variable::zeroGrad()
         {
             m_shared->m_grads.clear();
@@ -144,14 +149,7 @@ namespace af {
                 m_shared->m_grads.resize(1);
             }
 
-            // Remove the graph if not needed
-            if (!retain_grad_graph) {
-                // This can be done by extracting af::array and ignoring everything else
-                auto grad_data = grad.array();
-                // Since there's no graph leading this, set calc_grad to false
-                grad = Variable(grad_data, false);
-            }
-
+            grad.setCalcGrad(retain_grad_graph);
             m_shared->m_grads[0] = grad;
         }
diff --git a/src/nn/Modules/Module.cpp b/src/nn/Modules/Module.cpp
index c769a02..b79e001 100644
--- a/src/nn/Modules/Module.cpp
+++ b/src/nn/Modules/Module.cpp
@@ -54,13 +54,6 @@ namespace af
             return m_parameters;
         }
 
-        void Module::zeroGrad()
-        {
-            for (auto &parameter : m_parameters) {
-                parameter.zeroGrad();
-            }
-        }
-
         Variable Module::operator()(const Variable &input)
         {
             return this->forward(input);
diff --git a/src/optim/Optimizers.cpp b/src/optim/Optimizers.cpp
new file mode 100644
index 0000000..d9d848c
--- /dev/null
+++ b/src/optim/Optimizers.cpp
@@ -0,0 +1,211 @@
+/*******************************************************
+ * Copyright (c) 2017, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/optim/Optimizers.hpp>
+
+#include <cmath>
+
+using af::autograd::Variable;
+using std::vector;
+
+// References:
+// SGD and Momentum: http://cs231n.github.io/neural-networks-3/#sgd
+// Adam: https://arxiv.org/pdf/1412.6980.pdf
+// RMSProp: https://arxiv.org/pdf/1308.0850v5.pdf
+
+// Comparison between various update rules:
+// https://www.quora.com/What-are-differences-between-update-rules-like-AdaDelta-RMSProp-AdaGrad-and-AdaM
+
+namespace af
+{
+    namespace optim
+    {
+        Optimizer::Optimizer(const vector<Variable> &parameters)
+            : m_parameters(parameters.begin(), parameters.end())
+        {
+        }
+
+        void Optimizer::zeroGrad()
+        {
+            for (auto &parameter : m_parameters) {
+                parameter.zeroGrad();
+            }
+        }
+
+        SGDOptimizer::SGDOptimizer(const vector<Variable> &parameters,
+                                   double learning_rate, double momentum,
+                                   double weight_decay, bool use_nesterov)
+            : Optimizer(parameters),
+              m_use_nesterov(use_nesterov),
+              m_lr(learning_rate),
+              m_mu(momentum),
+              m_wd(weight_decay),
+              m_velocities()
+        {
+            if (momentum != 0) {
+                m_velocities.reserve(parameters.size());
+                for (const auto &parameter : m_parameters) {
+                    m_velocities.push_back(af::constant(0, parameter.dims(), parameter.type()));
+                    m_velocities.back().eval();
+                }
+            }
+        }
+
+        void SGDOptimizer::update()
+        {
+            for (size_t i = 0; i < m_parameters.size(); i++) {
+
+                const af::array &grad = m_parameters[i].grad().array();
+                af::array &data = m_parameters[i].array();
+
+                if (m_wd != 0) {
+                    // Weight decay term
+                    data = data - m_wd * data;
+                }
+
+                if (m_mu != 0) {
+                    af::array &velocity = m_velocities[i];
+
+                    // Regular momentum
+                    velocity = m_mu * velocity - m_lr * grad;
+                    if (m_use_nesterov) {
+                        // Update for nesterov momentum
+                        data = data + velocity * m_mu - m_lr * grad;
+                    } else {
+                        data = data + velocity;
+                    }
+
+                    af::eval(velocity, data);
+                } else {
+
+                    data = data - m_lr * grad;
+                    af::eval(data);
+                }
+            }
+        }
+
+
+        AdamOptimizer::AdamOptimizer(const vector<Variable> &parameters,
+                                     double learning_rate,
+                                     double beta1, double beta2,
+                                     double epsilon, double weight_decay)
+            : Optimizer(parameters),
+              m_lr(learning_rate),
+              m_beta1(beta1),
+              m_beta2(beta2),
+              m_eps(epsilon),
+              m_wd(weight_decay),
+              m_count(0),
+              m_biased_first(),
+              m_biased_second()
+        {
+            m_biased_first.reserve(parameters.size());
+            m_biased_second.reserve(parameters.size());
+
+            for (const auto &parameter : m_parameters) {
+                m_biased_first.push_back(af::constant(0, parameter.dims(), parameter.type()));
+                m_biased_second.push_back(af::constant(0, parameter.dims(), parameter.type()));
+
+                m_biased_first.back().eval();
+                m_biased_second.back().eval();
+            }
+        }
+
+        void AdamOptimizer::update()
+        {
+            for (size_t i = 0; i < m_parameters.size(); i++) {
+                const af::array &grad = m_parameters[i].grad().array();
+                af::array &data = m_parameters[i].array();
+
+                if (m_wd != 0) {
+                    // Weight decay term
+                    data = data - m_wd * data;
+                }
+
+                af::array &biased_first = m_biased_first[i];
+                af::array &biased_second = m_biased_second[i];
+
+                biased_first = m_beta1 * biased_first + (1 - m_beta1) * grad;
+                biased_second = m_beta2 * biased_second + (1 - m_beta2) * grad * grad;
+
+                m_count++;
+
+                double corrected_bias1 = 1 - std::pow(m_beta1, m_count);
+                double corrected_bias2 = 1 - std::pow(m_beta2, m_count);
+                double corrected_lr = m_lr * std::sqrt(corrected_bias2) / corrected_bias1;
+
+                data = data - (corrected_lr * biased_first) / (af::sqrt(biased_second) + m_eps);
+
+                af::eval(data, biased_first, biased_second);
+            }
+        }
+
+        RMSPropOptimizer::RMSPropOptimizer(const vector<Variable> &parameters,
+                                           double learning_rate,
+                                           double rho,
+                                           double epsilon,
+                                           double weight_decay,
+                                           bool use_first)
+            : Optimizer(parameters),
+              m_use_first(use_first),
+              m_lr(learning_rate),
+              m_rho(rho),
+              m_eps(epsilon),
+              m_wd(weight_decay),
+              m_first(),
+              m_second()
+        {
+            if (m_use_first) m_first.reserve(parameters.size());
+            m_second.reserve(parameters.size());
+
+            for (const auto &parameter : m_parameters) {
+                if (m_use_first) {
+                    m_first.push_back(af::constant(0, parameter.dims(), parameter.type()));
+                    m_first.back().eval();
+                }
+
+                m_second.push_back(af::constant(0, parameter.dims(), parameter.type()));
+                m_second.back().eval();
+            }
+        }
+
+        void RMSPropOptimizer::update()
+        {
+            for (size_t i = 0; i < m_parameters.size(); i++) {
+                const af::array &grad = m_parameters[i].grad().array();
+                af::array &data = m_parameters[i].array();
+
+                if (m_wd != 0) {
+                    // Weight decay term
+                    data = data - m_wd * data;
+                }
+
+                af::array &second = m_second[i];
+                second = m_rho * second + (1 - m_rho) * grad * grad;
+
+                // Create shallow copy of second so that we don't update "second" below
+                af::array moments = second;
+                if (m_use_first) {
+                    af::array &first = m_first[i];
+                    first = m_rho * first + (1 - m_rho) * grad;
+                    moments = moments - first * first;
+                }
+
+                data = data - (m_lr * grad) / (af::sqrt(moments) + m_eps);
+
+                if (m_use_first) {
+                    af::array &first = m_first[i];
+                    af::eval(data, first, second);
+                } else {
+                    af::eval(data, second);
+                }
+            }
+        }
+    }
+}
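
Usage note (not part of the patch): the snippet below is a minimal sketch of how the optim::Optimizer interface introduced in this diff is meant to be driven, mirroring the training loop in examples/xor.cpp above. The umbrella headers and the tiny single-sample dataset are illustrative assumptions; AdamOptimizer or RMSPropOptimizer can be substituted exactly as xor.cpp does.

// Minimal sketch: construct an optimizer over a model's parameters and run a training step.
#include <af/autograd.h>
#include <af/nn.h>
#include <af/optim.h>

using namespace af;

int main()
{
    // Same small model as examples/xor.cpp
    nn::Sequential model;
    model.add(nn::Linear(2, 1));
    model.add(nn::Sigmoid());

    auto loss = nn::MeanSquaredError();

    // SGD with momentum, matching the default branch of xor.cpp
    optim::SGDOptimizer sgd(model.parameters(), 0.01 /*learning_rate*/, 0.1 /*momentum*/);

    // Illustrative one-sample dataset
    float hIn[]  = {1, 0};
    float hOut[] = {1};
    auto in  = af::array(2, 1, hIn);
    auto out = af::array(1, 1, hOut);

    for (int i = 0; i < 100; i++) {
        model.train();
        sgd.zeroGrad();                          // clear gradients held by the parameters

        auto result = model(nn::input(in));      // forward pass
        auto l = loss(result, nn::noGrad(out));  // mean-squared-error loss
        l.backward();                            // accumulate gradients
        sgd.update();                            // apply the SGD + momentum step
    }
    return 0;
}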