
Gradient Descent Algorithm (updated 2020/12/04)

Tags: Algorithms, Math, C/C++, MatLab, Python

Gradient descent, also called the method of steepest descent, is a local search algorithm in the family of heuristic methods such as hill climbing and simulated annealing. Its basic idea comes from unconstrained optimization: the gradient points in the direction in which the function value changes fastest, so by repeatedly stepping along the negative gradient the method iterates toward an extremum.
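Concretely, starting from an initial point \\( \mathbf{x}_0 \\), each iteration takes a step of size \\( \lambda_k \\) against the gradient,

$$ \mathbf{x}_{k+1} = \mathbf{x}_k - \lambda_k \nabla f(\mathbf{x}_k), \qquad k = 0, 1, 2, \ldots $$

and the process stops once the gradient norm \\( \lVert \nabla f(\mathbf{x}_k) \rVert \\) falls below a tolerance or a maximum number of iterations is reached, which is exactly the stopping rule used in the implementations below.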

When I find the time I will add the derivation and the convergence conditions of the gradient method, and perhaps introduce other algorithms for unconstrained problems (Newton's method, conjugate gradient, etc.) as well as algorithms for constrained problems. Along the way we will meet several interesting line-search methods, such as the Fibonacci search and the golden-section (\\( 0.618 \\)) method.

Code Implementation

To use it, define a multivariate function \\( g \\) and call it as follows:

(gradient-descent g init-arguments max-iterations tolerance)

Example: \\( f(x_1, x_2) = \exp(x_1^2 + x_2^2) \\)

(gradient-descent
  (lambda (vec)
    (exp (+ (square (list-ref vec 0)) (square (list-ref vec 1)))))
  (list 1.0 1.0)
  30 0.00001)

Scheme

;; step used for numerical (forward-difference) derivatives
(define delta 0.00001)

;; numerical derivative of a one-argument function g
(define (deriv g)
  (lambda (x)
    (/ (- (g (+ x delta)) (g x))
       delta)))

(define (newton-transform g)
  (lambda (x)
    (- x (/ (g x) ((deriv g) x)))))

(define (fixed-point g first-guess)
  (define (close-enough? v1 v2)
    (< (abs (- v1 v2)) delta))
  (define (try guess)
    (let ((next (g guess)))
      (if (close-enough? guess next)
          guess
          (try next))))
  (try first-guess))

;; find a root of g starting from guess
(define (newton-method g guess)
  (fixed-point (newton-transform g) guess))

;; minimize a one-argument function by finding a zero of its derivative
(define (minimized g)
  (newton-method (deriv g) 1.0))

(define (gradient-descent g init-vec max-it tolerance)
  (define len (- (length init-vec) 1))
  ;; forward-difference partial derivative of g with respect to one coordinate
  (define (partial-derivative g)
    (lambda (vec index)
      (let ((d-vec (list-copy vec)))
        (list-set! d-vec index (+ (list-ref d-vec index) delta))
        (/ (- (g d-vec) (g vec)) delta))))
  ;; numerical gradient of g at vec
  (define (gradient-vector g vec)
    (define (iter i res)
      (if (< i 0)
          res
          (iter (- i 1) (cons ((partial-derivative g) vec i) res))))
    (iter len '()))
  ;; Euclidean norm of vec
  (define (distance vec)
    (define (iter i result)
      (if (< i 0)
          result
          (iter (- i 1) (+ result (square (list-ref vec i))))))
    (sqrt (iter len 0)))
  ;; point reached from vec by a step of size x along the negative gradient
  (define (next vec x grad)
    (define (iter i res)
      (if (< i 0)
          res
          (iter (- i 1) (cons (- (list-ref vec i) (* x (list-ref grad i))) res))))
    (iter len '()))
  (define (x-transform vec x grad)
    (next vec x grad))
  ;; main loop: stop after max-it iterations or when the gradient is small enough
  (define (iter vec i)
    (let ((grad-vec (gradient-vector g vec)))
      (display vec)
      (newline)
      (cond ((< max-it i) vec)
            ((< (distance grad-vec) tolerance) vec)
            (else (let ((minimum (minimized
                                  (lambda (x)
                                    (g (x-transform vec x grad-vec))))))
                    (iter (next vec minimum grad-vec) (+ i 1)))))))
  (iter init-vec 0))
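A note on the step size: rather than a fixed learning rate, `gradient-descent` chooses \\( \lambda \\) anew at every iteration by a one-dimensional minimization along the negative gradient,

$$ \lambda_k = \arg\min_{\lambda}\ \varphi(\lambda), \qquad \varphi(\lambda) = g\big(\mathbf{x}_k - \lambda \nabla g(\mathbf{x}_k)\big), $$

which `minimized` carries out by applying Newton's method to the (numerical) derivative of \\( \varphi \\).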

C++

#include <cmath>
#include <iostream>
#include <vector>
#include <cassert>
#include <functional>

const double h = 1e-5;

auto square(double x) -> double { return x * x; }
auto cube(double x) -> double { return x * x * x; }

template<class Fn_Ty>
auto partial_derivative(Fn_Ty g) {
    auto a = [g](const auto& vec, size_t i)
    {
        auto components = vec;
        components[i] += h;
        return (g(components) - g(vec)) / h;
    };
    return a;
}

template<class Fn_Ty, class Arg>
auto fixed_point(Fn_Ty g, Arg first_guess)
{
    auto close_enough = [](auto v1, auto v2)
    {
        return (std::abs(v1 - v2) < 1e-5);
    };
    auto try_again = [g, close_enough](auto guess, auto& try_ref)
    {
        auto next = g(guess);
        if (close_enough(guess, next)) return next;
        else return try_ref(next, try_ref);
    };
    return try_again(first_guess, try_again);
}

template<class Fn_Ty>
auto deriv(Fn_Ty g)
{
    auto a = [g](auto x)
    {
        return (g(x + h) - g(x)) / h;
    };
    return a;
}

template<class Fn_Ty>
auto newton_transform(Fn_Ty g)
{
    auto a = [g](auto x)
    {
        return x - (g(x) / deriv(g)(x));
    };
    return a;
}

template<class Fn_Ty, class Arg>
auto newton_method(Fn_Ty g, Arg guess)
{
    return fixed_point(newton_transform(g), guess);
}

std::vector<double> operator*(double b, const std::vector<double>& a)
{
    std::vector<double> c(a.size());
    for (size_t i = 0; i < a.size(); i++) c[i] = a[i] * b;
    return c;
}

std::vector<double> operator*(const std::vector<double>& a, double b)
{
    return b * a;
}

std::vector<double> operator-(const std::vector<double>& a, double b)
{
    std::vector<double> c(a.size());
    for (size_t i = 0; i < a.size(); i++) c[i] = a[i] - b;
    return c;
}

std::vector<double> operator-(double b, const std::vector<double>& a)
{
    std::vector<double> c(a.size());
    for (size_t i = 0; i < a.size(); i++) c[i] = b - a[i];
    return c;
}

std::vector<double> operator-(const std::vector<double>& a)
{
    std::vector<double> c(a.size());
    for (size_t i = 0; i < a.size(); i++) c[i] = -a[i];
    return c;
}

void operator+=(std::vector<double>& a, const std::vector<double>& b)
{
    assert(a.size() == b.size());
    for (size_t i = 0; i < a.size(); i++) a[i] += b[i];
}

void operator-=(std::vector<double>& a, const std::vector<double>& b)
{
    assert(a.size() == b.size());
    for (size_t i = 0; i < a.size(); i++) a[i] -= b[i];
}

template<class Goal_FnTy>
class GradientDescent {
public:
    GradientDescent(std::function<Goal_FnTy> g, std::vector<double> init_values, const double error, size_t max_iter = 20) :
        error(error), max_it(max_iter), init_variable(init_values), vec(init_values), goal_fn(g)
    {
    }

    void run(bool output = true)
    {
        size_t step = 0;
        while (step < max_it)
        {
            if (output) step_output(step);

            std::vector<double> gradient_components; // gradient vector
            for (size_t i = 0; i < vec.size(); i++)
                gradient_components.push_back(partial_derivative(goal_fn)(vec, i));

            if (distance(gradient_components) < error) break;

            auto fg = [this, gradient_components](auto lambda)
            {
                std::vector<double> lambda_components;
                for (size_t i = 0; i < vec.size(); i++)
                    lambda_components.push_back(vec[i] - lambda * gradient_components[i]);
                return goal_fn(lambda_components);
            };

            // search for the minimizing step size along the direction with Newton's method
            double min_lambda = minimization(fg);    // minimizer of fg
            vec -= gradient_components * min_lambda; // next iterate
            step += 1;
        }
    }

    // run without printing intermediate steps and return the result
    const std::vector<double>& run_quiet()
    {
        run(false);
        return vec;
    }

    const std::vector<double>& get() const
    {
        return vec;
    }
    
private:
    void step_output(size_t step)
    {
        std::cout << step << " : " << "(";
        for (size_t i = 0; i < vec.size(); i++)
            std::cout << vec[i] << (i + 1 == vec.size() ? "" : ", ");
        std::cout << ")" << std::endl;
    }

    double distance(const std::vector<double>& vec)
    {
        double sum = 0;
        for (auto _ : vec) sum += square(_);
        return std::sqrt(sum);
    }

    template<class Fn_Ty>
    auto minimization(Fn_Ty g)
    {
        return newton_method(deriv(g), 1.0);
    }

    // tolerance on the gradient norm
    const double error;
    // maximum number of iterations
    size_t max_it;
    // initial point
    std::vector<double> init_variable;
    // current iterate (holds the extremum after run())
    std::vector<double> vec;
    // objective function
    std::function<Goal_FnTy> goal_fn;
};

Test

$$ \min\; f(\mathbf{x}) = 2x_1^2 + x_2^2, \qquad \mathbf{x}_0 = (1, 1)^T, \quad \epsilon = 10^{-5} $$

Optimal solution: \\( (0, 0) \\), since the gradient \\( \nabla f(\mathbf{x}) = (4x_1, 2x_2)^T \\) vanishes only at the origin and \\( f \\) is convex.

What the function looks like (surface plot of \\( f(x_1, x_2) = 2x_1^2 + x_2^2 \\)):

Scheme:

(gradient-descent
  (lambda (vec)
    (+ (* 2 (square (list-ref vec 0))) (square (list-ref vec 1))))
  (list 1.0 1.0)
  20 0.00001)

Output:

(1. 1.)
(-.11109809785453906 .44445095107273047)
(.07406415148073814 .0740568166924851)
(-8.226060108502659e-3 3.2915785342297756e-2)
(5.479613029965878e-3 5.473976292833832e-3)
(-6.076458754474259e-4 2.4334748892817517e-3)
(3.998023673891758e-4 3.952652685951883e-4)
(-4.351939188329914e-5 1.760888045421857e-4)
(2.3048444806604486e-5 1.9613203841429633e-5)
(-1.9005725715702401e-6 8.666514023037452e-6)
(-4.13145223743898e-6 3.7481300728847094e-6)
(-4.519310966043615e-6 1.7948473900002134e-6)
(-4.58949997889981e-6 1.2987640626074323e-6)
(-4.602395677437769e-6 1.1998274507463487e-6)
(-4.604772161673363e-6 1.1812992414638758e-6)
(-4.605206860686667e-6 1.1778999304581313e-6)
(-4.605288702309115e-6 1.1772595832695892e-6)
(-4.605301020582233e-6 1.1771631923490466e-6)
(-4.6053101069904185e-6 1.1770920897757949e-6)
(-4.605310439480507e-6 1.1770894879586069e-6)
(-4.6053066942518565e-6 1.177118795311575e-6)
(-4.605302200151839e-6 1.1771539626038494e-6)
;Value: (-4.605302200151839e-6 1.1771539626038494e-6)

C++:

int main()
{
    auto my_func = [](const auto& x)
    {
        return 2. * square(x[0]) + square(x[1]);
    };
    
    GradientDescent<double(const std::vector<double>&)> gd(my_func, {1.0, 1.0}, 1e-5);
    gd.run();
    return 0;
}

Output:

0 : (1, 1)
1 : (-0.111091, 0.444454)
2 : (0.074063, 0.0740505)
3 : (-0.00822527, 0.0329133)
4 : (0.00547914, 0.00547344)
5 : (-0.000607558, 0.00243326)
6 : (0.000399758, 0.0003952)
7 : (-4.35106e-05, 0.000176061)
8 : (2.30431e-05, 1.9607e-05)
9 : (-1.89989e-06, 8.6636e-06)
10 : (-4.13021e-06, 3.74858e-06)
11 : (-4.51828e-06, 1.79693e-06)
12 : (-4.58859e-06, 1.3009e-06)
13 : (-4.60152e-06, 1.20187e-06)
14 : (-4.60391e-06, 1.18329e-06)
15 : (-4.60435e-06, 1.17985e-06)
16 : (-4.60443e-06, 1.17921e-06)
17 : (-4.60445e-06, 1.17909e-06)
18 : (-4.60445e-06, 1.17907e-06)
19 : (-4.60445e-06, 1.17907e-06)

Implementations in Other Languages

Matlab

clc;clear;
figure

f = @(x,y) 3*(1-x).^2.*exp(-(x.^2) - (y+1).^2) ... 
   - 10*(x/5 - x.^3 - y.^5).*exp(-x.^2-y.^2) ... 
   - 1/3*exp(-(x+1).^2 - y.^2);

tolerance = 1e-5;
max_iter = 30;

x0 = 0.;
y0 = 1.5;
x = -2.8:0.001:2.8;
y = x';
X = -2.8:0.05:2.8;
set(gcf, 'Position', get(0,'Screensize'));
surf(X,X,f(X, X'));
xlabel('x');ylabel('y');zlabel('z');
[gx,gy] = gradient(f(x, y), 0.1);   % numerical gradient of f on the fine grid
hold on
filename = 'd:/blog/img/gradient_descent_visualization.gif';
for i=0:1:max_iter
%     syms x y
%     g = gradient(f, [x, y]);
%     g1 = subs(g(1), [x, y], {x0, y0});
%     g2 = subs(g(2), [x, y], {x0, y0});
%     d = -[g1 g2];
    % find the grid indices closest to the current point (x0, y0)
    t = find(((x < x0 + 1e-3) & (x > x0 - 1e-3)) & ((y < y0 + 1e-3) & (y > y0 - 1e-3)));
    d = [gx(t), gy(t)];
    disp(d)
    if isempty(d)
        break;
    elseif(sqrt(d * d') < tolerance)
        break;   
    end
    plot3(x0,y0,f(x0, y0),'r*');
    % one-dimensional line search for the step size
    lambda = fminbnd(@(c) f(x0 - c*d(1), y0 - c*d(2)), -5, 5);
    x0 = x0 - lambda*d(1);
    y0 = y0 - lambda*d(2);
    drawnow
    
    frame = getframe(gcf);
    im = frame2im(frame);
    [imind,cm] = rgb2ind(im,256);
    if i == 0
        imwrite(imind,cm,filename,'gif', 'Loopcount',inf);
    else
        imwrite(imind,cm,filename,'gif','WriteMode','append');
    end
end

Output:

(Animated GIF: gradient descent visualization)

The section below was written some time ago and has not been updated yet.


Python
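This version is specialized to the test problem above: the negative gradient \\( (-4x_1, -2x_2)^T \\) is hard-coded, and the step size is computed in closed form. Minimizing \\( f \\) along a direction \\( \mathbf{d} = (d_1, d_2) \\) from the point \\( (x_1, x_2) \\) gives

$$ \gamma = -\frac{2 d_1 x_1 + d_2 x_2}{2 d_1^2 + d_2^2}, $$

which is exactly what the `evaluate` function below computes.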

# Hard-coded for the test problem f(x_1, x_2) = 2*x_1**2 + x_2**2.

square = lambda x: x ** 2
side = lambda a, b: (a ** 2 + b ** 2) ** (1 / 2)  # Euclidean norm of (a, b)


def d(x_1, x_2):
    # negative gradient of f: -grad f = (-4*x_1, -2*x_2)
    return -4 * x_1, -2 * x_2

def evaluate(x_1, x_2, a, b):
    # closed-form step size minimizing f along the direction (a, b)
    return -(2 * a * x_1 + b * x_2) / (2 * square(a) + square(b))

def gradient_descent(x_1, x_2, epsilon, max_iters):
    iters = 0
    a, b = 1, 1
    while side(a, b) > epsilon and iters < max_iters:
        print('%d (%.6f, %.6f)' % (iters + 1, x_1, x_2))
        a, b = d(x_1, x_2)
        gamma = evaluate(x_1, x_2, a, b)
        (x_1, x_2) = (x_1 + a * gamma, x_2 + b * gamma)
        iters += 1


if __name__ == '__main__':
    gradient_descent(1, 1, 0.001, 100)

Output:

1 (1.000000, 1.000000)
2 (-0.111111, 0.444444)
3 (0.074074, 0.074074)
4 (-0.008230, 0.032922)
5 (0.005487, 0.005487)
6 (-0.000610, 0.002439)
7 (0.000406, 0.000406)
8 (-0.000045, 0.000181)
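For reference, here is a minimal general-purpose Python sketch that mirrors the Scheme/C++ versions above (forward-difference gradient plus a Newton line search). The helper names (`gradient`, `newton_min`) are illustrative, not part of the code above.

import math

H = 1e-5  # forward-difference step


def gradient(g, vec):
    """Numerical gradient of g at vec via forward differences."""
    grads = []
    for i in range(len(vec)):
        bumped = list(vec)
        bumped[i] += H
        grads.append((g(bumped) - g(vec)) / H)
    return grads


def newton_min(phi, guess=1.0, tol=1e-5, max_iter=50):
    """Minimize a 1-D function phi with Newton's method on its numerical derivative."""
    dphi = lambda t: (phi(t + H) - phi(t)) / H
    ddphi = lambda t: (dphi(t + H) - dphi(t)) / H
    t = guess
    for _ in range(max_iter):
        step = dphi(t) / ddphi(t)
        t -= step
        if abs(step) < tol:
            break
    return t


def gradient_descent(g, init_vec, max_iter=20, tol=1e-5):
    vec = list(init_vec)
    for _ in range(max_iter):
        grad = gradient(g, vec)
        if math.sqrt(sum(c * c for c in grad)) < tol:
            break
        # line search: step size along the negative gradient
        lam = newton_min(lambda t: g([v - t * c for v, c in zip(vec, grad)]))
        vec = [v - lam * c for v, c in zip(vec, grad)]
    return vec


if __name__ == '__main__':
    print(gradient_descent(lambda x: 2 * x[0] ** 2 + x[1] ** 2, [1.0, 1.0]))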

References

  1. 最优化理论与算法, 2nd ed., 陈宝林
  2. Gradient descent – Wikipedia
