//! A [Stochastic Gradient Descent with Momentum][1] solver.
//! [1]: https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum
//!
//! Momentum in solving neural networks works similarly to
//! the way it does in physics:
//! if you travel in a direction with high velocity,
//! it becomes very hard to change (or reverse)
//! the direction in which you are moving.
//!
//! Similarly, when adjusting gradients during solving,
//! keeping a portion of the previous gradient update makes solving faster,
//! since gradient updates that keep pointing in the same direction
//! reach the optimum sooner.
//! It also makes solving more stable.
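//!
//! Concretely, the update this solver computes amounts to the following
//! (with `V` the accumulated history/velocity, `lr` the learning rate,
//! and `dW` the current weight gradient; the generic SGD weight step then
//! subtracts the result from the weights):
//!
//! ```text
//! V' = momentum * V + lr * dW
//! W' = W - V'
//! ```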

use crate::co::prelude::*;
use crate::layer::*;
use crate::solver::*;
use crate::solvers::SGDSolver;
use crate::util::*;
use std::rc::Rc;
use std::sync::{Arc, RwLock};

/// Stochastic Gradient Descent with Momentum.
///
/// See [module description][1] for more information.
/// [1]: ./index.html
#[derive(Debug)]
pub struct Momentum<SolverB: IBackend + SolverOps<f32>> {
    /// The gradient update from the previous iteration for each blob.
    history: Vec<ArcLock<SharedTensor<f32>>>,
    /// The backend used for computing the gradient.
    backend: Rc<SolverB>,

    /// Scalar that temporarily holds the learning rate for weight update computations.
    lr: SharedTensor<f32>,
    /// Scalar that temporarily holds momentum for weight update computations
    momentum: SharedTensor<f32>,
}

impl<SolverB: IBackend + SolverOps<f32>> Momentum<SolverB> {
    /// Create a new SGD Momentum solver.
    ///
    /// Should not be called directly.
    /// Use [Solver::from_config][2] instead.
    ///
    /// [2]: ../../../solver/struct.Solver.html#method.from_config
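    ///
    /// A minimal sketch of the intended construction path; the exact
    /// `SolverConfig` fields, the `SolverKind`/`SGDKind` variant names, and
    /// the `from_config` signature are assumptions here (illustrative only,
    /// not compiled as a doc test):
    ///
    /// ```ignore
    /// let mut cfg = SolverConfig::default();
    /// cfg.solver = SolverKind::SGD(SGDKind::Momentum); // assumed variant names
    /// cfg.momentum = 0.9;
    /// let solver = Solver::from_config(backend.clone(), backend.clone(), &cfg);
    /// ```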
    pub fn new(backend: Rc<SolverB>) -> Momentum<SolverB> {
        Momentum {
            history: Vec::new(),
            backend,

            lr: SharedTensor::<f32>::new(&[1]),
            momentum: SharedTensor::<f32>::new(&[1]),
        }
    }
}

impl<B: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> SGDSolver<B, NetB> for Momentum<B> {
    fn compute_update_value(
        &mut self,
        config: &SolverConfig,
        weight_gradient: &ArcLock<SharedTensor<f32>>,
        history_blob_id: usize,
        global_lr: &f32,
        blob_lr: &f32,
    ) {
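        // Load the effective per-blob learning rate (global_lr * blob_lr)
        // and the momentum coefficient into the device-side scalar tensors
        // used by the BLAS-style update below.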
        // PERF: check if value is changed before writing it
        crate::weight::FillerType::Constant {
            value: global_lr * blob_lr,
        }
        .fill(&mut self.lr);

        crate::weight::FillerType::Constant { value: config.momentum }.fill(&mut self.momentum);

        let backend = ISolver::<B, NetB>::backend(self);
        let device = IBackend::device(backend);

        let history_blob = &self.history[history_blob_id];
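        // Blend the new gradient into the history:
        //   history = lr * weight_gradient + momentum * history
        // (axpby computes y = a * x + b * y).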
        Axpby::axpby(
            backend,
            &self.lr,
            &weight_gradient.read().unwrap(),
            &self.momentum,
            &mut history_blob.write().unwrap(),
        )
        .unwrap();

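        // Overwrite the raw gradient with the blended update so the generic
        // SGD weight step applies the momentum-adjusted value.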
        backend
            .copy(&history_blob.read().unwrap(), &mut weight_gradient.write().unwrap())
            .unwrap();
    }
}

impl_isolver_sgd!(Momentum<SolverB>);