//! Provides the trainers for the [Layers][layer].
//!
//! The optimal state of a neural network would be the one where
//! for any given input to the network, it would produce an output perfectly
//! matching the target function. In that state the [loss function][loss] would have its
//! [global minimum][minimum].
//! This statement can also be reversed: *if we manage to minimize
//! the loss function of the network, the network approximates the target function*.
//!
//! We can change the way a network works by adjusting its individual
//! [weights][weight]. So to optimize the network we want to adjust
//! the weights in a way that the loss function will be minimized.
//! If we want to know how to correctly adjust a single weight,
//! we need to know the effect of that weight
//! on the loss function (i.e. its *gradient*).
//! This can be done via a method called [*backpropagation*][backprop].
//!
//! There are different methods by which a Solver can find the minimum of the
//! loss function. They mostly differ in two ways:
//!
//! - How to execute the backpropagation to compute the gradient.
//! - How to compute the weight update from the gradient (a plain SGD step is sketched below).
//!
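//! The simplest such update rule is plain stochastic gradient descent, where each
//! weight takes a small step against its gradient: `weight -= learning_rate * gradient`.
//! As an illustrative sketch on plain `f32` slices (the solvers in this module
//! operate on backend tensors instead):
//!
//! ```ignore
//! fn sgd_step(weights: &mut [f32], gradients: &[f32], learning_rate: f32) {
//!     for (w, g) in weights.iter_mut().zip(gradients) {
//!         // step against the gradient to decrease the loss
//!         *w -= learning_rate * g;
//!     }
//! }
//! ```
//!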
//! [layer]: ../layer/index.html
//! [loss]: ../layers/loss/index.html
//! [weight]: https://en.wikipedia.org/wiki/Synaptic_weight
//! [minimum]: http://mathworld.wolfram.com/GlobalMinimum.html
//! [backprop]: https://en.wikipedia.org/wiki/Backpropagation

#[allow(unused_import_braces)]
pub use self::sgd::Momentum;

pub mod sgd;

use crate::co::{IBackend, SharedTensor};
use crate::layer::*;
use crate::solver::*;
use crate::util::*;

/// Base trait for solvers that update the weights via stochastic gradient descent.
trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32>>: ISolver<SolverB, NetB> {
    /// Compute the update value for a weight blob from its gradient,
    /// taking the stored update history as well as the global and
    /// blob-specific learning rates into account.
    fn compute_update_value(
        &mut self,
        config: &SolverConfig,
        weight_blob: &ArcLock<SharedTensor<f32>>,
        history_blob_id: usize,
        global_lr: &f32,
        blob_lr: &f32,
    );
    /// [Clip gradients][1] when they exceed [SolverConfig.clip_gradients][2].
    /// [1]: http://arxiv.org/abs/1211.5063
    /// [2]: ../solver/struct.SolverConfig.html
    ///
    /// [Gradient norm clipping][1] is a technique used when dealing with
    /// [Recurrent Neural Networks][3].
    /// When the [L2 norm][4] of the gradients exceeds a threshold it is "clipped"
    /// to that threshold. The naming can be misleading since the gradients are not
    /// actually clipped (as in cut off), but rescaled to the threshold.
    ///
    /// [3]: https://en.wikipedia.org/wiki/Recurrent_neural_network
    /// [4]: https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
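    ///
    /// A minimal sketch of the rescaling rule, using plain `f32` values rather
    /// than the backend tensors the implementation below operates on:
    ///
    /// ```ignore
    /// let l2norm = sumsq_diff.sqrt();
    /// if l2norm > clip_threshold {
    ///     let scale_factor = clip_threshold / l2norm;
    ///     for g in gradients.iter_mut() {
    ///         *g *= scale_factor; // rescale so the new L2 norm equals the threshold
    ///     }
    /// }
    /// ```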
    #[allow(unused_must_use)]
    fn clip_gradients<B: IBackend + LayerOps<f32> + 'static>(&self, config: &SolverConfig, net: &mut Layer<B>) {
        // skip clipping gradients if SolverConfig.clip_gradients is set to None
        if let Some(clip_threshold) = config.clip_gradients {
            let native = native_backend();

            let net_gradients = net.learnable_weights_gradients();
            let mut sumsq_diff = 0f32;
            let backend = self.backend();
            // accumulate the squared L2 norm over all learnable weight gradients
            for net_gradient in net_gradients.clone() {
                let gradient = net_gradient.read().unwrap();
                // PERF: preallocate tensor once
                let mut result = SharedTensor::new(&[1]);
                // gradient.sumsq_diff(self.backend(), &mut result);
                // dot(gradient, gradient) yields the sum of its squared entries
                self.backend().dot(&gradient, &gradient, &mut result);

                let sumsq_diff_slice = result.read(native.device()).unwrap().as_slice::<f32>();
                sumsq_diff += sumsq_diff_slice[0];
            }
            let l2norm_diff = sumsq_diff.sqrt();
            if l2norm_diff > clip_threshold {
                let scale_factor = clip_threshold / l2norm_diff;
                info!(
                    "Gradient clipping: scaling down gradients (L2 norm {} > {}) by scale factor {}",
                    l2norm_diff, clip_threshold, scale_factor
                );

                let mut scale_shared = native_scalar(scale_factor);
                // rescale every gradient in place
                for weight_gradient in net_gradients {
                    let mut gradient = weight_gradient.write().unwrap();
                    backend.scal(&mut scale_shared, &mut gradient);
                }
            }
        }
    }
    /// Scale the gradient to counteract the [SolverConfig.minibatch_size][1].
    /// [1]: ../solver/struct.SolverConfig.html
    ///
    /// Since we accumulate the gradients over the whole minibatch, we need to
    /// scale them back down to the equivalent of a single sample.
    /// E.g. with a `minibatch_size` of 4 we need to scale the gradient by 0.25 (= 1/4).
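    ///
    /// A rough sketch of the intended effect, using a plain `f32` slice rather
    /// than the backend `scal` call in the implementation below:
    ///
    /// ```ignore
    /// let scale_factor = 1f32 / config.minibatch_size as f32; // e.g. 0.25 for a minibatch of 4
    /// for g in gradient.iter_mut() {
    ///     *g *= scale_factor;
    /// }
    /// ```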
    fn normalize(&self, config: &SolverConfig, weight_blob: &ArcLock<SharedTensor<f32>>) {
        if config.minibatch_size > 1 {
            let scale_factor = 1f32 / config.minibatch_size as f32;
            let mut gradient = weight_blob.write().unwrap();
            let native = native_backend();

            let mut scale_factor_shared = native_scalar(scale_factor);
            // self.backend().scal_plain(&scale_factor_shared, &mut gradient).unwrap();
            self.backend().scal(&mut scale_factor_shared, &mut gradient).unwrap();
        }
    }
    /// [Regularize][1] the gradient according to the configured [RegularizationMethod][2].
    /// [1]: https://cs231n.github.io/neural-networks-2/#reg
    /// [2]: ../solver/enum.RegularizationMethod.html
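    ///
    /// For [L2 regularization][1] the weight-decay term is added to the gradient,
    /// i.e. `gradient += local_decay * weight`. A minimal sketch of that update,
    /// using plain `f32` slices; the actual implementation would likely use the
    /// backend's `axpy` as hinted at in the commented-out code below:
    ///
    /// ```ignore
    /// let local_decay = global_weight_decay * weight_decay_mult;
    /// for (g, w) in gradient.iter_mut().zip(weights.iter()) {
    ///     // gradient of the penalty term (local_decay / 2) * ||w||^2
    ///     *g += local_decay * w;
    /// }
    /// ```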
    fn regularize(
        &self,
        config: &SolverConfig,
        weight_gradient: &ArcLock<SharedTensor<f32>>,
        blob_weight_decay: Option<f32>,
    ) {
        if let Some(global_weight_decay) = config.weight_decay {
            if let Some(regularization_method) = config.regularization_method {
                match blob_weight_decay {
                    Some(weight_decay_mult) => {
                        let local_decay = global_weight_decay * weight_decay_mult;
                        match regularization_method {
                            RegularizationMethod::L2 => {
                                let native = native_backend();
                                let decay_shared = native_scalar(local_decay);
                                let gradient = &mut weight_gradient.write().unwrap();
                                // gradient.regularize_l2(self.backend(), &decay_shared);
                                // backend.axpy_plain(&decay_shared, &self.data, &mut self.diff).unwrap();
                                // TODO: solver
                                unimplemented!();
                            }
                        }
                    }
                    None => {
                        error!("Weight decay multiplier for gradient missing.");
                    }
                }
            }
        }
    }
}